In [2]:
import pandas as pd 
import numpy as np

In [3]:
#path to the directory containing json files
datasetpath = 'data/massive_dataset/data/'

In [4]:
import os
#library to resolve paths in the working directories

# List all JSONL files in the directory
jsonl_files = [f for f in os.listdir(datasetpath) if f.endswith('.jsonl')]

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each JSONL file and read its contents into a DataFrame
for jsonl_file in jsonl_files:
    try:
        # Construct the full file path
        jsonl_file_path = os.path.join(datasetpath, jsonl_file)

        # Read the JSONL file into a DataFrame, assuming one JSON object per line
        df = pd.read_json(jsonl_file_path, lines=True)
        
        # Append the DataFrame to the list
        dataframes.append(df)
    except Exception as e:
        print(f"Error processing {jsonl_file}: {e}")

# Check if any DataFrames were created
if dataframes:
    # Concatenate all DataFrames into a single DataFrame
    combined_dataframe = pd.concat(dataframes, ignore_index=True)
    
    # Now, combined_dataframe contains data from all JSONL files as a single DataFrame
    print("Data from all JSONL files has been loaded.")
else:
    print("No valid data found in the JSONL files.")




Data from all JSONL files has been loaded.


In [5]:
combined_dataframe.describe()

Unnamed: 0,id,worker_id
count,859092.0,859092.0
mean,8564.718661,24.415974
std,4955.57753,53.467818
min,0.0,0.0
25%,4273.0,6.0
50%,8534.0,15.0
75%,12837.0,28.0
max,17180.0,690.0


In [6]:
# Get the unique languages in the DataFrame
unique_languages = combined_dataframe['locale'].unique()

# Specify the directory for generated Excel files
excel_output_directory = 'data/generated_excel_files_all/'

# Create the output directory if it doesn't exist
os.makedirs(excel_output_directory, exist_ok=True)
    
# Filter and save data for each language in separate Excel files
for lang in unique_languages:
    # Filter data for the current language
    lang_data = combined_dataframe[combined_dataframe['locale'] == lang]
    
   # Create a unique Excel filename for each locale, e.g., en-xx.xlsx
    excel_filename = os.path.join(excel_output_directory, f'en-{lang}.xlsx')
    
    # Save the filtered data to the Excel file
    lang_data.to_excel(excel_filename, index=False)
    print(f'Data for language {lang} has been saved to {excel_filename}')


Data for language af-ZA has been saved to data/generated_excel_files_all/en-af-ZA.xlsx
Data for language am-ET has been saved to data/generated_excel_files_all/en-am-ET.xlsx
Data for language ar-SA has been saved to data/generated_excel_files_all/en-ar-SA.xlsx
Data for language az-AZ has been saved to data/generated_excel_files_all/en-az-AZ.xlsx
Data for language bn-BD has been saved to data/generated_excel_files_all/en-bn-BD.xlsx
Data for language ca-ES has been saved to data/generated_excel_files_all/en-ca-ES.xlsx
Data for language cy-GB has been saved to data/generated_excel_files_all/en-cy-GB.xlsx
Data for language da-DK has been saved to data/generated_excel_files_all/en-da-DK.xlsx
Data for language de-DE has been saved to data/generated_excel_files_all/en-de-DE.xlsx
Data for language el-GR has been saved to data/generated_excel_files_all/en-el-GR.xlsx
Data for language en-US has been saved to data/generated_excel_files_all/en-en-US.xlsx
Data for language es-ES has been saved to d

In [7]:
from sklearn.model_selection import train_test_split

# Specify the selected locales
selected_languages = ["en-US", "sw-KE", "de-DE"]
print("Selected Languages:", selected_languages)
print("Unique Languages:", unique_languages)


# Initialize dictionaries to store train, validation, and test datasets for each locale
train_datasets = {}
valid_datasets = {}
test_datasets = {}

# Extract, shuffle, and partition data for each locale

for lang in unique_languages:
    # Filter data for the current language
    lang_data = combined_dataframe[combined_dataframe['locale'] == lang]
    
    # Shuffle data only for selected locales ("en," "sw," and "de")
    if lang in selected_languages:
        shuffled_data = lang_data.sample(frac=1)  # Shuffle the data
        if len(shuffled_data) > 10:
            print('emptyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy')
            
        print(f"Length of shuffled data for Locale {lang}: {len(shuffled_data)}")
        
        # Split the shuffled data into train, validation, and test datasets
        train_data, valid_test_data = train_test_split(shuffled_data, test_size=0.4, random_state=42)
        valid_data, test_data = train_test_split(valid_test_data, test_size=0.5, random_state=42)

        # Store the datasets in dictionaries
        train_datasets[lang] = train_data
        valid_datasets[lang] = valid_data
        test_datasets[lang] = test_data
        print(f"Length of train data for Locale {lang}: {len(train_datasets[lang])}")
        



Selected Languages: ['en-US', 'sw-KE', 'de-DE']
Unique Languages: ['af-ZA' 'am-ET' 'ar-SA' 'az-AZ' 'bn-BD' 'ca-ES' 'cy-GB' 'da-DK' 'de-DE'
 'el-GR' 'en-US' 'es-ES' 'fa-IR' 'fi-FI' 'fr-FR' 'he-IL' 'hi-IN' 'hu-HU'
 'hy-AM' 'id-ID' 'is-IS' 'it-IT' 'ja-JP' 'jv-ID' 'ka-GE' 'km-KH' 'kn-IN'
 'ko-KR' 'lv-LV' 'ml-IN' 'mn-MN' 'ms-MY' 'my-MM' 'nb-NO' 'nl-NL' 'pl-PL'
 'pt-PT' 'ro-RO' 'ru-RU' 'sl-SL' 'sq-AL' 'sv-SE' 'sw-KE' 'ta-IN' 'te-IN'
 'th-TH' 'tl-PH' 'tr-TR' 'ur-PK' 'vi-VN' 'zh-CN' 'zh-TW']
dooooooooneeeeeeeeeeeeeeeeeeeeeeeeee
emptyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
Length of shuffled data for Locale de-DE: 16521
Length of train data for Locale de-DE: 9912
emptyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
Length of shuffled data for Locale en-US: 16521
Length of train data for Locale en-US: 9912
emptyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
Length of shuffled data for Locale sw-KE: 16521
Length of train data for Locale sw-KE: 9912


In [50]:
import json

# Specify the directory where you want to save the JSONL files
output_directory = 'data/generated_jsonl/'  # Replace with the actual path

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Iterate over the selected locales
for lang in selected_languages:
    # Define the filenames for the JSONL files
    train_filename = os.path.join(output_directory, f'{lang}_train.jsonl')
    valid_filename = os.path.join(output_directory, f'{lang}_valid.jsonl')
    test_filename = os.path.join(output_directory, f'{lang}_test.jsonl')

    # Write the train data to the JSONL file
    with open(train_filename, 'w', encoding='utf-8') as train_file:
        for index, row in train_datasets[lang].iterrows():
            json.dump(row.to_dict(), train_file, ensure_ascii=False)
            train_file.write('\n')

    # Write the validation data to the JSONL file
    with open(valid_filename, 'w', encoding='utf-8') as valid_file:
        for index, row in valid_datasets[lang].iterrows():
            json.dump(row.to_dict(), valid_file, ensure_ascii=False)
            valid_file.write('\n')

    # Write the test data to the JSONL file
    with open(test_filename, 'w', encoding='utf-8') as test_file:
        for index, row in test_datasets[lang].iterrows():
            json.dump(row.to_dict(), test_file, ensure_ascii=False)
            test_file.write('\n')


In [8]:
import json
import os

# Initialize an empty list to store the translations
translations = []

# Specify the output JSON file name and directory
output_directory = 'data/translations_jsonl/'  # Specify the directory
output_json_file = os.path.join(output_directory, 'translations.json')

# Iterate over the selected locales
for lang in selected_languages:
    # Filter the training data for the current locale
    train_data = train_datasets[lang]
    
    # Extract translations from English (en) to the current locale (xx)
    translations_lang = {
        'from_language': 'en',
        'to_language': lang,
        'translations': train_data[['id', 'utt']].to_dict(orient='records')
    }
    
    # Append the translations to the list
    translations.append(translations_lang)

# Create a dictionary to store all translations
all_translations = {
    'translations': translations
}

# Write the translations to the output JSON file with pretty formatting
with open(output_json_file, 'w', encoding='utf-8') as json_file:
    json.dump(all_translations, json_file, ensure_ascii=False, indent=4)

print(f"Translations have been saved to {output_json_file}")


Translations have been saved to data/translations_jsonl/translations.json
