The code to extract a parallel corpus from Wiktionary is complete, and the output file has been saved correctly.

In [1]:
import json
import csv
import pandas as pd
# Path to the kaikki.org English dictionary dump; a later cell reads it
# line by line and parses each line as one JSON object.
file_path = './data/kaikki.org-dictionary-English.json'

In [None]:
# Target languages (13, Indonesian first), given as language codes.
# Notes from inspecting the data:
# - "Chinese" entries also contain Cyrillic-script forms, e.g.
#   {"lang": "Chinese", "code": "dng", "tags": ["Dungan"],
#    "sense": "time period of sixty minutes", "roman": "sahatɨ", "word": "сахаты", "_dis1": "59 10 19 8 2 1"}
#   so Mandarin Chinese is used instead.
# - Croatian translations are empty, so Serbo-Croatian is used instead.
# The same languages by name, in the same order:
# ['Indonesian', 'Arabic', 'Mandarin Chinese', 'Greek', 'English', 'Persian', 'Finnish', 'Spanish', 'Japanese', 'Serbo-Croatian', 'Polish', 'Slovene', 'Thai']
languages = [
    'id', 'ar', 'cmn', 'el', 'en', 'fa', 'fi',
    'es', 'ja', 'sh', 'pl', 'sl', 'th',
]

# One empty "concept -> list of forms" mapping per language
translations = {code: {} for code in languages}

# Parse the dump: every line of the file is a separate JSON document
with open(file_path, encoding='utf-8') as dump:
    data = list(map(json.loads, dump))
    print('JSON file is loaded successfully')


In [None]:
# Extract the translations for each target language.
# `languages` holds language *codes* ('id', 'cmn', ...), so every translation
# record is matched on its 'code' field. Its 'lang' field carries the language
# *name* (e.g. "Chinese") and would never equal one of the codes.
# Each translation list is scanned once; the per-language dict is looked up
# directly instead of re-scanning the list for every language.
for entry in data:
    for sense in entry.get('senses', []):
        for translation in sense.get('translations', []):
            per_concept = translations.get(translation.get('code'))
            form = translation.get('word')
            if per_concept is None or form is None:
                # Not one of the target languages, or no word form recorded
                continue
            concept = entry['word']
            per_concept.setdefault(concept, []).append(form)

# Report progress for each language
for language in languages:
    print(f'Translations for {language}')
    # To inspect the collected forms, uncomment:
    # for concept, forms in translations[language].items():
    #     print(f'{concept}: {", ".join(forms)}')
    print(f'Finished collecting translations for {language}')


In [None]:
import pandas as pd

output_path = './data/translations_output_wiktionary.tsv'

# Write the translations to a TSV file: one row per concept that has an
# Indonesian translation (languages[0]), one column per language.
with open(output_path, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['Indonesian'] + languages[1:])  # Indonesian is the first column header

    for concept in translations[languages[0]]:
        row_data = []
        for language in languages:
            # `languages` holds language codes, so the English column is 'en'
            # (comparing with 'English' never matched). The concept itself is
            # the English word.
            if language == 'en':
                row_data.append(concept)
            else:
                # 'None' marks a concept with no translation in this language
                forms = translations[language].get(concept, ['None'])
                row_data.append(', '.join(forms))
        writer.writerow(row_data)
        print(row_data)

print(f'Translations saved to {output_path}')

# Read the TSV file back as a dataframe to check the result
df = pd.read_csv(output_path, delimiter='\t')

# Print the dataframe
print(df)


Data cleaning:
1. Checking the language ID
2. Mapping the Concept as English translations 
3. Checking that all words are translated correctly. For example, in the kaikki JSON file, the translations for the language 'Chinese' include some Russian-looking (Cyrillic-script) words, such as "太陽, 太阳, 日, эрту, жәту, тэён, 日頭, 日头, 太陽, 太阳, 日頭, 日头, 太陽, 太阳, 太陽, 太阳, 日頭, 日头, 太陽, 太阳, 日頭, 日头, 熱頭, 热头, эрту, жәту, 日頭, 日头, 太陽, 太阳, 日頭, 日头, 太陽, 太阳, 太陽, 太阳, 日頭, 日头, 曝" for the translation "matahari, matahari, surya", so we should use Mandarin Chinese instead of Chinese. Also, Croatian isn't available in the JSON file and has to be replaced with Serbo-Croatian.

In [None]:
#Final code with lang_code
import json
import csv
import pandas as pd

# Language names as they appear in a translation's 'lang' field, and the
# matching codes (same order) used as column headers in the output file.
languages = ['Indonesian', 'Arabic', 'Mandarin Chinese', 'Greek', 'English',
             'Persian', 'Finnish', 'Spanish', 'Japanese', 'Serbo-Croatian', 'Polish', 'Slovene', 'Thai']

language_codes = ['id', 'ar', 'cmn', 'el', 'en', 'fa', 'fi', 'es', 'ja', 'sh', 'pl', 'sl', 'th']

# translations[language][concept] is a record with:
#   'word'  - all distinct forms joined with ', '
#   'forms' - the same forms as a list, in first-seen order
#   'pos'   - the entry's part of speech, or 'None' when entries disagree
translations = {language: {} for language in languages}

# Extract the translations and POS for each language.
# Every translation list is scanned once and the target language is looked
# up by name, instead of re-scanning all senses for each language.
for entry in data:
    pos = entry.get('pos')
    for sense in entry.get('senses', []):
        for translation in sense.get('translations', []):
            per_concept = translations.get(translation.get('lang'))
            form = translation.get('word')
            if per_concept is None or form is None:
                # Not one of the target languages, or no word form recorded
                continue
            concept = entry['word']
            record = per_concept.get(concept)
            if record is None:
                per_concept[concept] = {'word': form, 'forms': [form], 'pos': pos}
                continue
            # Check if the POS matches, otherwise set it to 'None'
            if record['pos'] != pos:
                record['pos'] = 'None'
            # Compare whole forms. A substring test on the joined string would
            # wrongly skip a short form that occurs inside a longer one.
            if form not in record['forms']:
                record['forms'].append(form)
                record['word'] += ', ' + form

output_path = '/content/drive/MyDrive/A Thesis 2023/coding/wiktionary/data/translations_output_wiktionary_backup.tsv'

# Placeholder record for a concept that has no translation in a language
missing = {'word': 'None', 'forms': ['None'], 'pos': 'None'}

# Write the translations and POS to a TSV file: one row per concept that has
# an Indonesian translation (languages[0]), exactly one cell per language so
# the rows line up with the header.
with open(output_path, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(language_codes)  # Use language codes as the first row

    for concept in translations[languages[0]]:
        row_data = []
        for language in languages:
            if language == 'English':
                # The concept itself is the English word
                row_data.append(concept)
                continue
            record = translations[language].get(concept, missing)
            # First form plus its POS; the f-string also renders a missing
            # (None) POS as 'None' instead of raising on concatenation.
            row_data.append(f"{record['forms'][0]}({record['pos']})")
        writer.writerow(row_data)
        print(row_data)

print(f'Translations saved to {output_path}')

# Read the TSV file back as a dataframe to check the result
df = pd.read_csv(output_path, delimiter='\t')

# Print the dataframe
print(df)
