# JSON Data Extraction
In this step, we:
1. load the JSON Lines file (one JSON object per line)
2. store the parsed objects as a list of dictionaries
3. save the list to a new JSON file, indented with 4 spaces for better readability

In [1]:
# Import dependencies
import json
import csv
import pandas as pd
import sys

In [2]:
file_path = './data/kaikki.org-dictionary-English.json'
new_file_path = './data/kaikki_formatted.json'

# Load the JSON data. Every line of the source file is parsed as its own JSON
# document. Blank lines (for example a trailing empty line at the end of the
# file) are skipped, because json.loads raises JSONDecodeError on them.
with open(file_path, encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
print('JSON file is loaded successfully')

# Save the parsed entries as a single JSON array, indented by 4 spaces so the
# file is easier to read.
with open(new_file_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

print('New JSON file is created successfully')

JSON file is loaded successfully
New JSON file is created successfully


# Extracting Translations
In this step we:
1. extract translations from the JSON file and save them in a TSV file
2. remove duplicate rows from the TSV file based on specific columns and save the result to a new TSV file

In [3]:
file_path = './data/kaikki_formatted.json'
output_path = './data/senses_extraction.tsv'

languages = {'Indonesian', 'Arabic', 'Mandarin Chinese', 'Greek',
             'Portuguese', 'Finnish', 'Spanish', 'Japanese', 'Serbo-Croatian', 'Polish', 'Slovene', 'Thai'}


def _translation_rows(translations, english_word, pos, wanted_languages):
    """
    Yield the TSV rows produced by one list of translation dictionaries.

    Param:
        translations (list): Translation dictionaries; the keys 'lang', 'word'
            and 'sense' are read, missing keys yield None.
        english_word (str | None): Headword of the entry, or None when the
            entry has no English headword (no 'English' row is produced then).
        pos (str | None): Part of speech of the entry, copied into every row.
        wanted_languages (set): Language names whose translations are kept.

    Yields:
        list: [lang, word, pos, sense] rows. For every translation an
        'English' row is yielded first (when english_word is not None),
        followed by the translation's own row if its language is wanted.
    """
    for translation in translations:
        sense_row = translation.get('sense')
        if english_word is not None:
            yield ['English', english_word, pos, sense_row]
        lang = translation.get('lang')
        if lang in wanted_languages:
            yield [lang, translation.get('word'), pos, sense_row]


# Load the JSON data from the file
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

header = ['lang', 'word', 'pos', 'sense']
rows_to_write = []

# Iterate over each entry in the JSON data
for entry in data:
    pos = entry.get('pos')

    # Use a real None (not the string 'None') when the entry is not English,
    # so the "is not None" test in the helper actually suppresses the row.
    english_word = entry.get('word') if entry.get('lang') == 'English' else None

    # Translations nested inside each sense
    for sense in entry.get('senses', []):
        rows_to_write.extend(
            _translation_rows(sense.get('translations') or [], english_word, pos, languages))

    # Translations stored directly on the entry, outside the senses key
    rows_to_write.extend(
        _translation_rows(entry.get('translations') or [], english_word, pos, languages))

# Write the header row and all collected rows at once
with open(output_path, 'w', newline='', encoding='utf-8') as tsv_file:
    writer = csv.writer(tsv_file, delimiter='\t')
    writer.writerow(header)
    writer.writerows(rows_to_write)

print('Translations extracted and saved in', output_path)

# Start filtering
senses_file = './data/senses_extraction.tsv'
filtered_file = './data/senses_extraction_modified.tsv'

# Read every column as plain text and switch off pandas' default NA parsing:
# otherwise legitimate words such as "null", "NA" or "nan" are silently turned
# into missing values and written back as empty fields.
df = pd.read_csv(senses_file, delimiter='\t', dtype=str, keep_default_na=False) \
    .drop_duplicates(subset=['lang', 'pos', 'sense', 'word'])
df.to_csv(filtered_file, sep='\t', index=False, quoting=csv.QUOTE_NONNUMERIC)

print('Translations filtered and saved in', filtered_file)


Translations extracted and saved in ./data/senses_extraction.tsv
Translations filtered and saved in ./data/senses_extraction_modified.tsv


# Mapping words based on senses
In this step, we:
1. filter and process the data based on certain conditions
2. map translations (words) to each sense by organizing them in a dictionary
3. generate a new TSV file with labeled rows containing words for specific senses and languages

In [4]:
filename = './data/senses_extraction_modified.tsv'
output_path = './data/output_senses.tsv'

# labeled_rows maps each sense text to a dict of {language: [words]} plus a
# 'POS' key holding the part of speech of the last row added for that sense.
labeled_rows = {}
completed_senses = set()

# NOTE(review): the header line of the input is not skipped, so it is labelled
# like any data row (sense 'sense', language 'lang'). A later cell drops a
# 'lang' column, presumably the one this produces -- confirm before changing.
with open(filename, 'r', encoding='utf-8') as tsv_file:
    reader = csv.reader(tsv_file, delimiter='\t')
    for row in reader:
        lang, word, pos, sense = row[0], row[1], row[2], row[3]

        # Senses that already have every language are not touched again
        if sense in completed_senses:
            continue

        data = labeled_rows.get(sense, {})

        # Skip the word if it is already recorded under any language of this
        # sense. The 'POS' entry is a plain string, not a word list, so it is
        # excluded: testing "word in 'noun'" would be a substring test and
        # would wrongly discard words such as "no".
        if any(word in words for key, words in data.items() if key != 'POS'):
            continue

        # Add the word and POS to the labeled rows
        data.setdefault(lang, []).append(word)
        data['POS'] = pos
        labeled_rows[sense] = data

        # 13 language keys plus the 'POS' key means the sense is complete.
        # Reading continues regardless: stopping as soon as all senses seen so
        # far are complete could discard the remainder of the file.
        if len(data) == 14:
            completed_senses.add(sense)

# Remove senses that have no Indonesian translation
labeled_rows = {sense: data for sense, data in labeled_rows.items() if 'Indonesian' in data}

# Create a new TSV file to save the labeled rows
with open(output_path, 'w', encoding='utf-8', newline='') as output_file:
    writer = csv.writer(output_file, delimiter='\t')

    # NOTE: 'POS' is one of the keys of every per-sense dict, so it appears in
    # this sorted key list as well as in the trailing 'POS' header column. The
    # layout is kept as is because the later column-dropping cell removes both
    # 'POS' and the duplicate 'POS.1'.
    languages = sorted(set(lang for data in labeled_rows.values() for lang in data.keys()))
    writer.writerow(['Sense'] + languages + ['POS'])

    # Write one line per sense; languages without a word get the text 'None'
    for sense, data in labeled_rows.items():
        row = [sense]
        for lang in languages:
            words = ' '.join(data.get(lang, []))
            row.append(words if words else 'None')
        row.append(data.get('POS', 'None'))
        writer.writerow(row)

print('Translation mapped and saved in', output_path)


Translation mapped and saved in ./data/output_senses.tsv


# Start filtering process 
In this step, we:
1. read the TSV file produced in the previous step and process its content
2. generate a new TSV file with the same header row (without the first, 'Sense', column) and filtered data where each field contains either the first word from the original field, the single word itself, or 'None' if there are no words
3. the extra 'lang' column is then manually deleted and the 'Indonesian' column is moved to the first position for the intersection algorithm

In [7]:
# Increase the field size limit so very long cells can be read.
# sys.maxsize does not fit into a C long on every platform (it raises
# OverflowError where C long is 32-bit, e.g. on Windows), so fall back to the
# largest 32-bit signed value there.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

def process_tsv_file(input_file, output_file):
    """
    Reduce every cell of a TSV file to its first whitespace-separated word.

    The first column is dropped from the header and from every data row. Each
    remaining data cell is replaced by its first word; a cell that contains no
    words becomes None, which the csv writer emits as an empty field. The
    header cells themselves are copied unchanged.

    Param:
        input_file (str): The path to the input TSV file to be processed.
        output_file (str): The path to the output TSV file to save the processed data.

    Returns:
        None
    """
    def first_word(cell):
        # str.split() with no argument ignores leading/trailing whitespace
        tokens = cell.split()
        return tokens[0] if tokens else None

    with open(input_file, 'r', encoding='utf-8', newline='') as source:
        records = csv.reader(source, delimiter='\t')
        column_names = next(records)
        trimmed = [column_names[1:]]
        trimmed.extend([first_word(cell) for cell in record[1:]] for record in records)

    with open(output_file, 'w', encoding='utf-8', newline='') as target:
        csv.writer(target, delimiter='\t').writerows(trimmed)

# Reduce the sense table written by the mapping step to one word per cell.
input_file = './data/output_senses.tsv'
output_file = './data/output.tsv'
process_tsv_file(input_file=input_file, output_file=output_file)

print('Translation mapped and saved in', output_file)


Translation mapped and saved in ./data/output.tsv


# Dropping columns and moving the Indonesian column to the front
In this step we:
1. drop the columns that we are not going to use in the intersection
2. move the Indonesian column to the first position

In [18]:
tsv_file_path = './data/output.tsv'
output_file_path = './data/output_wiktionary.tsv'

# Read every column as text. Only an empty field and the literal 'None'
# placeholder written by the mapping step are treated as missing; pandas'
# default NA list would also erase real words such as "null", "NA" or "nan".
df = pd.read_csv(tsv_file_path, sep='\t', dtype=str,
                 keep_default_na=False, na_values=['', 'None'])

# Drop the columns that are not used in the intersection. The input header
# contains 'POS' twice, which pandas reads as 'POS' and 'POS.1'.
# errors='ignore' keeps the cell working when one of them is absent.
df = df.drop(columns=['POS', 'POS.1', 'lang'], errors='ignore')

# Find the position of the Indonesian column
indonesian_index = df.columns.get_loc('Indonesian')

# Move the Indonesian column to the first position, keeping the relative
# order of all other columns
df = df[['Indonesian'] + [column for column in df.columns if column != 'Indonesian']]

# Save the modified DataFrame to the output file
df.to_csv(output_file_path, index=False, sep='\t')

print('Rows dropped and saved in', output_file_path)

Rows dropped and saved in ./data/output_wiktionary.tsv
