# Import dependencies

In [2]:
import sys
import csv
import json
import csv
import nltk
from nltk.corpus import wordnet as wn
nltk.download('extended_omw')

[nltk_data] Downloading package extended_omw to
[nltk_data]     /Users/sitinurhalimah/nltk_data...
[nltk_data]   Package extended_omw is already up-to-date!


True

# Loop through to find synset for each language
In this step, we:
1. retrieve synset suggestions for each word in the supported WordNet languages
2. save the suggestions in a JSON file

In [4]:
# Raise the csv field size limit as far as the platform allows (presumably the
# input TSV holds cells longer than the csv module's default limit).
# sys.maxsize does not fit in a C long on every platform (e.g. Windows), in
# which case csv.field_size_limit raises OverflowError, so shrink the
# requested value until one is accepted.
field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_size_limit)
        break
    except OverflowError:
        field_size_limit //= 10

# Language name, as used for the column headers of the input TSV, mapped to
# the language code handed to WordNet as the `lang` argument.
language_codes_mapping = {
    'Indonesian': 'ind',
    'Arabic': 'arb',
    'Mandarin Chinese': 'cmn',
    'Greek': 'ell',
    'English': 'eng',
    'Portuguese': 'por',
    'Finnish': 'fin',
    'Spanish': 'spa',
    'Japanese': 'jpn',
    'Serbo-Croatian': 'hrv',
    'Polish': 'pol',
    'Slovene': 'slv',
    'Thai': 'tha'
}

def get_synset_suggestions(word, language_code):
    """
    Look up the WordNet synsets of a word in one language and format them as suggestions.

    Param:
        word (str): The lemma to look up.
        language_code (str): The WordNet language code passed as `lang` to the lookup (e.g., 'eng', 'spa').
    Returns:
        list: Strings of the form "{language_code}: {offset}-{pos}", with the offset zero-padded
              to eight digits. Any exception raised while looking up or formatting is swallowed;
              whatever was collected up to that point (possibly nothing) is returned.
    """
    suggestions = []
    try:
        for match in wn.synsets(word, lang=language_code):
            padded_offset = str(match.offset()).zfill(8)
            suggestions.append(f"{language_code}: {padded_offset}-{match.pos()}")
    except Exception:
        # Best-effort lookup: a failure simply ends the collection early.
        pass
    return suggestions

data_path = './data/output_wiktionary.tsv'
output_path = './data/output_wiktionary.json'

# Read the whole table. newline='' lets the csv module do its own newline
# handling, as the csv documentation recommends for files passed to csv.reader.
with open(data_path, 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    headers = next(reader)
    language_codes = headers[1:]  # Exclude the 'id' column
    data = list(reader)

# word (first column) -> {language column header -> list of synset suggestions}
output_data = {}

for row in data:
    if not row:
        continue  # csv.reader yields an empty list for a blank line

    word = row[0]
    synset_suggestions = {}
    # zip() stops at the shorter sequence, so a row with fewer cells than the
    # header is processed as far as it goes instead of raising IndexError.
    for language_code, translation in zip(language_codes, row[1:]):
        wordnet_language = language_codes_mapping.get(language_code)
        if wordnet_language is None or not translation:
            continue  # column has no WordNet mapping, or the cell is empty
        synsets = get_synset_suggestions(translation, wordnet_language)
        if synsets:
            synset_suggestions[language_code] = synsets

    # Words without any suggestion are left out of the JSON entirely.
    if synset_suggestions:
        output_data[word] = synset_suggestions

with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(output_data, file, ensure_ascii=False, indent=4)

print(f"Offset suggestions saved to '{output_path}' successfully.")


Offset suggestions saved to './data/output_wiktionary.json' successfully.


# Merged results to see suggested synsets
In this step, we:
1. load the synset suggestions from the JSON file
2. consolidate them and count how many languages suggest each synset for each Indonesian word
3. save the resulting counts in a TSV file

In [5]:
with open('./data/output_wiktionary.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# One record per (word, synset): which languages suggested the synset, and how many.
count_data = []
for indonesian_word, suggestions in data.items():
    # synset id -> set of languages that suggested it for this word
    merged_synsets = {}
    for language, synsets in suggestions.items():
        for synset in synsets:
            # Each suggestion is "<language code>: <synset id>"; keep the id part.
            synset_key = synset.split(':')[1].strip()
            merged_synsets.setdefault(synset_key, set()).add(language)

    for synset, languages in merged_synsets.items():
        count_data.append({
            'synset': synset,
            # Sorted because the iteration order of a set of strings can differ
            # between interpreter runs, which made the written file unstable.
            'language': ', '.join(sorted(languages)),
            'lemma': indonesian_word,
            'count': len(languages)
        })

filename = './data/synsets_output_wiktionary.tsv'

with open(filename, 'w', newline='', encoding='utf-8') as tsv_file:
    writer = csv.writer(tsv_file, delimiter='\t')
    writer.writerow(['synset', 'language', 'lemma', 'count'])

    for item in count_data:
        writer.writerow([item['synset'], item['language'], item['lemma'], item['count']])

print('Synset suggestions count saved to', filename)


Synset suggestions count saved to ./data/synsets_output_wiktionary.tsv


# Adding goodness labels
In this step, we:
1. add goodness labels (Y, O, X and L) to the suggested synsets when their (synset, lemma) pair matches an entry in the master data obtained from SourceForge (https://sourceforge.net/p/wn-msa/tab/HEAD/tree/trunk/)

In [6]:
synset_tsv_file = './data/synsets_output_wiktionary.tsv'
main_data_tsv_file = './data/wn_msa_data.tsv'
output_tsv_file = './data/synset_output_wiktionary_with_labels.tsv'

# Index the master data as (synset, lemma) -> label. Its columns are read by
# position: synset, label, lemma. A later duplicate key overrides an earlier one.
with open(main_data_tsv_file, 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader)  # skip the header row
    labels = {(row[0], row[2]): row[1] for row in reader}

# Extend every suggestion row with its label, or the string 'None' when the
# (synset, lemma) pair is absent from the master data.
with open(synset_tsv_file, 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    header = next(reader) + ['goodness labels']
    rows = []
    for row in reader:
        key = (row[0], row[2])
        rows.append(row + [labels.get(key, 'None')])

# Write the labelled rows to a separate output file.
with open(output_tsv_file, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(header)
    writer.writerows(rows)

print('Updated TSV file saved as', output_tsv_file)

Updated TSV file saved as ./data/synset_output_wiktionary_with_labels.tsv


# Merging File

In [7]:
def merge_tsv(tsv_file1, tsv_file2, output_file):
    """
    Merge two TSV files on their shared 'synset' and 'lemma' fields and write the result.

    Every row of tsv_file2 yields exactly one output row. When tsv_file1 holds a row
    with the same (synset, lemma) pair, its 'count' becomes the output 'confidence'
    and its 'language' is copied over; otherwise both fields are the string 'None'.

    Param:
        tsv_file1 (str): The path to the first input TSV file. Must provide the columns
            'synset', 'lemma', 'count' and 'language'.
        tsv_file2 (str): The path to the second input TSV file. Must provide the columns
            'synset', 'lemma', 'annotation' and 'goodness label'.
        output_file (str): The path to the output TSV file to save the merged data.

    Returns:
        None

    Raises:
        KeyError: If one of the required columns is missing from an input file.
        ValueError: If the 'count' of a matched row is not an integer.
    """
    # Index tsv_file1 by (synset, lemma). Keying on the synset alone would let
    # rows for different lemmas of the same synset overwrite each other, so a
    # lemma could be given the count and languages of an unrelated lemma.
    with open(tsv_file1, 'r', encoding='utf-8', newline='') as file1:
        tsv1_rows = {
            (row['synset'], row['lemma']): row
            for row in csv.DictReader(file1, delimiter='\t')
        }

    merged_rows = []
    with open(tsv_file2, 'r', encoding='utf-8', newline='') as file2:
        for row2 in csv.DictReader(file2, delimiter='\t'):
            synset = row2['synset']
            lemma = row2['lemma']
            row1 = tsv1_rows.get((synset, lemma))
            if row1 is not None:
                confidence = int(row1['count'])
                language = row1['language']
            else:
                # No suggestion for this pair: mark both fields with the literal 'None'.
                confidence = 'None'
                language = 'None'

            merged_rows.append({
                'synset': synset,
                'lemma': lemma,
                'annotation': row2['annotation'],
                'goodness label': row2['goodness label'],
                'confidence': confidence,
                'language': language
            })

    fieldnames = ['synset', 'lemma', 'annotation', 'goodness label', 'confidence', 'language']
    with open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(merged_rows)

# Inputs: the labelled synset suggestions and the development set.
# Output: the merged table written by merge_tsv.
tsv_file1 = './data/synset_output_wiktionary_with_labels.tsv'
tsv_file2 = './data/development_set_with_labels.tsv'
output_file = './data/merged_file_wiktionary.tsv'

merge_tsv(
    tsv_file1,
    tsv_file2,
    output_file,
)
print(f"TSV files merged and saved as: {output_file}")

TSV files merged and saved as: ./data/merged_file_wiktionary.tsv
