In [2]:
# Import dependencies
import csv
from nltk.corpus import wordnet as wn

In [14]:
def get_gloss(offset):
    """
    Retrieve the gloss (definition) of a WordNet synset from its offset ID.

    Param:
    offset (str): Synset identifier in the format "offset-pos", e.g. "02084071-n":
        a numeric offset, a hyphen, and a one-character part-of-speech tag.

    Returns:
    str or None: The synset's definition. None is returned when `offset` is
        empty/None, is not in the "offset-pos" format, or when the lookup
        yields no synset.

    Note:
    Exceptions raised by the WordNet lookup itself are not caught here.
    """
    if not offset:
        return None

    # Split on the LAST hyphen so the numeric part and the POS tag are taken
    # from the actual separator rather than from fixed character positions.
    number, hyphen, pos = offset.strip().rpartition('-')
    if not hyphen or len(pos) != 1 or not number.isdecimal():
        return None

    synset = wn.synset_from_pos_and_offset(pos, int(number))
    return synset.definition() if synset else None

def process_tsv(input_file, output_file):
    """
    Read a TSV file of synset offsets and write a copy with an extra 'gloss' column.

    The first row is treated as the header and gets 'gloss' appended. For every
    following row, the first column is passed to `get_gloss` and the result is
    appended as the last column. All rows are read before the output is opened,
    so `input_file` and `output_file` may be the same path.

    Param:
    input_file (str): The path to the input TSV file (read as UTF-8).
    output_file (str): The path to the output TSV file (written as UTF-8).

    Return:
    None

    Raise:
    ValueError: If the input file contains no rows at all (no header).
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(input_file, 'r', encoding='utf-8', newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')

        header = next(reader, None)
        if header is None:
            raise ValueError(f"Input file '{input_file}' is empty; expected a header row.")
        data = [header + ['gloss']]

        for row in reader:
            # csv.reader yields [] for blank lines; there is nothing to look up.
            if not row:
                continue
            synset = row[0]
            # An empty first column has no synset ID, so leave its gloss empty.
            definition = get_gloss(synset) if synset else None
            data.append(row + [definition])

    # Write the output TSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as tsvfile:
        csv.writer(tsvfile, delimiter='\t').writerows(data)

    print(f"Definitions added and saved to '{output_file}' successfully.")


# Add a gloss column to the Wiktionary random-sample TSV.
input_file = "./predictions_results/msa-wiktionary-150-random-sample.tsv"
output_file = "./predictions_results/msa-wiktionary-150-sample_with_gloss.tsv"
process_tsv(input_file=input_file, output_file=output_file)


Definitions added and saved to './predictions_results/msa-wiktionary-150-sample_with_gloss.tsv' successfully.


In [19]:
## Generate OPUS data
# Copy the glossed sample and overwrite its third column (prediction label)
# with the label found in the OPUS predictions file for the same
# (synset, lemma) pair.

tsv1_file = './predictions_results/msa-wiktionary-150-sample_with_gloss.tsv'
tsv2_file = "./predictions_results/wn-msa-opus.tsv"
new_tsv_file = './predictions_results/msa-opus-150-sample_with_gloss.tsv'

# Load every row of the sample, header included, so it is written back as-is.
# NOTE(review): process_tsv writes this path as UTF-8 but it is read here as
# cp1252 -- presumably the file was re-saved by hand in between; confirm.
with open(tsv1_file, "r", encoding='cp1252', newline="") as file:
    tsv1_data = list(csv.reader(file, delimiter="\t"))

# Map (synset, lemma) -> OPUS prediction label.
tsv2_data = {}
with open(tsv2_file, "r", encoding="utf-8", newline="") as file:
    tsv2_reader = csv.reader(file, delimiter="\t")
    next(tsv2_reader, None)  # skip the header; tolerate a completely empty file
    for row in tsv2_reader:
        # Blank or truncated lines cannot provide a label.
        if len(row) < 3:
            continue
        synset, lemma, prediction_label = row[:3]
        tsv2_data[(synset, lemma)] = prediction_label

# NOTE: rows with no matching (synset, lemma) in the OPUS file keep whatever
# label they already had in the sample file.
for row in tsv1_data:
    # Rows without a third column have no label slot to overwrite.
    if len(row) < 3:
        continue
    key = (row[0], row[1])
    if key in tsv2_data:
        row[2] = tsv2_data[key]

with open(new_tsv_file, "w", newline="", encoding="utf-8") as file:
    tsv_writer = csv.writer(file, delimiter="\t")
    tsv_writer.writerows(tsv1_data)

print("New TSV file has been created with updated prediction labels.")


New TSV file has been created with updated prediction labels.


In [15]:
## Wiktionary
# Compare the 'prediction label' column against the 'hand-checked' column and
# report per-class and overall agreement.
import pandas as pd


file_path = './predictions_results/msa-wiktionary-150-sample_with_gloss.tsv'

# NOTE(review): process_tsv writes this path as UTF-8 but it is read here as
# cp1252 -- presumably the file was re-saved by hand after annotation; confirm.
df = pd.read_csv(file_path, sep='\t', encoding='cp1252')

# Keep only the rows whose hand-checked label is 'T' (KEEP) or 'F' (DELETE)
correct_predictions = df[df['hand-checked'].isin(['T', 'F'])]

# Rows a human marked 'T': the expected prediction is 'KEEP'
correct_keep_predictions = correct_predictions[correct_predictions['hand-checked'] == 'T']
print(correct_keep_predictions)

# Rows a human marked 'F': the expected prediction is 'DELETE'
correct_delete_predictions = correct_predictions[correct_predictions['hand-checked'] == 'F']

# Number of rows in each group where the prediction agrees with the hand check
keep_hits = int((correct_keep_predictions['prediction label'] == 'KEEP').sum())
delete_hits = int((correct_delete_predictions['prediction label'] == 'DELETE').sum())

total_correct_predictions = keep_hits + delete_hits
total_hand_checked = len(correct_predictions)

# A group with no rows has no defined accuracy: report NaN rather than
# raising ZeroDivisionError.
accuracy_keep = keep_hits / len(correct_keep_predictions) if len(correct_keep_predictions) else float('nan')
accuracy_delete = delete_hits / len(correct_delete_predictions) if len(correct_delete_predictions) else float('nan')
overall_accuracy = total_correct_predictions / total_hand_checked if total_hand_checked else float('nan')

print("Accuracy for 'KEEP' predictions:", accuracy_keep)
print("Accuracy for 'DELETE' predictions:", accuracy_delete)
print("Overall accuracy rate:", overall_accuracy)


         synset           lemma prediction label  \
2    02512305-v   menyebelahkan           DELETE   
3    01714157-a    terbengkalai             KEEP   
4    00701040-v       pengaruhi             KEEP   
5    00327362-v      memandikan             KEEP   
6    00879356-v       memilukan             KEEP   
..          ...             ...              ...   
143  00253270-n  penyucian diri             KEEP   
145  00370869-a          berlau             KEEP   
146  04550184-n    almari kecil             KEEP   
148  07841037-n           abiad             KEEP   
149  01266269-v  cara berdandan             KEEP   

                                       gloss for sense  \
2        treat differently on the basis of sex or race   
3               lacking a surface finish such as paint   
4                shape or influence; give direction to   
5    let sit in a liquid to extract a flavor or to ...   
6    propose formally; in a debate or parliamentary...   
..                         

In [20]:
## OPUS
# Compare the 'prediction label' column against the 'hand-checked' column and
# report per-class and overall agreement.
import pandas as pd

# Path of the TSV file produced by the "Generate OPUS data" cell
file_path = './predictions_results/msa-opus-150-sample_with_gloss.tsv'

# That cell writes this file as UTF-8, so it must be decoded as UTF-8 here;
# decoding it as cp1252 would garble (or fail on) any non-ASCII character.
df = pd.read_csv(file_path, sep='\t', encoding='utf-8')

# Keep only the rows whose hand-checked label is 'T' (KEEP) or 'F' (DELETE)
correct_predictions = df[df['hand-checked'].isin(['T', 'F'])]

# Rows a human marked 'T': the expected prediction is 'KEEP'
correct_keep_predictions = correct_predictions[correct_predictions['hand-checked'] == 'T']
print(correct_keep_predictions)

# Rows a human marked 'F': the expected prediction is 'DELETE'
correct_delete_predictions = correct_predictions[correct_predictions['hand-checked'] == 'F']

# Number of rows in each group where the prediction agrees with the hand check
keep_hits = int((correct_keep_predictions['prediction label'] == 'KEEP').sum())
delete_hits = int((correct_delete_predictions['prediction label'] == 'DELETE').sum())

total_correct_predictions = keep_hits + delete_hits
total_hand_checked = len(correct_predictions)

# A group with no rows has no defined accuracy: report NaN rather than
# raising ZeroDivisionError.
accuracy_keep = keep_hits / len(correct_keep_predictions) if len(correct_keep_predictions) else float('nan')
accuracy_delete = delete_hits / len(correct_delete_predictions) if len(correct_delete_predictions) else float('nan')
overall_accuracy = total_correct_predictions / total_hand_checked if total_hand_checked else float('nan')

print("Accuracy for 'KEEP' predictions:", accuracy_keep)
print("Accuracy for 'DELETE' predictions:", accuracy_delete)
print("Overall accuracy rate:", overall_accuracy)


         synset           lemma prediction label  \
2    02512305-v   menyebelahkan           DELETE   
3    01714157-a    terbengkalai             KEEP   
4    00701040-v       pengaruhi             KEEP   
5    00327362-v      memandikan             KEEP   
6    00879356-v       memilukan             KEEP   
..          ...             ...              ...   
143  00253270-n  penyucian diri             KEEP   
145  00370869-a          berlau             KEEP   
146  04550184-n    almari kecil             KEEP   
148  07841037-n           abiad             KEEP   
149  01266269-v  cara berdandan             KEEP   

                                       gloss for sense  \
2        treat differently on the basis of sex or race   
3               lacking a surface finish such as paint   
4                shape or influence; give direction to   
5    let sit in a liquid to extract a flavor or to ...   
6    propose formally; in a debate or parliamentary...   
..                         