In [1]:
# Import dependencies
import csv
import pandas as pd
import random

## Match Wiktionary data with the main file (wn_msa_data)

In [6]:
def merge_tsv(tsv_file1, tsv_file2, output_file):
    """
    Annotate every row of the second TSV with the count and language recorded
    for the same synset in the first TSV, and write the result as a new TSV.

    Rows of tsv_file1 are indexed by their 'synset' column; when a synset
    occurs more than once, the last row read replaces the earlier ones. Each
    row of tsv_file2 is then emitted, in file order, with the columns
    'synset', 'lemma', 'goodness label' (copied from 'label'), 'confidence'
    (the 'count' of the matching row converted with int()) and 'language'.
    A synset without a match gets the literal string 'None' in both added
    columns.

    Param:
        tsv_file1 (str): Path to the lookup TSV (read for 'synset', 'count', 'language').
        tsv_file2 (str): Path to the main TSV (read for 'synset', 'lemma', 'label').
        output_file (str): Path to the output TSV file where the merged data will be saved.
    Returns:
        None
    """
    output_columns = ['synset', 'lemma', 'goodness label', 'confidence', 'language']
    combined = []

    with open(tsv_file1, 'r', encoding='utf-8') as lookup_handle, \
            open(tsv_file2, 'r', encoding='utf-8') as main_handle:
        lookup_reader = csv.DictReader(lookup_handle, delimiter='\t')
        main_reader = csv.DictReader(main_handle, delimiter='\t')

        # Index the lookup file by synset; a repeated synset overwrites the earlier entry.
        lookup = {}
        for entry in lookup_reader:
            lookup[entry['synset']] = entry

        for main_row in main_reader:
            key = main_row['synset']
            match = lookup.get(key)
            if match is None:
                # No lookup information for this synset: use the 'None' placeholder text.
                confidence = 'None'
                language = 'None'
            else:
                confidence = int(match['count'])
                language = match['language']

            combined.append({
                'synset': key,
                'lemma': main_row['lemma'],
                'goodness label': main_row['label'],
                'confidence': confidence,
                'language': language,
            })

    with open(output_file, 'w', encoding='utf-8', newline='') as out_handle:
        tsv_writer = csv.DictWriter(out_handle, fieldnames=output_columns, delimiter='\t')
        tsv_writer.writeheader()
        tsv_writer.writerows(combined)

# Inputs for the Wiktionary merge. merge_tsv reads tsv_file1 as the lookup file
# ('synset', 'count', 'language' columns) and tsv_file2 as the main data file
# ('synset', 'lemma', 'label' columns).
tsv_file1 = './data/synset_output_wiktionary_with_labels.tsv'
tsv_file2 = './data/wn_msa_data.tsv'
# Destination of the merged TSV (overwritten if it already exists, since it is opened in 'w' mode).
output_file = './data/wn_msa_wiktionary.tsv'

merge_tsv(tsv_file1, tsv_file2, output_file)
print("TSV files merged and saved as:", output_file)

TSV files merged and saved as: ./data/wn_msa_wiktionary.tsv


# Run the Wiktionary data on the main data with the system's best condition to generate predictions

In [26]:
def process_synsets(input_tsv_path, output_tsv_path, threshold):
    """
    Read a merged TSV and write one prediction label per row to a new TSV.

    The input is read completely before the output file is opened. For each
    input row, assign_label is called with the row's 'synset', 'confidence',
    'language' and 'goodness label' values plus int(threshold); the output
    row holds the synset, the lemma and the returned label under the header
    'synset', 'lemma', 'prediction label'. A confirmation message is printed
    once all rows are written.

    Param:
        input_tsv_path (str): Path to the input TSV file.
        output_tsv_path (str): Path to the output TSV file.
        threshold (str): Value converted with int() and forwarded to assign_label.
    Returns:
        None
    """
    with open(input_tsv_path, 'r', encoding='utf-8') as source:
        records = list(csv.DictReader(source, delimiter='\t'))

    with open(output_tsv_path, 'w', newline='', encoding='utf-8') as sink:
        out = csv.writer(sink, delimiter='\t')
        out.writerow(['synset', 'lemma', 'prediction label'])

        for record in records:
            synset, lemma = record['synset'], record['lemma']
            goodness_label = record['goodness label']
            confidence_score = record['confidence']
            language = record['language']

            # Values come straight from the TSV, so they are all strings here.
            out.writerow([
                synset,
                lemma,
                assign_label(synset, confidence_score, language, goodness_label, int(threshold)),
            ])

    print(f"Prediction labels saved to '{output_tsv_path}' successfully.")
    
## Condition 5
def assign_label(synset, confidence_score, language, goodness_label, threshold):
    """
    Decide whether a synset/lemma pair is labelled 'DELETE' or 'KEEP'.

    'DELETE' is returned only when all of the following hold:
    goodness_label is 'O' or 'X', synset ends with '-v', confidence_score
    equals the string '1', and language equals 'English'. Every other
    combination yields 'KEEP'.

    Param:
        synset (str): The synset identifier; only its '-v' suffix is inspected.
        confidence_score (str): Confidence value, compared against the string '1'.
        language (str): The language recorded for the synset.
        goodness_label (str): The goodness label associated with the synset.
        threshold: Accepted for interface compatibility; not read by this condition.

    Returns:
        str: 'DELETE' or 'KEEP'.
    """
    # Checks are ordered so that synset is only inspected for 'O'/'X' labels.
    should_delete = (
        goodness_label in ('O', 'X')
        and synset.endswith('-v')
        and confidence_score == '1'
        and language == 'English'
    )
    return 'DELETE' if should_delete else 'KEEP'


# Generate prediction labels for the merged Wiktionary data.
input_tsv_path = './data/wn_msa_wiktionary.tsv'
output_tsv_path = './predictions_results/wn-msa-wiktionary.tsv'
# Passed as a string; process_synsets converts it with int() before forwarding it to assign_label.
threshold = '1'
process_synsets(input_tsv_path, output_tsv_path, threshold)


Prediction labels saved to './predictions_results/wn-msa-wiktionary.tsv' successfully.


## Match OPUS data with the main file (wn_msa_data)

In [27]:
def merge_tsv(tsv_file1, tsv_file2, output_file):
    """
    Join the main TSV with per-synset lookup data and save the result.

    The first file is loaded into a dictionary keyed by 'synset' (a later row
    with the same synset replaces an earlier one). Every row of the second
    file is then written out, in order, as 'synset', 'lemma',
    'goodness label' (its 'label' value), 'confidence' (int() of the matching
    'count') and 'language'. When the synset has no entry in the first file,
    'confidence' and 'language' are both the string 'None'.

    Param:
        tsv_file1 (str): Path to the first TSV file (lookup: 'synset', 'count', 'language').
        tsv_file2 (str): Path to the second TSV file (main data: 'synset', 'lemma', 'label').
        output_file (str): Path to the output TSV file where the merged data will be saved.
    Returns:
        None
    """
    with open(tsv_file1, 'r', encoding='utf-8') as file1, open(tsv_file2, 'r', encoding='utf-8') as file2:
        # Quick-lookup table: synset -> last row seen for that synset in tsv_file1.
        by_synset = dict(
            (record['synset'], record) for record in csv.DictReader(file1, delimiter='\t')
        )

        merged = []
        for record in csv.DictReader(file2, delimiter='\t'):
            synset_id = record['synset']
            try:
                hit = by_synset[synset_id]
            except KeyError:
                # Synset absent from the lookup file: fill with placeholder text.
                confidence, language = 'None', 'None'
            else:
                confidence, language = int(hit['count']), hit['language']

            merged.append(dict([
                ('synset', synset_id),
                ('lemma', record['lemma']),
                ('goodness label', record['label']),
                ('confidence', confidence),
                ('language', language),
            ]))

    with open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.DictWriter(
            outfile,
            fieldnames=['synset', 'lemma', 'goodness label', 'confidence', 'language'],
            delimiter='\t',
        )
        writer.writeheader()
        for merged_row in merged:
            writer.writerow(merged_row)


# Inputs for the OPUS merge. merge_tsv reads tsv_file1 as the lookup file
# ('synset', 'count', 'language' columns) and tsv_file2 as the main data file
# ('synset', 'lemma', 'label' columns).
tsv_file1 = './data/synset_output_opus_with_labels.tsv'
tsv_file2 = './data/wn_msa_data.tsv'
# Destination of the merged TSV (overwritten if it already exists, since it is opened in 'w' mode).
output_file = './data/wn_msa_opus.tsv'

merge_tsv(tsv_file1, tsv_file2, output_file)
print("TSV files merged and saved as:", output_file)

TSV files merged and saved as: ./data/wn_msa_opus.tsv


# Run the OPUS data on the main data with the system's best condition to generate predictions

In [28]:
def process_synsets(input_tsv_path, output_tsv_path, threshold):
    """
    Label every row of a merged TSV and store the labels in a new TSV.

    All input rows are loaded first; the output file is then created with the
    header 'synset', 'lemma', 'prediction label'. Each row's label comes from
    assign_label, which receives the row's synset, confidence, language and
    goodness label together with int(threshold). A confirmation line is
    printed at the end.

    Param:
        input_tsv_path (str): Path to the input TSV file.
        output_tsv_path (str): Path to the output TSV file.
        threshold (str): Value converted with int() and forwarded to assign_label.
    Returns:
        None
    """
    # Columns pulled from each input row, in the order they are read.
    needed_columns = ('synset', 'lemma', 'goodness label', 'confidence', 'language')

    with open(input_tsv_path, 'r', encoding='utf-8') as tsv_file:
        rows = [row for row in csv.DictReader(tsv_file, delimiter='\t')]

    with open(output_tsv_path, 'w', newline='', encoding='utf-8') as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t')
        writer.writerow(['synset', 'lemma', 'prediction label'])

        for row in rows:
            synset, lemma, goodness_label, confidence_score, language = (
                row[column] for column in needed_columns
            )
            prediction = assign_label(synset, confidence_score, language, goodness_label, int(threshold))
            writer.writerow([synset, lemma, prediction])

    print(f"Prediction labels saved to '{output_tsv_path}' successfully.")
    
## Condition 5
def assign_label(synset, confidence_score, language, goodness_label, threshold):
    """
    Return 'DELETE' or 'KEEP' for a synset/lemma pair.

    The result is 'DELETE' when goodness_label is 'O' or 'X', synset ends with
    '-v', confidence_score is the string '1' and language is 'English';
    otherwise it is 'KEEP'.

    Param:
        synset (str): The synset identifier; only its '-v' suffix is inspected.
        confidence_score (str): Confidence value, compared against the string '1'.
        language (str): The language recorded for the synset.
        goodness_label (str): The goodness label associated with the synset.
        threshold: Accepted for interface compatibility; not read by this condition.

    Returns:
        str: 'DELETE' or 'KEEP'.
    """
    # Guard clauses: bail out with 'KEEP' as soon as one requirement fails.
    if goodness_label != 'O' and goodness_label != 'X':
        return 'KEEP'
    if not synset.endswith('-v'):
        return 'KEEP'
    if confidence_score != '1':
        return 'KEEP'
    if language != 'English':
        return 'KEEP'
    return 'DELETE'


# Generate prediction labels for the merged OPUS data.
input_tsv_path = './data/wn_msa_opus.tsv'
output_tsv_path = './predictions_results/wn-msa-opus.tsv'
# Passed as a string; process_synsets converts it with int() before forwarding it to assign_label.
threshold = '1'
process_synsets(input_tsv_path, output_tsv_path, threshold)


Prediction labels saved to './predictions_results/wn-msa-opus.tsv' successfully.


# Take random samples for hand-checking

In [2]:
## Wiktionary, then OPUS
# Each job: (predictions file to read, sample file to write, message printed when done).
sampling_jobs = [
    (
        './predictions_results/wn-msa-wiktionary.tsv',
        './predictions_results/msa-wiktionary-150-random-sample.tsv',
        "Sampling and saving successful for Wiktionary!",
    ),
    (
        './predictions_results/wn-msa-opus.tsv',
        './predictions_results/msa-opus-150-random-sample.tsv',
        "Sampling and saving successful for OPUS!",
    ),
]

for input_tsv_path, output_tsv_path, done_message in sampling_jobs:
    df = pd.read_csv(input_tsv_path, delimiter="\t")

    # Draw 150 rows with a fixed seed so the sample is reproducible.
    # replace=True samples with replacement, so a row can be drawn more than once.
    sampled_df = df.sample(n=150, random_state=42, replace=True)

    # Write the sampled data to a new TSV file
    sampled_df.to_csv(output_tsv_path, sep="\t", index=False)

    print(done_message)

Sampling and saving successful for Wiktionary!
Sampling and saving successful for OPUS!


# Data Analysis

In [6]:
# Count how many rows of a predictions file are labelled KEEP and DELETE.

# Initialize counters
total_lines = 0
keep_count = 0
delete_count = 0

# Change to wiktionary path too for the second predictions
file_path = "./predictions_results/wn-msa-opus.tsv"

try:
    # The predictions file is written by csv.writer as UTF-8, so it is read back
    # the same way: an explicit encoding (the platform default may differ) and
    # csv.reader, which undoes any quoting the writer applied to a field.
    with open(file_path, "r", encoding="utf-8", newline="") as file:
        reader = csv.reader(file, delimiter="\t")
        # Skip the header row; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line; nothing to count.
                continue
            total_lines += 1
            # Rows are expected to be exactly (synset, lemma, prediction label).
            _, _, prediction_label = row

            # Count the occurrences of "KEEP" and "DELETE"
            if prediction_label == "KEEP":
                keep_count += 1
            elif prediction_label == "DELETE":
                delete_count += 1

except FileNotFoundError:
    print("File not found. Please provide the correct file path.")
    # Re-raise rather than calling exit(): exit() is an interactive-shell helper
    # and is not meant to be used from program code such as a notebook cell.
    # Re-raising still stops the cell before any misleading zero counts are printed.
    raise

print("Total lines:", total_lines)
print("Number of KEEP predictions:", keep_count)
print("Number of DELETE predictions:", delete_count)

Total lines: 641030
Number of KEEP predictions: 584688
Number of DELETE predictions: 56342
