In [None]:
# Import dependencies
import pandas as pd

In [1]:
def merge_tsv_files(tsv_file1, tsv_file2, output_file):
    """
    Function to merge the dataframes from both files based on the common columns.
    Param:
        tsv_file1 (str): Path to the first TSV file to be merged.
        tsv_file2 (str): Path to the second TSV file to be merged.
        output_file (str): Path to the output TSV file where the merged data will be saved.
    Returns:
        None
    """
    df1 = pd.read_csv(tsv_file1, sep='\t')
    df2 = pd.read_csv(tsv_file2, sep='\t')

    merged_df = pd.merge(df1, df2[['synset', 'lemma', 'annotation', 'prediction label']], on=['synset', 'lemma', 'annotation'], how='left')
    merged_df.to_csv(output_file, sep='\t', index=False)

if __name__ == '__main__':
    tsv_file1 = './data/merged_file_opus.tsv'
    tsv_file2 = './predictions_results/opus_condition_five.tsv'
    output_file = "./data/error_analysis_opus.tsv"

    merge_tsv_files(tsv_file1, tsv_file2, output_file)

In [7]:
file_path = './data/error_analysis_opus.tsv'

df = pd.read_csv(file_path, delimiter='\t')
filtered_rows = []
for index, row in df.iterrows():
    annotation = row['annotation']
    prediction = row['prediction label']
    goodness_label = row['goodness label']
    language = row['language']
    if annotation == 'DELETE' and prediction == 'KEEP':
        filtered_rows.append(row)

result_df = pd.DataFrame(filtered_rows)
print(result_df)

#X = 102
#O = 499

          synset               lemma annotation goodness label confidence  \
17    03670849-n               dawai     DELETE              O          1   
53    00524682-v    menjelma menjadi     DELETE              X          1   
62    01156834-v         mencernakan     DELETE              O          1   
67    08588152-n     sebelah dlm,dlm     DELETE              X          1   
94    02620587-v    terdiri daripada     DELETE              X          1   
...          ...                 ...        ...            ...        ...   
7303  00605516-a  mengikut kebiasaan     DELETE              X       None   
7309  13863771-n               jalur     DELETE              O          1   
7313  13910384-n             ruangan     DELETE              Y          1   
7322  01692969-a             ke luar     DELETE           None          1   
7329  08430568-n              rangka     DELETE              O          1   

     language prediction label  
17    Spanish             KEEP  
53    Fin

In [8]:
file_path = './data/error_analysis_opus.tsv'
df = pd.read_csv(file_path, delimiter='\t')

filtered_rows = []
language_counts = {}

for index, row in df.iterrows():
    annotation = row['annotation']
    prediction = row['prediction label']
    goodness_label = row['goodness label']
    language = row['language']
    confidence = row['confidence'] 
    if annotation == 'DELETE' and prediction == 'KEEP' :
        filtered_rows.append(row)
        if language in language_counts:
            language_counts[language] += 1
        else:
            language_counts[language] = 1

result_df = pd.DataFrame(filtered_rows)

print(len(result_df))
print("Total:", len(result_df))
print("Confidence 1:", len(result_df[result_df['confidence'] == 1]))

for language, count in language_counts.items():
        print(f"{language}: {count}")

733
Total: 733
Confidence 1: 0
Spanish: 182
Finnish: 96
English: 256
Slovene: 10
Portuguese: 25
None: 67
Arabic: 41
Thai: 10
Greek: 35
Japanese, English: 3
Japanese: 1
English, Spanish: 4
English, Slovene: 1
Slovene, Portuguese: 1
Polish: 1
