In [None]:
# Import dependencies
import pandas as pd

In [28]:
def merge_tsv_files(tsv_file1, tsv_file2, output_file):
    """
    Function to merge the dataframes from both files based on the common columns.
    Param:
        tsv_file1 (str): Path to the first TSV file to be merged.
        tsv_file2 (str): Path to the second TSV file to be merged.
        output_file (str): Path to the output TSV file where the merged data will be saved.
    Returns:
        None
    """
    df1 = pd.read_csv(tsv_file1, sep='\t')
    df2 = pd.read_csv(tsv_file2, sep='\t')

    merged_df = pd.merge(df1, df2[['synset', 'lemma', 'annotation', 'prediction label']], on=['synset', 'lemma', 'annotation'], how='left')
    merged_df.to_csv(output_file, sep='\t', index=False)

if __name__ == "__main__":
    tsv_file1 = './data/merged_file_wiktionary.tsv'
    tsv_file2 = './predictions_results/wiktionary_condition_five.tsv'
    output_file = "./data/error_analysis_wiktionary.tsv"

    merge_tsv_files(tsv_file1, tsv_file2, output_file)

In [82]:
file_path = './data/error_analysis_wiktionary.tsv'

df = pd.read_csv(file_path, delimiter='\t')
filtered_rows = []
for index, row in df.iterrows():
    annotation = row['annotation']
    prediction = row['prediction label']
    goodness_label = row['goodness label']
    language = row['language']
    confidence = row['confidence']

    if annotation == 'DELETE' and prediction == 'KEEP' and confidence == 'None' and language == 'None' :
        filtered_rows.append(row)

result_df = pd.DataFrame(filtered_rows)

print(len(result_df))

# Total = 958
#confidence 1 = 697

# Arabic: 48
# Croatian: 0
# Thai: 1
# Slovene: 51
# Finnish: 252
# Portuguese: 62
# Japanese: 24
# Polish: 11
# Greek: 19
# Mandarin Chinese: 15
# English: 116
# Spanish: 87
#None: 110

110


In [101]:
file_path = './data/error_analysis_wiktionary.tsv'
df = pd.read_csv(file_path, delimiter='\t')
filtered_rows = []
language_counts = {}
for index, row in df.iterrows():
    annotation = row['annotation']
    prediction = row['prediction label']
    goodness_label = row['goodness label']
    language = row['language']
    confidence = row['confidence']
    
    if annotation == 'DELETE' and prediction == 'KEEP' :
        filtered_rows.append(row)
        if language in language_counts:
            language_counts[language] += 1
        else:
            language_counts[language] = 1

result_df = pd.DataFrame(filtered_rows)

print(len(result_df))
print("Total:", len(result_df))
print("Confidence 1:", len(result_df[result_df['confidence'] == 1]))

for language, count in language_counts.items():
        print(f"{language}: {count}")


958
Total: 958
Confidence 1: 0
Portuguese: 62
Finnish: 252
None: 110
Portuguese, Spanish: 27
Arabic: 48
English: 116
Spanish: 87
Spanish, Finnish: 19
Finnish, English: 12
Portuguese, Finnish, Thai, Japanese: 4
Japanese: 24
Polish, English: 9
Greek, English: 13
Slovene: 51
Greek: 19
Portuguese, English: 8
Serbo-Croatian: 11
Polish: 11
Mandarin Chinese: 15
Finnish, Japanese: 5
Greek, Finnish: 5
Portuguese, Spanish, English: 6
Mandarin Chinese, Thai: 6
Portuguese, Finnish: 2
Polish, Finnish, English, Japanese: 1
Slovene, Spanish, Greek, Finnish, Portuguese, Mandarin Chinese, English, Serbo-Croatian: 3
Thai: 1
Polish, Japanese, Finnish, Portuguese, English: 1
Greek, Finnish, English, Serbo-Croatian: 2
Spanish, Japanese, Finnish, Portuguese, English: 5
Portuguese, Finnish, English: 2
English, Thai: 2
Slovene, Finnish, English: 3
Slovene, Spanish, Polish, Japanese, Greek, Finnish, Thai, Portuguese, Mandarin Chinese: 1
Slovene, Spanish, Polish, Greek, Portuguese, English: 1
Portuguese, Englis

In [94]:
file_path = './data/error_analysis_wiktionary.tsv'
df = pd.read_csv(file_path, delimiter='\t')
filtered_rows = []
language_counts = {}

other_count = 0
for index, row in df.iterrows():
    annotation = row['annotation']
    prediction = row['prediction label']
    goodness_label = row['goodness label']
    language = row['language']
    confidence = row['confidence']

    if annotation == 'DELETE' and prediction == 'KEEP' :
        filtered_rows.append(row)
        if language in language_counts:
            language_counts[language] += 1
        else:
            language_counts[language] = 1

for language, count in language_counts.items():
    if count == 1:
        other_count += 1

result_df = pd.DataFrame(filtered_rows)

if other_count != '!':
    language_counts['Others'] = other_count

print(len(result_df))
print("Total:", len(result_df))
print("Confidence 1:", len(result_df[result_df['confidence'] == '1']))

for language, count in language_counts.items():
    print(f"{language}: {count}")

958
Total: 958
Confidence 1: 697
Portuguese: 62
Finnish: 252
None: 110
Portuguese, Spanish: 27
Arabic: 48
English: 116
Spanish: 87
Spanish, Finnish: 19
Finnish, English: 12
Portuguese, Finnish, Thai, Japanese: 4
Japanese: 24
Polish, English: 9
Greek, English: 13
Slovene: 51
Greek: 19
Portuguese, English: 8
Serbo-Croatian: 11
Polish: 11
Mandarin Chinese: 15
Finnish, Japanese: 5
Greek, Finnish: 5
Portuguese, Spanish, English: 6
Mandarin Chinese, Thai: 6
Portuguese, Finnish: 2
Polish, Finnish, English, Japanese: 1
Slovene, Spanish, Greek, Finnish, Portuguese, Mandarin Chinese, English, Serbo-Croatian: 3
Thai: 1
Polish, Japanese, Finnish, Portuguese, English: 1
Greek, Finnish, English, Serbo-Croatian: 2
Spanish, Japanese, Finnish, Portuguese, English: 5
Portuguese, Finnish, English: 2
English, Thai: 2
Slovene, Finnish, English: 3
Slovene, Spanish, Polish, Japanese, Greek, Finnish, Thai, Portuguese, Mandarin Chinese: 1
Slovene, Spanish, Polish, Greek, Portuguese, English: 1
Portuguese, Engl