## trial set:

### parse sentences and obtain PoS for complex word in new file

In [57]:
import csv
from nltk import pos_tag, word_tokenize

def get_treebank_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'JJ'
    elif treebank_tag.startswith('V'):
        return 'VB'
    elif treebank_tag.startswith('N'):
        return 'NN'
    elif treebank_tag.startswith('R'):
        return 'RB'
    else:
        return ''

# open input file
with open('./data/trial/tsar2022_en_trial_none_no_noise.tsv', 'r') as infile:
    reader = csv.reader(infile, delimiter='\t')

    # open output file
    with open('./data/trial/trial_pos.tsv', 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        
        # loop over rows in the input file
        for row in reader:
            sentence = row[0]
            complex_word = row[1]

            # tokenize sentence and get POS tags
            tagged_sentence = pos_tag(word_tokenize(sentence))

            # find the POS of the complex word and write the sentence, complex word, and the pos tag of the complex word to the output file
            for word, pos in tagged_sentence:
                if word.lower() == complex_word.lower():
                    # Get the Treebank POS and write to the output file
                    treebank_pos = get_treebank_pos(pos)
                    writer.writerow([sentence, complex_word, treebank_pos])
                    break

## for CEFR-J dataset:

### for model SG_MA_SS_bsRobertalarge_robertabase:

### lemmatized:

In [131]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrj_lem exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv'\n")



substitutes with CEFR levels mapped to numerical values: [('mandatory', 7), ('required', 7), ('mandated', 7), ('necessary', 2), ('voluntary', 5), ('optional', 4), ('enforced', 7), ('conditional', 5), ('obligatory', 7), ('forced', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('necessary', 2), ('optional', 4), ('voluntary', 5), ('conditional', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('enforced', 7), ('obligatory', 7), ('forced', 7)]

substitutes with CEFR levels mapped to numerical values: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 3), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('provided', 3), ('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

substitutes with CEFR levels mapped to numerical val

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/trial/tsar2022_en_trial_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv
OUTPUT file = ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.6

MAP@3 = 0.3777
MAP@5 = 0.2726
MAP@10 = 0.1618

Potential@3 = 0.8
Potential@5 = 0.8
Potential@10 = 1.0

Accuracy@1@top_gold_1 = 0.2
Accuracy@2@top_gold_1 = 0.4
Accuracy@3@top_gold_1 = 0.5

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [64]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv'\n")


ranked_cefr_subs: [('necessary', 2), ('optional', 4), ('voluntary', 5), ('conditional', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('enforced', 7), ('obligatory', 7), ('forced', 7)]

ranked_cefr_subs: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked_cefr_subs: [('creatures', 2), ('monsters', 3), ('murderers', 3), ('idiots', 4), ('thugs', 7), ('nazis', 7), ('hawks', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7)]

ranked_cefr_subs: [('reporters', 1), ('monitors', 3), ('observations', 3), ('supporters', 3), ('inspectors', 4), ('investigators', 4), ('witnesses', 4), ('analysts', 4), ('responders', 7), ('experts', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/trial/tsar2022_en_trial_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv
OUTPUT file = ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.7

MAP@3 = 0.3888
MAP@5 = 0.2413
MAP@10 = 0.1622

Potential@3 = 0.9
Potential@5 = 0.9
Potential@10 = 1.0

Accuracy@1@top_gold_1 = 0.3
Accuracy@2@top_gold_1 = 0.6
Accuracy@3@top_gold_1 = 0.6


### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [107]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv'\n")

ranked_cefr_subs: [('necessary', 2), ('optional', 4), ('voluntary', 5), ('conditional', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('enforced', 7), ('obligatory', 7), ('forced', 7)]

ranked_cefr_subs: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked_cefr_subs: [('thugs', 7), ('nazis', 7), ('hawks', 7), ('monsters', 7), ('idiots', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7), ('creatures', 7), ('murderers', 7)]

ranked_cefr_subs: [('monitors', 7), ('inspectors', 7), ('investigators', 7), ('responders', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('supporters', 7), ('analysts', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/trial/tsar2022_en_trial_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv
OUTPUT file = ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.5

MAP@3 = 0.3166
MAP@5 = 0.22
MAP@10 = 0.1466

Potential@3 = 0.9
Potential@5 = 0.9
Potential@10 = 1.0

Accuracy@1@top_gold_1 = 0.3
Accuracy@2@top_gold_1 = 0.7
Accuracy@3@top_gold_1 = 0.7

### for model SG_MA_SS_bsRobertalarge_electralarge:


### lemmatized:

In [124]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)




# read the predictions file into a dataframe
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrj_lem exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv'}\n")


substitutes with CEFR levels mapped to numerical values: [('mandatory', 7), ('required', 7), ('mandated', 7), ('necessary', 2), ('essential', 3), ('voluntary', 5), ('optional', 4), ('prohibited', 7), ('free', 4), ('illegal', 2)]

ranked substitutes based on their CEFR level mapped to numerical values: [('necessary', 2), ('illegal', 2), ('essential', 3), ('optional', 4), ('free', 4), ('voluntary', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('prohibited', 7)]

substitutes with CEFR levels mapped to numerical values: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 4), ('captured', 7), ('reinforced', 7), ('packed', 4), ('stunned', 7), ('surprised', 2), ('cultivated', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('surprised', 2), ('impressed', 4), ('packed', 4), ('infused', 7), ('fed', 7), ('blessed', 7), ('captured', 7), ('reinforced', 7), ('stunned', 7), ('cultivated', 7)]

substitutes with CEFR levels mapped to numerical values: [

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv

### with pos tag of the complex word taken into account:
((each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [4]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv'\n")

ranked_cefr_subs: [('free', 1), ('necessary', 2), ('illegal', 2), ('essential', 3), ('optional', 4), ('voluntary', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('prohibited', 7)]

ranked_cefr_subs: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('machines', 1), ('monsters', 3), ('operators', 4), ('devotees', 5), ('freaks', 7), ('thugs', 7), ('nazis', 7), ('minions', 7), ('criminals', 7), ('mania', 7)]

ranked_cefr_subs: [('reporters', 1), ('police', 2), ('monitors', 3), ('observations', 3), ('specialists', 3), ('diplomats', 4), ('witnesses', 4), ('analysts', 4), ('experts', 7), ('officials', 7)]

ranked_cefr_subs: [('small', 1), ('severe', 3), ('minor', 3), ('multiple', 4), ('superficial', 5), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7), ('clothed', 7), ('

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/trial/tsar2022_en_trial_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv
OUTPUT file = ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.6

MAP@3 = 0.3388
MAP@5 = 0.2813
MAP@10 = 0.175

Potential@3 = 0.7
Potential@5 = 0.8
Potential@10 = 0.9

Accuracy@1@top_gold_1 = 0.3
Accuracy@2@top_gold_1 = 0.4
Accuracy@3@top_gold_1 = 0.4

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [116]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv'\n")

ranked_cefr_subs: [('free', 1), ('necessary', 2), ('illegal', 2), ('essential', 3), ('optional', 4), ('voluntary', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('prohibited', 7)]

ranked_cefr_subs: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

ranked_cefr_subs: [('police', 2), ('monitors', 7), ('diplomats', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('specialists', 7), ('analysts', 7)]

ranked_cefr_subs: [('small', 1), ('severe', 3), ('minor', 3), ('multiple', 4), ('superficial', 5), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7), ('clothed', 7), ('

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv

## for Uchida et al dataset:

### for model SG_MA_SS_bsRobertalarge_robertabase:

### lemmatized:

In [147]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos','cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    
    
# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)

new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefruchida_lem exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv'\n")

ranked substitutes based on their CEFR level mapped to numerical values: [('necessary', 2), ('mandatory', 7), ('required', 7), ('mandated', 7), ('voluntary', 7), ('optional', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('creatures', 2), ('thugs', 7), ('nazis', 7), ('hawks', 7), ('monsters', 7), ('idiots', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7), ('murderers', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('experts', 2), ('monitors', 7), ('inspectors', 7), ('investigators', 7), ('responders', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('supporters', 7), ('analysts', 7)]

ranked

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/trial/tsar2022_en_trial_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv
OUTPUT file = ./output/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.6

MAP@3 = 0.3666
MAP@5 = 0.2369
MAP@10 = 0.1567

Potential@3 = 0.6
Potential@5 = 0.8
Potential@10 = 1.0

Accuracy@1@top_gold_1 = 0.3
Accuracy@2@top_gold_1 = 0.6
Accuracy@3@top_gold_1 = 0.6

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)


In [139]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv'\n")


ranked_cefr_subs: [('necessary', 2), ('mandatory', 7), ('required', 7), ('mandated', 7), ('voluntary', 7), ('optional', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

ranked_cefr_subs: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked_cefr_subs: [('creatures', 2), ('thugs', 7), ('nazis', 7), ('hawks', 7), ('monsters', 7), ('idiots', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7), ('murderers', 7)]

ranked_cefr_subs: [('experts', 2), ('monitors', 7), ('inspectors', 7), ('investigators', 7), ('responders', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('supporters', 7), ('analysts', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr/uchida_pos.tsv' file, and
If the POS tag of that word (as listed in './cefr/uchida_pos.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [144]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match


    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])



# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv'\n")


ranked_cefr_subs: [('necessary', 2), ('mandatory', 7), ('required', 7), ('mandated', 7), ('voluntary', 7), ('optional', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

ranked_cefr_subs: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked_cefr_subs: [('thugs', 7), ('nazis', 7), ('hawks', 7), ('monsters', 7), ('idiots', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7), ('creatures', 7), ('murderers', 7)]

ranked_cefr_subs: [('monitors', 7), ('inspectors', 7), ('investigators', 7), ('responders', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('supporters', 7), ('analysts', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:

### lemmatized:

In [141]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)



# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv'\n")


substitutes with CEFR levels mapped to numerical values: [('mandatory', 7), ('required', 7), ('mandated', 7), ('necessary', 2), ('essential', 3), ('voluntary', 7), ('optional', 7), ('prohibited', 7), ('free', 7), ('illegal', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('necessary', 2), ('essential', 3), ('mandatory', 7), ('required', 7), ('mandated', 7), ('voluntary', 7), ('optional', 7), ('prohibited', 7), ('free', 7), ('illegal', 7)]

substitutes with CEFR levels mapped to numerical values: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

substitutes with CEFR levels mapped to numerical values: [

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [145]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])


# Create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv'\n")

ranked_cefr_subs: [('necessary', 2), ('essential', 3), ('mandatory', 7), ('required', 7), ('mandated', 7), ('voluntary', 7), ('optional', 7), ('prohibited', 7), ('free', 7), ('illegal', 7)]

ranked_cefr_subs: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('machines', 1), ('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7)]

ranked_cefr_subs: [('experts', 2), ('specialists', 3), ('monitors', 7), ('diplomats', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('analysts', 7), ('police', 7)]

ranked_cefr_subs: [('minor', 3), ('multiple', 4), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7), ('severe', 7), ('superficial', 7), ('small', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7), ('clothed', 7), ('

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr/uchida_pos.tsv' file, and
If the POS tag of that word (as listed in './cefr/uchida_pos.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [146]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
 
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])




# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv'\n")


ranked_cefr_subs: [('necessary', 2), ('essential', 3), ('mandatory', 7), ('required', 7), ('mandated', 7), ('voluntary', 7), ('optional', 7), ('prohibited', 7), ('free', 7), ('illegal', 7)]

ranked_cefr_subs: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

ranked_cefr_subs: [('monitors', 7), ('diplomats', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('specialists', 7), ('analysts', 7), ('police', 7)]

ranked_cefr_subs: [('minor', 3), ('multiple', 4), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7), ('severe', 7), ('superficial', 7), ('small', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7), ('clothed', 7), ('

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv

### EFFLEX dataset: based on most frequent CEFR level:

### for model SG_MA_SS_bsRobertalarge_robertabase:

#### lemmatized:

In [82]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem.tsv'\n")

ranked substitutes based on their CEFR level mapped to numerical values: [('voluntary', 4), ('optional', 4), ('mandatory', 5), ('necessary', 5), ('required', 7), ('mandated', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('provided', 5), ('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('monsters', 2), ('hawks', 3), ('idiots', 4), ('criminals', 4), ('murderers', 4), ('nazis', 5), ('terrorists', 5), ('creatures', 5), ('thugs', 7), ('perpetrators', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('reporters', 1), ('inspectors', 2), ('observations', 3), ('witnesses', 4), ('monitors', 5), ('investigators', 5), ('responders', 5), ('experts', 5), ('supporters', 5), ('analysts', 5)]

ranked

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem.tsv

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefr_efllex/EFLLex_mostfreq.tsv) matches the POS of the complex word)

In [83]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefr_efllex/EFLLex.tsv AND the POS tag of that word (in cefr_efllex/EFLLex.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_pos.tsv'\n")

ranked_cefr_subs: [('voluntary', 4), ('optional', 4), ('mandatory', 5), ('necessary', 5), ('required', 7), ('mandated', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

ranked_cefr_subs: [('provided', 5), ('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked_cefr_subs: [('monsters', 2), ('hawks', 3), ('idiots', 4), ('terrorists', 4), ('murderers', 4), ('creatures', 5), ('thugs', 7), ('nazis', 7), ('criminals', 7), ('perpetrators', 7)]

ranked_cefr_subs: [('reporters', 1), ('inspectors', 2), ('observations', 3), ('witnesses', 4), ('investigators', 5), ('responders', 5), ('experts', 5), ('supporters', 5), ('analysts', 5), ('monitors', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex__mostfreq_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_pos.tsv

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the ./cefr_efllex/EFLLex_mostfreq.tsv' file, and
If the POS tag of that word (as listed in './cefr_efllex/EFLLex_mostfreq.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [112]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_possub_orig.tsv'\n")

ranked_cefr_subs: [('voluntary', 4), ('optional', 4), ('mandatory', 5), ('necessary', 5), ('required', 7), ('mandated', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

ranked_cefr_subs: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked_cefr_subs: [('hawks', 3), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('idiots', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7), ('creatures', 7), ('murderers', 7)]

ranked_cefr_subs: [('monitors', 7), ('inspectors', 7), ('investigators', 7), ('responders', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('supporters', 7), ('analysts', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_mostfreq_lem_possub_orig.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:

#### lemmatized:

In [84]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem.tsv'\n")

ranked substitutes based on their CEFR level mapped to numerical values: [('free', 1), ('voluntary', 4), ('optional', 4), ('mandatory', 5), ('necessary', 5), ('essential', 5), ('illegal', 5), ('required', 7), ('mandated', 7), ('prohibited', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('surprised', 3), ('impressed', 5), ('infused', 7), ('fed', 7), ('blessed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('cultivated', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('freaks', 2), ('monsters', 2), ('operators', 2), ('criminals', 4), ('nazis', 5), ('machines', 5), ('thugs', 7), ('devotees', 7), ('minions', 7), ('mania', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('reporters', 1), ('police', 2), ('observations', 3), ('witnesses', 4), ('specialists', 4), ('monitors', 5), ('experts', 5), ('officials', 5), ('analysts', 5), ('diplomats', 7)]

ranked substitutes based 

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex__mostfreq_lem.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem.tsv

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in './cefr_efllex/EFLLex_mostfreq) matches the POS of the complex word

In [85]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefr_efllex/EFLLex.tsv AND the POS tag of that word (in cefr_efllex/EFLLex.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_pos.tsv'\n")

ranked_cefr_subs: [('voluntary', 4), ('optional', 4), ('mandatory', 5), ('necessary', 5), ('essential', 5), ('illegal', 5), ('required', 7), ('mandated', 7), ('prohibited', 7), ('free', 7)]

ranked_cefr_subs: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('monsters', 2), ('operators', 2), ('freaks', 3), ('machines', 5), ('thugs', 7), ('nazis', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('mania', 7)]

ranked_cefr_subs: [('reporters', 1), ('police', 2), ('observations', 3), ('witnesses', 4), ('specialists', 4), ('experts', 5), ('analysts', 5), ('monitors', 7), ('diplomats', 7), ('officials', 7)]

ranked_cefr_subs: [('small', 1), ('multiple', 3), ('severe', 5), ('superficial', 5), ('minor', 5), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7), ('clothed', 7), ('

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_pos.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/trial/tsar2022_en_trial_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv
OUTPUT file = ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.4

MAP@3 = 0.2055
MAP@5 = 0.2113
MAP@10 = 0.144

Potential@3 = 0.7
Potential@5 = 0.8
Potential@10 = 0.9

Accuracy@1@top_gold_1 = 0.1
Accuracy@2@top_gold_1 = 0.3
Accuracy@3@top_gold_1 = 0.4

#### with pos tag of substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_mostfreq.tsv' file, and
If the POS tag of that word (as listed in ./cefr_efllex/EFLLex_mostfreq.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [113]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_possub_orig.tsv'\n")

ranked_cefr_subs: [('voluntary', 4), ('optional', 4), ('mandatory', 5), ('necessary', 5), ('essential', 5), ('illegal', 5), ('required', 7), ('mandated', 7), ('prohibited', 7), ('free', 7)]

ranked_cefr_subs: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

ranked_cefr_subs: [('police', 2), ('monitors', 7), ('diplomats', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('specialists', 7), ('analysts', 7)]

ranked_cefr_subs: [('small', 1), ('multiple', 3), ('severe', 5), ('superficial', 5), ('minor', 5), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7), ('clothed', 7), ('

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_mostfreq_lem_possub_orig.tsv

### EFFLEX dataset: based on weighted average across CEFR levels:

### with pos tag of the substitutes taken into account:

If the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_weighted.tsv' file, and
If the POS tag of that word (as listed in './cefr_efllex/EFLLex.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

### for model SG_MA_SS_bsRobertalarge_robertabase:

In [4]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in EFLLex_weighted.tsv AND the POS tag of that word (in EFLLex_weighted.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_weighted_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrefflex_weighted_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_weighted_lem_possub_orig.tsv'\n")


substitutes_cefr ranked on weighted average: [('optional', 4.0), ('voluntary', 4.030806623720507), ('necessary', 4.195970238761346), ('mandatory', 5.0), ('required', 7), ('mandated', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

substitutes_cefr ranked on weighted average: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

substitutes_cefr ranked on weighted average: [('hawks', 3.0), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('idiots', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7), ('creatures', 7), ('murderers', 7)]

substitutes_cefr ranked on weighted average: [('monitors', 7), ('inspectors', 7), ('investigators', 7), ('responders', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('supporters', 7), ('analysts', 7)]

substitutes_cefr ranked on weighted average: [('bullet', 7), ('stab', 7), ('guns

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_weighted_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrefflex_weighted_lem_possub_orig.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:

In [5]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in EFLLex_weighted.tsv AND the POS tag of that word (in EFLLex_weighted.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_weighted_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrefflex_weighted_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_weighted_lem_possub_orig.tsv'\n")


substitutes_cefr ranked on weighted average: [('optional', 4.0), ('voluntary', 4.030806623720507), ('necessary', 4.195970238761346), ('illegal', 4.309478791176403), ('essential', 4.922895620647522), ('mandatory', 5.0), ('required', 7), ('mandated', 7), ('prohibited', 7), ('free', 7)]

substitutes_cefr ranked on weighted average: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

substitutes_cefr ranked on weighted average: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

substitutes_cefr ranked on weighted average: [('police', 2.822945866591008), ('monitors', 7), ('diplomats', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('specialists', 7), ('analysts', 7)]

substitutes_cefr ranked on weighted average: [('small

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_weighted_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrefflex_weighted_lem_possub_orig.tsv

### EVP dataset: 

#### lemmatized:

#### for model SG_MA_SS_bsRobertalarge_robertabase: no changes (Dataset contains a lot of phrases iso words)

In [88]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index(['word', 'pos'])['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)



# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrevp_lem exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv'\n")

substitutes with CEFR levels mapped to numerical values: [('mandatory', 7), ('required', 7), ('mandated', 7), ('necessary', 7), ('voluntary', 7), ('optional', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('mandatory', 7), ('required', 7), ('mandated', 7), ('necessary', 7), ('voluntary', 7), ('optional', 7), ('enforced', 7), ('conditional', 7), ('obligatory', 7), ('forced', 7)]

substitutes with CEFR levels mapped to numerical values: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

substitutes with CEFR levels mapped to numerical val

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv

#### with complex word pos:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [87]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in './cefr_evp/evp_american_treebank.tsv AND the POS tag of that word ('./cefr_evp/evp_american_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv'\n")

ranked_cefr_subs: [('necessary', 3), ('optional', 4), ('voluntary', 5), ('obligatory', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('enforced', 7), ('conditional', 7), ('forced', 7)]

ranked_cefr_subs: [('filled', 3), ('provided', 3), ('inspired', 4), ('reassured', 5), ('stirred', 6), ('infused', 7), ('injected', 7), ('bolstered', 7), ('pumped', 7), ('seeded', 7)]

ranked_cefr_subs: [('monsters', 3), ('criminals', 3), ('murderers', 3), ('idiots', 4), ('terrorists', 4), ('thugs', 7), ('nazis', 7), ('hawks', 7), ('perpetrators', 7), ('creatures', 7)]

ranked_cefr_subs: [('experts', 3), ('reporters', 3), ('monitors', 4), ('inspectors', 4), ('investigators', 4), ('observations', 4), ('witnesses', 4), ('analysts', 4), ('responders', 7), ('supporters', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('wrapped', 4), ('masked', 7),

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv

#### with pos of substitutes:


If the lemmatized version of the substitute is found in the './cefr_evp/evp_american_treebank.tsv' file, and
If the POS tag of that word (as listed in './cefr_evp/evp_american_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [114]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv'\n")

ranked_cefr_subs: [('necessary', 3), ('optional', 4), ('voluntary', 5), ('obligatory', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('enforced', 7), ('conditional', 7), ('forced', 7)]

ranked_cefr_subs: [('infused', 7), ('filled', 7), ('injected', 7), ('bolstered', 7), ('stirred', 7), ('inspired', 7), ('provided', 7), ('pumped', 7), ('seeded', 7), ('reassured', 7)]

ranked_cefr_subs: [('thugs', 7), ('nazis', 7), ('hawks', 7), ('monsters', 7), ('idiots', 7), ('terrorists', 7), ('criminals', 7), ('perpetrators', 7), ('creatures', 7), ('murderers', 7)]

ranked_cefr_subs: [('monitors', 7), ('inspectors', 7), ('investigators', 7), ('responders', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('supporters', 7), ('analysts', 7)]

ranked_cefr_subs: [('bullet', 7), ('stab', 7), ('gunshot', 7), ('projectile', 7), ('blast', 7), ('mortar', 7), ('shell', 7), ('grenade', 7), ('shotgun', 7), ('ammunition', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv

#### for model SG_MA_SS_bsRobertalarge_electralarge: no changes (dataset contains a lot of phrases iso words).

### lemmatized:

In [89]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index(['word', 'pos'])['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)



# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrevp_lem exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv'\n")

ranked substitutes based on their CEFR level mapped to numerical values: [('mandatory', 7), ('required', 7), ('mandated', 7), ('necessary', 7), ('essential', 7), ('voluntary', 7), ('optional', 7), ('prohibited', 7), ('free', 7), ('illegal', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

ranked substitutes based on their CEFR level mapped to numerical values: [('monitors', 7), ('diplomats', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('specialists', 7), ('analysts', 7), ('police', 7)]

ranked substitutes based 

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial//SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv --output_file ./output/trial//SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv

#### with complex word pos:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [90]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/trial/trial_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in './cefr_evp/evp_american_treebank.tsv AND the POS tag of that word ('./cefr_evp/evp_american_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv'\n")

ranked_cefr_subs: [('free', 2), ('necessary', 3), ('essential', 3), ('optional', 4), ('illegal', 4), ('voluntary', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('prohibited', 7)]

ranked_cefr_subs: [('captured', 4), ('reinforced', 5), ('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('machines', 2), ('monsters', 3), ('criminals', 3), ('operators', 4), ('freaks', 7), ('thugs', 7), ('nazis', 7), ('devotees', 7), ('minions', 7), ('mania', 7)]

ranked_cefr_subs: [('police', 2), ('experts', 3), ('reporters', 3), ('monitors', 4), ('diplomats', 4), ('observations', 4), ('witnesses', 4), ('specialists', 4), ('analysts', 4), ('officials', 7)]

ranked_cefr_subs: [('small', 1), ('severe', 4), ('minor', 4), ('multiple', 5), ('superficial', 6), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7)]

ranked_cefr_subs: [('wrapped', 4), ('masked', 7), ('concealed', 7), ('

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv

#### with pos of substitutes:


If the lemmatized version of the substitute is found in the './cefr_evp/evp_american_treebank.tsv' file, and
If the POS tag of that word (as listed in './cefr_evp/evp_american_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [115]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig exported to csv in path './predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv'\n")

ranked_cefr_subs: [('free', 2), ('necessary', 3), ('essential', 3), ('optional', 4), ('illegal', 4), ('voluntary', 5), ('mandatory', 7), ('required', 7), ('mandated', 7), ('prohibited', 7)]

ranked_cefr_subs: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

ranked_cefr_subs: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

ranked_cefr_subs: [('police', 2), ('monitors', 7), ('diplomats', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('specialists', 7), ('analysts', 7)]

ranked_cefr_subs: [('small', 1), ('severe', 4), ('minor', 4), ('multiple', 5), ('superficial', 6), ('bullet', 7), ('stab', 7), ('gunshot', 7), ('knife', 7), ('stabbing', 7)]

ranked_cefr_subs: [('masked', 7), ('concealed', 7), ('clothed', 7), ('

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv --output_file ./output/trial/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv