## test set:

### parse sentences and obtain PoS for complex word in new file

In [2]:
import csv
from nltk import pos_tag, word_tokenize

def get_treebank_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'JJ'
    elif treebank_tag.startswith('V'):
        return 'VB'
    elif treebank_tag.startswith('N'):
        return 'NN'
    elif treebank_tag.startswith('R'):
        return 'RB'
    else:
        return ''

# open input file
with open('./data/test/tsar2022_en_test_none_no_noise.tsv', 'r', encoding='utf-8') as infile:
    reader = csv.reader(infile, delimiter='\t')

    # open output file
    with open('./data/test/test_pos.tsv', 'w', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        
        # loop over rows in the input file
        for row in reader:
            sentence = row[0]
            complex_word = row[1]

            # tokenize sentence and get POS tags
            tagged_sentence = pos_tag(word_tokenize(sentence))

            # find the POS of the complex word and write the sentence, complex word, and the pos tag of the complex word to the output file
            for word, pos in tagged_sentence:
                if word.lower() == complex_word.lower():
                    # Get the Treebank POS and write to the output file
                    treebank_pos = get_treebank_pos(pos)
                    writer.writerow([sentence, complex_word, treebank_pos])
                    break

## for CEFR-J dataset:

### for model SG_MA_SS_bsRobertalarge_robertabase:

### lemmatized:

In [25]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all.tsv', sep='\t', header=None, names=['word', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrj_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv'\n")



SS_bsRobertalarge_robertabase_SR_cefrj_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/test/tsar2022_en_test_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv
OUTPUT file = ./output/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.6236

MAP@3 = 0.4301
MAP@5 = 0.3268
MAP@10 = 0.2036

Potential@3 = 0.8467
Potential@5 = 0.9247
Potential@10 = 0.9677

Accuracy@1@top_gold_1 = 0.2661
Accuracy@2@top_gold_1 = 0.4059
Accuracy@3@top_gold_1 = 0.4784

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level and included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [5]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv'\n")


SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_pos.tsv

### with pos tag of the substitutes taken into account:

If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [11]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrj_lem_possub_orig.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:


### lemmatized:

In [6]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all.tsv', sep='\t', header=None, names=['word', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)




# read the predictions file into a dataframe
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrj_lem exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv'}\n")


SS_bsRobertalarge_electralarge_SR_cefrj_lem exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv'}



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem.tsv

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level and included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [7]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv

In [None]:
=========   EVALUATION config.=========
GOLD file = ./data/test/tsar2022_en_test_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv
OUTPUT file = ./output/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_pos.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.4677

MAP@3 = 0.3267
MAP@5 = 0.2558
MAP@10 = 0.1758

Potential@3 = 0.7741
Potential@5 = 0.8655
Potential@10 = 0.9462

Accuracy@1@top_gold_1 = 0.1908
Accuracy@2@top_gold_1 = 0.3064
Accuracy@3@top_gold_1 = 0.3978

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [13]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrj_lem_possub_orig.tsv

## for Uchida et al dataset (parsed sentences first in datafiles_changes.ipynb):

### for model SG_MA_SS_bsRobertalarge_robertabase:

### lemmatized:

In [34]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos','cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    
    
# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)

new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefruchida_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefruchida_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem.tsv

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level, and the specific CEFR level is included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)



In [35]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])


# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv'\n")


SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_pos.tsv

### with pos tag of the substitutes taken into account:

If the lemmatized version of the substitute is found in the './cefr/uchida_pos.tsv' file, and
If the POS tag of that word (as listed in './cefr/uchida_pos.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [36]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match


    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])


# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lem_possub_orig.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:

### lemmatized:

In [37]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])


# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv'\n")


SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem.tsv

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level and included only if the part of speech (POS) of the word (in cefr/uchida_pos.tsv) matches the POS of the complex word)

In [38]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])


# Create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_pos.tsv

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr/uchida_pos.tsv' file, and
If the POS tag of that word (as listed in './cefr/uchida_pos.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [39]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
 
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])



# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv'\n")


SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lem_possub_orig.tsv

### EFFLEX dataset:

### for model SG_MA_SS_bsRobertalarge_robertabase:

#### lemmatized:

In [14]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrefflex_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefrefflex_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem.tsv

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level and included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [16]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefr_efllex/EFLLex.tsv AND the POS tag of that word (in cefr_efllex/EFLLex.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_pos.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_pos.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_pos.tsv

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex.tsv' file, and
If the POS tag of that word (as listed in './cefr_efllex/EFLLex.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [16]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_possub_orig.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrefflex_lem_possub_orig.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:

#### lemmatized:

In [18]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrefflex_lem exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrefflex_lem exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv

=========   EVALUATION config.=========
GOLD file = ./data/test/tsar2022_en_test_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv
OUTPUT file = ./output/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.3709

MAP@3 = 0.274
MAP@5 = 0.2394
MAP@10 = 0.1661

Potential@3 = 0.7043
Potential@5 = 0.8655
Potential@10 = 0.9462

Accuracy@1@top_gold_1 = 0.1129
Accuracy@2@top_gold_1 = 0.2473
Accuracy@3@top_gold_1 = 0.3548

### with pos tag of the complex word taken into account:
(each lemmatized substitute is mapped to its CEFR level and included only if the part of speech (POS) of the word (in ./cefrj/cefrj_all_treebank.tsv) matches the POS of the complex word)

In [19]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in cefr_efllex/EFLLex.tsv AND the POS tag of that word (in cefr_efllex/EFLLex.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_pos.tsv

#### with pos tag of substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex.tsv' file, and
If the POS tag of that word (as listed in './cefr_efllex/EFLLex.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [17]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_possub_orig.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrefflex_lem_possub_orig.tsv

### EVP dataset: 

#### lemmatized:

#### for model SG_MA_SS_bsRobertalarge_robertabase: no changes (Dataset contains a lot of phrases iso words)

In [21]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index(['word', 'pos'])['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)



# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrevp_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefrevp_lem exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem.tsv

#### with complex word pos:

In [22]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in './cefr_evp/evp_american_treebank.tsv AND the POS tag of that word ('./cefr_evp/evp_american_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_pos.tsv

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr_evp/evp_american_treebank.tsv' file, and
If the POS tag of that word (as listed in './cefr_evp/evp_american_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [18]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv

=========   EVALUATION config.=========
GOLD file = ./data/test/tsar2022_en_test_gold_no_noise.tsv
PREDICTION LABELS file = ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv
OUTPUT file = ./output/test/SS_bsRobertalarge_robertabase_SR_cefrevp_lem_possub_orig.tsv
===============   RESULTS  =============

MAP@1/Potential@1/Precision@1 = 0.5134

MAP@3 = 0.3461
MAP@5 = 0.2754
MAP@10 = 0.1842

Potential@3 = 0.7741
Potential@5 = 0.9032
Potential@10 = 0.9677

Accuracy@1@top_gold_1 = 0.1962
Accuracy@2@top_gold_1 = 0.3306
Accuracy@3@top_gold_1 = 0.4274

#### for model SG_MA_SS_bsRobertalarge_electralarge: no changes (dataset contains a lot of phrases iso words).

### lemmatized:

In [23]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# convert the CEFR dataframe into a dictionary for efficient lookups
cefr_dict = cefr_df.set_index(['word', 'pos'])['cefr'].to_dict()

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)



# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = [(original, cefr_dict.get(lemmatized, 7)) for original, lemmatized in substitutes_lemmatized]
    #print(f"substitutes with CEFR levels mapped to numerical values: {substitutes_cefr}\n")

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked substitutes based on their CEFR level mapped to numerical values: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrevp_lem exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrevp_lem exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test//SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv --output_file ./output/test//SS_bsRobertalarge_electralarge_SR_cefrevp_lem.tsv

#### with complex word pos:

In [24]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# read the pos tags of complex words
complex_word_pos_df = pd.read_csv('./data/test/test_pos.tsv', sep='\t', header=None, names=['sentence', 'complex_word', 'pos'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
    
    # Get the pos of the complex_word
    complex_word_pos = complex_word_pos_df[complex_word_pos_df['complex_word'] == complex_word]['pos'].values[0]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # if the lemmatized substitute word is found in './cefr_evp/evp_american_treebank.tsv AND the POS tag of that word ('./cefr_evp/evp_american_treebank.tsv) is the same as the POS tag of the complex word:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == complex_word_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
         

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])
    

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_pos.tsv

### with pos tag of the substitutes taken into account:

If the lemmatized version of the substitute is found in the './cefr_evp/evp_american_treebank.tsv' file, and
If the POS tag of that word (as listed in './cefr_evp/evp_american_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [19]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_evp/evp_american_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
new_lists = []
for i, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # lemmatize the substitutes but keep original form too
    substitutes_lemmatized = [(sub, lemmatizer.lemmatize(sub)) for sub in substitutes]

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmatized:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrevp_lem_possub_orig.tsv