## trial set: Substitute Ranking (SR) step with CEFR levels:
#### Performed on best 3 models after SS step

In [2]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
# NOTE(review): `lemmatizer` is not referenced by any cell visible in this part of the notebook
# (lemmas are taken from spaCy instead) — confirm whether it is still needed
lemmatizer = WordNetLemmatizer()

In [3]:
from nltk.corpus import wordnet as wn
import spacy
# load spaCy's "en_core_web_sm" pipeline once; the ranking cells below call `nlp` on each substitute
nlp = spacy.load("en_core_web_sm")

In [4]:
# translate a spaCy coarse PoS tag into the corresponding WordNet PoS constant
def map_pos_spacy_wordnet(pos_spacy):
    """Return the WordNet PoS constant for a spaCy PoS tag.

    Only 'NOUN', 'VERB', 'ADJ' and 'ADV' have a WordNet counterpart here;
    every other tag falls back to ``wn.NOUN``.
    """
    spacy_tags = ('NOUN', 'VERB', 'ADJ', 'ADV')
    wordnet_tags = (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)
    spacy_to_wordnet = dict(zip(spacy_tags, wordnet_tags))
    try:
        return spacy_to_wordnet[pos_spacy]
    except KeyError:
        # tag without a WordNet equivalent: treat it as a noun
        return wn.NOUN


### for model SS_phase2_option2bHyps2first_robertabase (No. 1 ranked after SS step):

#### for CEFR-J dataset (SR_option2a):
A substitute receives a CEFR level only if the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the substitute word (as determined by parsing the sentence in which the complex word is replaced by the original substitute). Substitutes that fail either condition are ranked last.

In [22]:
# Substitute Ranking with CEFR-J levels for the No. 1 SS model:
# re-order the substitutes of every prediction row from easiest to hardest CEFR level.

# read the CEFR levels file (word, POS tag, CEFR level) into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values (lower = easier)
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# rank given to substitutes without a usable (lemma, POS) entry; above C2 (6) so they sort last
UNRANKED_LEVEL = 7

# index the levels by (word, pos): a word listed under several POS tags must be matched on the
# entry whose POS agrees with the substitute, not only on the first row for that word.
# Levels that the mapping left as NaN are skipped, because NaN sort keys make the order unreliable;
# setdefault keeps the first level when a (word, pos) pair occurs more than once.
cefr_lookup = {}
for cefr_word, cefr_pos, cefr_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(cefr_level):
        cefr_lookup.setdefault((cefr_word, cefr_pos), cefr_level)

# read the predictions file into a dataframe
# (column 0: sentence, column 1: complex word, columns 2-11: substitutes)
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option2bHyps2first_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for _, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; keep the surface form when spaCy does not
        # return the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS tag of the substitute in context: tag the sentence in which the complex word is
        # replaced by the substitute (None when the substitute is not a single NLTK token)
        # NOTE(review): nltk.pos_tag yields Penn Treebank tags; this assumes the 'pos' column of the
        # CEFR file uses the same tag set — confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, POS) pair, or the high fallback value if there is no such entry
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), UNRANKED_LEVEL)))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no1_SR_option2aCEFR_J_robertabase.tsv', sep='\t', index=False, header=False)
print("SS_no1_SR_option2aCEFR_J_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2aCEFR_J_robertabase.tsv'\n")

SS_no1_SR_option2aCEFR_J_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2aCEFR_J_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no1_SR_option2aCEFR_J_robertabase.tsv --output_file ./output/trial/SS_no1_SR_option2aCEFR_J_robertabase.tsv

#### for Uchida et al. (CEFR-LS) dataset (SR_option2b):
A substitute receives a CEFR level only if the lemmatized version of the substitute is found in the './cefr_ls/uchida_pos.tsv' file, and
the POS tag of that word (as listed in './cefr_ls/uchida_pos.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence in which the complex word is replaced by the original substitute). Substitutes that fail either condition are ranked last.

In [5]:
# Substitute Ranking with the Uchida et al. (CEFR-LS) levels for the No. 1 SS model:
# re-order the substitutes of every prediction row from easiest to hardest CEFR level.

# read the CEFR levels file (word, POS tag, CEFR level) into a dataframe
cefr_df = pd.read_csv('./cefr_ls/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values (lower = easier)
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# rank given to substitutes without a usable (lemma, POS) entry; above C2 (6) so they sort last
UNRANKED_LEVEL = 7

# index the levels by (word, pos): a word listed under several POS tags must be matched on the
# entry whose POS agrees with the substitute, not only on the first row for that word.
# Levels that the mapping left as NaN are skipped, because NaN sort keys make the order unreliable;
# setdefault keeps the first level when a (word, pos) pair occurs more than once.
cefr_lookup = {}
for cefr_word, cefr_pos, cefr_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(cefr_level):
        cefr_lookup.setdefault((cefr_word, cefr_pos), cefr_level)

# read the predictions file into a dataframe
# (column 0: sentence, column 1: complex word, columns 2-11: substitutes)
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option2bHyps2first_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for _, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; keep the surface form when spaCy does not
        # return the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS tag of the substitute in context: tag the sentence in which the complex word is
        # replaced by the substitute (None when the substitute is not a single NLTK token)
        # NOTE(review): nltk.pos_tag yields Penn Treebank tags; this assumes the 'pos' column of
        # uchida_pos.tsv uses the same tag set — confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, POS) pair, or the high fallback value if there is no such entry
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), UNRANKED_LEVEL)))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no1_SR_option2bCEFR_ls_robertabase.tsv', sep='\t', index=False, header=False)
print("SS_no1_SR_option2bCEFR_ls_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2bCEFR_ls_robertabase.tsv'\n")


SS_no1_SR_option2bCEFR_ls_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2bCEFR_ls_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no1_SR_option2bCEFR_ls_robertabase.tsv --output_file ./output/trial/SS_no1_SR_option2bCEFR_ls_robertabase.tsv

#### EFLLex dataset: based on most frequent CEFR level (SR_option2c):
A substitute receives a CEFR level only if the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_mostfreq.tsv' file, and
the POS tag of that word (as listed in './cefr_efllex/EFLLex_mostfreq.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence in which the complex word is replaced by the original substitute). Substitutes that fail either condition are ranked last.

In [6]:

# Substitute Ranking with the EFLLex most-frequent CEFR level for the No. 1 SS model:
# re-order the substitutes of every prediction row from easiest to hardest CEFR level.

# read the CEFR levels file (word, POS tag, CEFR level) into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values (lower = easier)
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# rank given to substitutes without a usable (lemma, POS) entry; above C2 (6) so they sort last
UNRANKED_LEVEL = 7

# index the levels by (word, pos): a word listed under several POS tags must be matched on the
# entry whose POS agrees with the substitute, not only on the first row for that word.
# Levels that the mapping left as NaN are skipped, because NaN sort keys make the order unreliable;
# setdefault keeps the first level when a (word, pos) pair occurs more than once.
cefr_lookup = {}
for cefr_word, cefr_pos, cefr_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(cefr_level):
        cefr_lookup.setdefault((cefr_word, cefr_pos), cefr_level)

# read the predictions file into a dataframe
# (column 0: sentence, column 1: complex word, columns 2-11: substitutes)
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option2bHyps2first_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for _, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; keep the surface form when spaCy does not
        # return the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS tag of the substitute in context: tag the sentence in which the complex word is
        # replaced by the substitute (None when the substitute is not a single NLTK token)
        # NOTE(review): nltk.pos_tag yields Penn Treebank tags; this assumes the 'pos' column of
        # EFLLex_mostfreq.tsv uses the same tag set — confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, POS) pair, or the high fallback value if there is no such entry
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), UNRANKED_LEVEL)))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no1_SR_option2cCEFR_efl_mostfreq_robertabase.tsv', sep='\t', index=False, header=False)
print("SS_no1_SR_option2cCEFR_efl_mostfreq_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2cCEFR_efl_mostfreq_robertabase.tsv'\n")

SS_no1_SR_option2cCEFR_efl_mostfreq_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2cCEFR_efl_mostfreq_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no1_SR_option2cCEFR_efl_mostfreq_robertabase.tsv --output_file ./output/trial/SS_no1_SR_option2cCEFR_efl_mostfreq_robertabase.tsv

#### EFLLex dataset: based on weighted average across CEFR levels (SR_option2d):

In [7]:
# Substitute Ranking with the EFLLex weighted-average CEFR score for the No. 1 SS model:
# re-order the substitutes of every prediction row from lowest to highest weighted CEFR score.

# read the CEFR levels file (word, POS tag, weighted CEFR score) into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# rank given to substitutes without a usable (lemma, POS) entry
# NOTE(review): assumes every weighted score in the file is below 7 so these sort last — confirm
UNRANKED_LEVEL = 7

# index the scores by (word, pos): a word listed under several POS tags must be matched on the
# entry whose POS agrees with the substitute, not only on the first row for that word.
# Missing (NaN) scores are skipped, because NaN sort keys make the order unreliable;
# setdefault keeps the first score when a (word, pos) pair occurs more than once.
cefr_lookup = {}
for cefr_word, cefr_pos, cefr_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['Weighted CEFR']):
    if pd.notna(cefr_level):
        cefr_lookup.setdefault((cefr_word, cefr_pos), cefr_level)

# read the predictions file into a dataframe
# (column 0: sentence, column 1: complex word, columns 2-11: substitutes)
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option2bHyps2first_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for _, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; keep the surface form when spaCy does not
        # return the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS tag of the substitute in context: tag the sentence in which the complex word is
        # replaced by the substitute (None when the substitute is not a single NLTK token)
        # NOTE(review): nltk.pos_tag yields Penn Treebank tags; this assumes the 'pos' column of
        # EFLLex_weighted.tsv uses the same tag set — confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # weighted CEFR score of the (lemma, POS) pair, or the high fallback value if there is no such entry
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), UNRANKED_LEVEL)))

    # sort the substitutes based on their weighted CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no1_SR_option2dCEFR_efl_weigthed_robertabase.tsv', sep='\t', index=False, header=False)
print("SS_no1_SR_option2dCEFR_efl_weigthed_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2dCEFR_efl_weigthed_robertabase'\n")

SS_no1_SR_option2dCEFR_efl_weigthed_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2dCEFR_efl_weigthed_robertabase'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no1_SR_option2dCEFR_efl_weigthed_robertabase.tsv --output_file ./output/trial/SS_no1_SR_option2dCEFR_efl_weigthed_robertabase.tsv

#### all CEFR datasets combined: unique word-postag groups with duplicate CEFR scores have been averaged (SR_option2e):
The code also computes the average number of substitutes per row that were not found in the combined CEFR database.

In [8]:
# Substitute Ranking with the combined CEFR database for the No. 1 SS model:
# re-order the substitutes of every prediction row from lowest to highest CEFR score, and count
# how many substitutes could not be scored.

# read the CEFR levels file (word, POS tag, CEFR score) into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# rank given to substitutes without a usable (lemma, POS) entry
# NOTE(review): assumes every score in the file is below 7 so these sort last — confirm
UNRANKED_LEVEL = 7

# index the scores by (word, pos): a word listed under several POS tags must be matched on the
# entry whose POS agrees with the substitute, not only on the first row for that word.
# Missing (NaN) scores are skipped, because NaN sort keys make the order unreliable;
# setdefault keeps the first score when a (word, pos) pair occurs more than once.
cefr_lookup = {}
for cefr_word, cefr_pos, cefr_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['Weighted CEFR']):
    if pd.notna(cefr_level):
        cefr_lookup.setdefault((cefr_word, cefr_pos), cefr_level)

# read the predictions file into a dataframe
# (column 0: sentence, column 1: complex word, columns 2-11: substitutes)
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option2bHyps2first_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
count_7 = 0  # counter of substitutes with level 7
total_rows = 0  # counter of total number of rows
for _, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; keep the surface form when spaCy does not
        # return the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS tag of the substitute in context: tag the sentence in which the complex word is
        # replaced by the substitute (None when the substitute is not a single NLTK token)
        # NOTE(review): nltk.pos_tag yields Penn Treebank tags; this assumes the 'pos' column of
        # cefr_all_combined.tsv uses the same tag set — confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR score of the (lemma, POS) pair; without such an entry use the high fallback value
        substitute_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if substitute_level is None:
            substitute_level = UNRANKED_LEVEL
            count_7 += 1  # add 1 to the counter per substitute with level 7
        substitutes_cefr.append((substitute, substitute_level))

    total_rows += 1  # add 1 to the total number of rows

    # sort the substitutes based on their weighted CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no1_SR_option2eCEFR_all_robertabase.tsv', sep='\t', index=False, header=False)
print("SS_no1_SR_option2eCEFR_all_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2eCEFR_all_robertabase'\n")

# calculate the average number of substitutes with level 7 per row (0.0 for an empty predictions file)
average_7 = count_7 / total_rows if total_rows else 0.0
# print(f"On average, there were {average_7} substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.")

SS_no1_SR_option2eCEFR_all_robertabase exported to csv in path './predictions/trial/SS_no1_SR_option2eCEFR_all_robertabase'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no1_SR_option2eCEFR_all_robertabase.tsv --output_file ./output/trial/SS_no1_SR_option2eCEFR_all_robertabase.tsv

### for model SS_phase2_option1Synsfirst_robertabase (No. 2 ranked after SS step):

#### for CEFR-J dataset (SR_option2a):
A substitute receives a CEFR level only if the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the substitute word (as determined by parsing the sentence in which the complex word is replaced by the original substitute). Substitutes that fail either condition are ranked last.

In [9]:
# Substitute Ranking with CEFR-J levels for the No. 2 SS model:
# re-order the substitutes of every prediction row from easiest to hardest CEFR level.

# read the CEFR levels file (word, POS tag, CEFR level) into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values (lower = easier)
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# rank given to substitutes without a usable (lemma, POS) entry; above C2 (6) so they sort last
UNRANKED_LEVEL = 7

# index the levels by (word, pos): a word listed under several POS tags must be matched on the
# entry whose POS agrees with the substitute, not only on the first row for that word.
# Levels that the mapping left as NaN are skipped, because NaN sort keys make the order unreliable;
# setdefault keeps the first level when a (word, pos) pair occurs more than once.
cefr_lookup = {}
for cefr_word, cefr_pos, cefr_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(cefr_level):
        cefr_lookup.setdefault((cefr_word, cefr_pos), cefr_level)

# read the predictions file into a dataframe
# (column 0: sentence, column 1: complex word, columns 2-11: substitutes)
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option1Synsfirst_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for _, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; keep the surface form when spaCy does not
        # return the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS tag of the substitute in context: tag the sentence in which the complex word is
        # replaced by the substitute (None when the substitute is not a single NLTK token)
        # NOTE(review): nltk.pos_tag yields Penn Treebank tags; this assumes the 'pos' column of the
        # CEFR file uses the same tag set — confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, POS) pair, or the high fallback value if there is no such entry
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), UNRANKED_LEVEL)))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no2_SR_option2aCEFR_J_robertabase.tsv', sep='\t', index=False, header=False)
print("SS_no2_SR_option2aCEFR_J_robertabase exported to csv in path './predictions/trial/SS_no2_SR_option2aCEFR_J_robertabase.tsv'\n")

SS_no2_SR_option2aCEFR_J_robertabase exported to csv in path './predictions/trial/SS_no2_SR_option2aCEFR_J_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no2_SR_option2aCEFR_J_robertabase.tsv --output_file ./output/trial/SS_no2_SR_option2aCEFR_J_robertabase.tsv

#### for Uchida et al. (CEFR-LS) dataset (SR_option2b):
A substitute receives a CEFR level only if the lemmatized version of the substitute is found in the './cefr_ls/uchida_pos.tsv' file, and
the POS tag of that word (as listed in './cefr_ls/uchida_pos.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence in which the complex word is replaced by the original substitute). Substitutes that fail either condition are ranked last.

In [10]:
# Substitute Ranking with the Uchida et al. (CEFR-LS) levels for the No. 2 SS model:
# re-order the substitutes of every prediction row from easiest to hardest CEFR level.

# read the CEFR levels file (word, POS tag, CEFR level) into a dataframe
cefr_df = pd.read_csv('./cefr_ls/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values (lower = easier)
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# rank given to substitutes without a usable (lemma, POS) entry; above C2 (6) so they sort last
UNRANKED_LEVEL = 7

# index the levels by (word, pos): a word listed under several POS tags must be matched on the
# entry whose POS agrees with the substitute, not only on the first row for that word.
# Levels that the mapping left as NaN are skipped, because NaN sort keys make the order unreliable;
# setdefault keeps the first level when a (word, pos) pair occurs more than once.
cefr_lookup = {}
for cefr_word, cefr_pos, cefr_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(cefr_level):
        cefr_lookup.setdefault((cefr_word, cefr_pos), cefr_level)

# read the predictions file into a dataframe
# (column 0: sentence, column 1: complex word, columns 2-11: substitutes)
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option1Synsfirst_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for _, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; keep the surface form when spaCy does not
        # return the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS tag of the substitute in context: tag the sentence in which the complex word is
        # replaced by the substitute (None when the substitute is not a single NLTK token)
        # NOTE(review): nltk.pos_tag yields Penn Treebank tags; this assumes the 'pos' column of
        # uchida_pos.tsv uses the same tag set — confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, POS) pair, or the high fallback value if there is no such entry
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), UNRANKED_LEVEL)))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no2_SR_option2bCEFR_ls_robertabase.tsv', sep='\t', index=False, header=False)
print("SS_no2_SR_option2bCEFR_ls_robertabase exported to csv in path './predictions/trial/SS_no2_SR_option2bCEFR_ls_robertabase.tsv'\n")


SS_no2_SR_option2bCEFR_ls_robertabase exported to csv in path './predictions/trial/SS_no2_SR_option2bCEFR_ls_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no2_SR_option2bCEFR_ls_robertabase.tsv --output_file ./output/trial/SS_no2_SR_option2bCEFR_ls_robertabase.tsv

#### EFLLex dataset: based on most frequent CEFR level (SR_option2c):
A substitute receives a CEFR level only if the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_mostfreq.tsv' file, and
the POS tag of that word (as listed in './cefr_efllex/EFLLex_mostfreq.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence in which the complex word is replaced by the original substitute). Substitutes that fail either condition are ranked last.

In [11]:
# Substitute Ranking with the EFLLex "most frequent CEFR level" lexicon (SR_option2c):
# re-rank the substitutes of every prediction row by CEFR level (easiest first) and export them.

# name and path of the exported ranking; the confirmation message is built from the same
# variables so that it can never report a different file than the one that was written
output_name = 'SS_no2_SR_option2cCEFR_efl_mostfreq_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# value given to substitutes without a usable CEFR level; it is larger than C2 (6), so they are ranked last
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows whose level is not covered by the mapping (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option1Synsfirst_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, pos) pair, or the high default if the pair is not in the lexicon
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), unknown_level)))

    # sort the substitutes based on their CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")

SS_no2_SR_option2cCEFR_efl_weighted_robertabase exported to csv in path './predictions/trial/SS_no2_SR_option2cCEFR_efl_weighted_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no2_SR_option2cCEFR_efl_mostfreq_robertabase.tsv --output_file ./output/trial/SS_no2_SR_option2cCEFR_efl_mostfreq_robertabase.tsv

### EFLLex dataset: based on weighted average across CEFR levels (SR_option2d):

In [12]:
# Substitute Ranking with the EFLLex "weighted average CEFR level" lexicon (SR_option2d):
# re-rank the substitutes of every prediction row by weighted CEFR level (lowest first) and export them.

# name and path of the exported ranking (the spelling 'weigthed' is kept on purpose: the tsar_eval
# command below this cell reads the file under exactly this name); the confirmation message is
# built from the same variables so that it reports the path that was really written
output_name = 'SS_no2_SR_option2dCEFR_efl_weigthed_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# value given to substitutes without a usable CEFR level, so that they are ranked last
# (assumes every weighted level in the lexicon is below 7 -- TODO confirm)
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows without a level (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['Weighted CEFR']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option1Synsfirst_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # weighted CEFR level of the (lemma, pos) pair, or the high default if the pair is not in the lexicon
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), unknown_level)))

    # sort the substitutes based on their weighted CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")

SS_no2_SR_option2dCEFR_efl_weigthed_robertabase exported to csv in path './predictions/trial/SS_no2_SR_option2dCEFR_efl_weigthed_robertabase'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no2_SR_option2dCEFR_efl_weigthed_robertabase.tsv --output_file ./output/trial/SS_no2_SR_option2dCEFR_efl_weigthed_robertabase.tsv

### all CEFR datasets combined: unique word-postag groups with duplicate CEFR scores have been averaged (SR_option2e):
The code also computes the average number of substitutes per instance that were not found in the combined CEFR database (these are assigned the value 7).

In [13]:
# Substitute Ranking with the combined CEFR lexicon (SR_option2e):
# re-rank the substitutes of every prediction row by CEFR level (lowest first), export them,
# and count how many substitutes could not be found in the lexicon.

# name and path of the exported ranking; the confirmation message is built from the same
# variables so that it reports the path that was really written
output_name = 'SS_no2_SR_option2eCEFR_all_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# value given to substitutes without a usable CEFR level, so that they are ranked last
# (assumes every level in the lexicon is below 7 -- TODO confirm)
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows without a level (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['Weighted CEFR']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option1Synsfirst_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
count_7 = 0  # counter of substitutes with level 7
total_rows = 0  # counter of total number of rows
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, pos) pair; fall back to the high default if the pair is not in the lexicon
        level = cefr_lookup.get((substitute_lemma, sub_pos))
        if level is None:
            level = unknown_level
            count_7 += 1  # add 1 to the counter per substitute with level 7
        substitutes_cefr.append((substitute, level))

    total_rows += 1  # add 1 to the total number of rows

    # sort the substitutes based on their weighted CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")

# calculate the average number of substitutes with level 7 (0.0 for an empty predictions file instead of a ZeroDivisionError)
average_7 = count_7 / total_rows if total_rows else 0.0
# print(f"On average, there were {average_7} substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.")

SS_no2_SR_option2eCEFR_all_robertabase exported to csv in path './predictions/trial/SS_no2_SR_option2eCEFR_all_robertabase'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no2_SR_option2eCEFR_all_robertabase.tsv --output_file ./output/trial/SS_no2_SR_option2eCEFR_all_robertabase.tsv

### for model SS_phase2_option3f_BSrobertalarge_robertabase (No. 3 ranked after SS step):

#### for CEFR-J dataset (SR_option2a):
If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [14]:
# Substitute Ranking with the CEFR-J lexicon (SR_option2a):
# re-rank the substitutes of every prediction row by CEFR level (easiest first) and export them.

# name and path of the exported ranking; the confirmation message is built from the same
# variables so that it can never report a different file than the one that was written
output_name = 'SS_no3_SR_option2aCEFR_J_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# value given to substitutes without a usable CEFR level; it is larger than C2 (6), so they are ranked last
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows whose level is not covered by the mapping (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, pos) pair, or the high default if the pair is not in the lexicon
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), unknown_level)))

    # sort the substitutes based on their CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")

SS_no3_SR_option2aCEFR_J_robertabase exported to csv in path './predictions/trial/SS_no3_SR_option2aCEFR_J_robertabase.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no3_SR_option2aCEFR_J_robertabase.tsv --output_file ./output/trial/SS_no3_SR_option2aCEFR_J_robertabase.tsv

#### for Uchida et al. (CEFR-LS) dataset (SR_option2b):
A substitute receives a CEFR level if the lemmatized version of the substitute is found in the './cefr_ls/uchida_pos.tsv' file, and
if the POS tag of that word (as listed in './cefr_ls/uchida_pos.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute). Otherwise the substitute is assigned the value 7, so that it is ranked after all substitutes with a CEFR level.

In [15]:

# Substitute Ranking with the Uchida et al. (CEFR-LS) lexicon (SR_option2b):
# re-rank the substitutes of every prediction row by CEFR level (easiest first) and export them.

# name and path of the exported ranking; the confirmation message is built from the same
# variables so that it can never report a different file than the one that was written
output_name = 'SS_no3_SR_option2bCEFR_ls_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_ls/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# value given to substitutes without a usable CEFR level; it is larger than C2 (6), so they are ranked last
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows whose level is not covered by the mapping (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, pos) pair, or the high default if the pair is not in the lexicon
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), unknown_level)))

    # sort the substitutes based on their CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")


SS_no3_SR_option2bCEFR_ls_robertabase exported to csv in path './predictions/trial/SS_no3_SR_option2bCEFR_ls_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no3_SR_option2bCEFR_ls_robertabase.tsv --output_file ./output/trial/SS_no3_SR_option2bCEFR_ls_robertabase.tsv

#### EFLLex dataset: based on most frequent CEFR level (SR_option2c):
A substitute receives a CEFR level if the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_mostfreq.tsv' file, and
if the POS tag of that word (as listed in './cefr_efllex/EFLLex_mostfreq.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute). Otherwise the substitute is assigned the value 7, so that it is ranked after all substitutes with a CEFR level.

In [16]:

# Substitute Ranking with the EFLLex "most frequent CEFR level" lexicon (SR_option2c):
# re-rank the substitutes of every prediction row by CEFR level (easiest first) and export them.

# name and path of the exported ranking; the confirmation message is built from the same
# variables so that it can never report a different file than the one that was written
output_name = 'SS_no3_SR_option2cCEFR_efl_mostfreq_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# value given to substitutes without a usable CEFR level; it is larger than C2 (6), so they are ranked last
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows whose level is not covered by the mapping (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['cefr']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, pos) pair, or the high default if the pair is not in the lexicon
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), unknown_level)))

    # sort the substitutes based on their CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")

SS_no3_SR_option2cCEFR_efl_mostfreq_robertabase exported to csv in path './predictions/trial/SS_no3_SR_option2cCEFR_efl_mostfreq_robertabase.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no3_SR_option2cCEFR_efl_mostfreq_robertabase.tsv --output_file ./output/trial/SS_no3_SR_option2cCEFR_efl_mostfreq_robertabase.tsv

#### EFLLex dataset: based on weighted average across CEFR levels (SR_option2d):

In [17]:
# Substitute Ranking with the EFLLex "weighted average CEFR level" lexicon (SR_option2d):
# re-rank the substitutes of every prediction row by weighted CEFR level (lowest first) and export them.

# name and path of the exported ranking (the spelling 'weigthed' is kept on purpose: the tsar_eval
# command below this cell reads the file under exactly this name); the confirmation message is
# built from the same variables so that it reports the path that was really written
output_name = 'SS_no3_SR_option2dCEFR_efl_weigthed_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# value given to substitutes without a usable CEFR level, so that they are ranked last
# (assumes every weighted level in the lexicon is below 7 -- TODO confirm)
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows without a level (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['Weighted CEFR']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # weighted CEFR level of the (lemma, pos) pair, or the high default if the pair is not in the lexicon
        substitutes_cefr.append((substitute, cefr_lookup.get((substitute_lemma, sub_pos), unknown_level)))

    # sort the substitutes based on their weighted CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")


SS_no3_SR_option2dCEFR_efl_weigthed_robertabase exported to csv in path './predictions/trial/SS_no3_SR_option2dCEFR_efl_weigthed_robertabase'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no3_SR_option2dCEFR_efl_weigthed_robertabase.tsv --output_file ./output/trial/SS_no3_SR_option2dCEFR_efl_weigthed_robertabase.tsv

#### all CEFR datasets combined: unique word-postag groups with duplicate CEFR scores have been averaged (SR_option2e):
The code also computes the average number of substitutes per instance that were not found in the combined CEFR database (these are assigned the value 7).

In [18]:
# Substitute Ranking with the combined CEFR lexicon (SR_option2e):
# re-rank the substitutes of every prediction row by CEFR level (lowest first), export them,
# and count how many substitutes could not be found in the lexicon.

# name and path of the exported ranking; the confirmation message is built from the same
# variables so that it reports the path that was really written
output_name = 'SS_no3_SR_option2eCEFR_all_robertabase'
output_path = f'./predictions/trial/{output_name}.tsv'

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# value given to substitutes without a usable CEFR level, so that they are ranked last
# (assumes every level in the lexicon is below 7 -- TODO confirm)
unknown_level = 7

# index the lexicon by (word, pos): a word that is listed under several POS tags is then matched on
# the row whose tag agrees with the substitute, instead of only on the first row of that word.
# Rows without a level (NaN) are left out, because NaN keys make the sort unreliable.
cefr_lookup = {}
for word, pos, level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['Weighted CEFR']):
    if pd.notna(level):
        cefr_lookup.setdefault((word, pos), level)  # the first row wins for a duplicated (word, pos) pair

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
count_7 = 0  # counter of substitutes with level 7
total_rows = 0  # counter of total number of rows
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # skip empty cells, so that a row with fewer than ten substitutes does not break str.replace below
    substitutes = [substitute for substitute in row[2:12] if pd.notna(substitute)]

    substitutes_cefr = []
    for substitute in substitutes:
        # lemmatize the substitute on its own with spaCy; a substitute that is not a single token is kept unchanged
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute

        # get the pos of the substitute by tagging the sentence where the complex word is replaced by the substitute
        # NOTE(review): the lookup is an exact string match, so the 'pos' column of the lexicon is assumed
        # to use the same tagset as nltk's pos_tag -- TODO confirm
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # CEFR level of the (lemma, pos) pair; fall back to the high default if the pair is not in the lexicon
        level = cefr_lookup.get((substitute_lemma, sub_pos))
        if level is None:
            level = unknown_level
            count_7 += 1  # add 1 to the counter per substitute with level 7
        substitutes_cefr.append((substitute, level))

    total_rows += 1  # add 1 to the total number of rows

    # sort the substitutes based on their weighted CEFR levels (sorted() is stable: ties keep their original order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv(output_path, sep='\t', index=False, header=False)
print(f"{output_name} exported to csv in path '{output_path}'\n")

# calculate the average number of substitutes with level 7 (0.0 for an empty predictions file instead of a ZeroDivisionError)
average_7 = count_7 / total_rows if total_rows else 0.0
# print(f"On average, there were {average_7} substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.")

SS_no3_SR_option2eCEFR_all_robertabase exported to csv in path './predictions/trial/SS_no3_SR_option2eCEFR_all_robertabase'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no3_SR_option2eCEFR_all_robertabase.tsv --output_file ./output/trial/SS_no3_SR_option2eCEFR_all_robertabase.tsv

### for model SS_phase2_option3f_BSrobertalarge_electralarge (No. 4 ranked after SS step):

#### for CEFR-J dataset (SR_option2a):
If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [26]:
# Substitute Ranking (SR_option2a) for the electra-large predictions, using CEFR-J.
# For every prediction row, each substitute is lemmatized (spaCy), POS-tagged in the
# sentence where it replaces the complex word (NLTK), and looked up in the CEFR list.
# Substitutes without a matching (lemma, POS) entry get level 7 so they are ranked last.

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
# (labels that are not in the mapping become NaN)
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# build a (word, pos) -> level lookup once instead of scanning the dataframe per substitute.
# Keying on the pair makes a word that is listed under several POS tags match on ANY of its
# rows, not only on the first one; drop_duplicates keeps the first row of each pair.
cefr_lookup = (
    cefr_df.drop_duplicates(subset=['word', 'pos'])
    .set_index(['word', 'pos'])['cefr']
    .to_dict()
)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # map each substitute to its CEFR level, or to a high number if it doesn't have one
    substitutes_cefr = []
    for substitute in substitutes:
        # lemma of the substitute parsed in isolation; fall back to the surface form
        # when spaCy does not keep the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS of the substitute in the sentence where it replaces the complex word
        # NOTE(review): this is the NLTK tag of the inflected form (e.g. a plural or
        # participle tag); if the CEFR file lists lemmas under base-form tags only,
        # inflected substitutes can never match -- confirm against the file.
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        cefr_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if cefr_level is None or pd.isna(cefr_level):
            cefr_level = 7  # high value: no CEFR level for this (lemma, pos), so rank it last
        substitutes_cefr.append((substitute, cefr_level))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2aCEFR_J_electralarge.tsv', sep='\t', index=False, header=False)
print("SS_no4_SR_option2aCEFR_J_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2aCEFR_J_electralarge.tsv'\n")

SS_no4_SR_option2aCEFR_J_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2aCEFR_J_electralarge.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no4_SR_option2aCEFR_J_electralarge.tsv --output_file ./output/trial/SS_no4_SR_option2aCEFR_J_electralarge.tsv

## for Uchida et al. (CEFR-LS) dataset (SR_option2b):
If the lemmatized version of the substitute is found in the './cefr_ls/uchida_pos.tsv' file, and
If the POS tag of that word (as listed in './cefr_ls/uchida_pos.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [27]:
# Substitute Ranking (SR_option2b) for the electra-large predictions, using the
# Uchida et al. (CEFR-LS) list. Each substitute is lemmatized (spaCy), POS-tagged in the
# sentence where it replaces the complex word (NLTK), and looked up in the CEFR list.
# Substitutes without a matching (lemma, POS) entry get level 7 so they are ranked last.

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_ls/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
# (labels that are not in the mapping become NaN)
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# build a (word, pos) -> level lookup once instead of scanning the dataframe per substitute.
# Keying on the pair makes a word that is listed under several POS tags match on ANY of its
# rows, not only on the first one; drop_duplicates keeps the first row of each pair.
cefr_lookup = (
    cefr_df.drop_duplicates(subset=['word', 'pos'])
    .set_index(['word', 'pos'])['cefr']
    .to_dict()
)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # map each substitute to its CEFR level, or to a high number if it doesn't have one
    substitutes_cefr = []
    for substitute in substitutes:
        # lemma of the substitute parsed in isolation; fall back to the surface form
        # when spaCy does not keep the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS of the substitute in the sentence where it replaces the complex word
        # NOTE(review): this is the NLTK tag of the inflected form; if uchida_pos.tsv
        # uses a different tag inventory or base-form tags only, nothing will match --
        # confirm against the file.
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # a level is used only if uchida_pos.tsv has a row for this exact (lemma, pos) pair
        cefr_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if cefr_level is None or pd.isna(cefr_level):
            cefr_level = 7  # high value: no CEFR level for this (lemma, pos), so rank it last
        substitutes_cefr.append((substitute, cefr_level))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2bCEFR_ls_electralarge.tsv', sep='\t', index=False, header=False)
print("SS_no4_SR_option2bCEFR_ls_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2bCEFR_ls_electralarge.tsv'\n")

SS_no4_SR_option2bCEFR_ls_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2bCEFR_ls_electralarge.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no4_SR_option2bCEFR_ls_electralarge.tsv --output_file ./output/trial/SS_no4_SR_option2bCEFR_ls_electralarge.tsv

### EFLLex dataset: based on most frequent CEFR level (SR_option2c):
If the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_mostfreq.tsv' file, and
If the POS tag of that word (as listed in './cefr_efllex/EFLLex_mostfreq.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [28]:
# Substitute Ranking (SR_option2c) for the electra-large predictions, using the
# most-frequent-level EFLLex list. Each substitute is lemmatized (spaCy), POS-tagged in the
# sentence where it replaces the complex word (NLTK), and looked up in the CEFR list.
# Substitutes without a matching (lemma, POS) entry get level 7 so they are ranked last.

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
# (labels that are not in the mapping become NaN)
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# build a (word, pos) -> level lookup once instead of scanning the dataframe per substitute.
# Keying on the pair makes a word that is listed under several POS tags match on ANY of its
# rows, not only on the first one; drop_duplicates keeps the first row of each pair.
cefr_lookup = (
    cefr_df.drop_duplicates(subset=['word', 'pos'])
    .set_index(['word', 'pos'])['cefr']
    .to_dict()
)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # map each substitute to its CEFR level, or to a high number if it doesn't have one
    substitutes_cefr = []
    for substitute in substitutes:
        # lemma of the substitute parsed in isolation; fall back to the surface form
        # when spaCy does not keep the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS of the substitute in the sentence where it replaces the complex word
        # NOTE(review): this is the NLTK tag of the inflected form; if EFLLex_mostfreq.tsv
        # uses a different tag inventory or base-form tags only, nothing will match --
        # confirm against the file.
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # a level is used only if EFLLex_mostfreq.tsv has a row for this exact (lemma, pos) pair
        cefr_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if cefr_level is None or pd.isna(cefr_level):
            cefr_level = 7  # high value: no CEFR level for this (lemma, pos), so rank it last
        substitutes_cefr.append((substitute, cefr_level))

    # sort the substitutes based on their CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2cCEFR_efl_mostfreq_electralarge.tsv', sep='\t', index=False, header=False)
print("SS_no4_SR_option2cCEFR_efl_mostfreq_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2cCEFR_efl_mostfreq_electralarge.tsv'\n")

SS_no4_SR_option2cCEFR_efl_mostfreq_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2cCEFR_efl_mostfreq_electralarge.tsv'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no4_SR_option2cCEFR_efl_mostfreq_electralarge.tsv --output_file ./output/trial/SS_no4_SR_option2cCEFR_efl_mostfreq_electralarge.tsv

### EFLLex dataset: based on weighted average across CEFR levels (SR_option2d):

In [29]:
# Substitute Ranking (SR_option2d) for the electra-large predictions, using the
# weighted-average EFLLex list. Each substitute is lemmatized (spaCy), POS-tagged in the
# sentence where it replaces the complex word (NLTK), and looked up in the CEFR list.
# Substitutes without a matching (lemma, POS) entry get level 7 so they are ranked last.

# read the CEFR levels file into a dataframe (the third column is used as a number as-is)
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# build a (word, pos) -> weighted level lookup once instead of scanning the dataframe per
# substitute. Keying on the pair makes a word that is listed under several POS tags match on
# ANY of its rows, not only on the first one; drop_duplicates keeps the first row of each pair.
cefr_lookup = (
    cefr_df.drop_duplicates(subset=['word', 'pos'])
    .set_index(['word', 'pos'])['Weighted CEFR']
    .to_dict()
)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # map each substitute to its weighted CEFR level, or to a high number if it doesn't have one
    substitutes_cefr = []
    for substitute in substitutes:
        # lemma of the substitute parsed in isolation; fall back to the surface form
        # when spaCy does not keep the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS of the substitute in the sentence where it replaces the complex word
        # NOTE(review): this is the NLTK tag of the inflected form; if EFLLex_weighted.tsv
        # uses a different tag inventory or base-form tags only, nothing will match --
        # confirm against the file.
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # a level is used only if EFLLex_weighted.tsv has a row for this exact (lemma, pos) pair
        cefr_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if cefr_level is None or pd.isna(cefr_level):
            cefr_level = 7  # high value: no CEFR level for this (lemma, pos), so rank it last
        substitutes_cefr.append((substitute, cefr_level))

    # sort the substitutes based on their weighted CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2dCEFR_efl_weigthed_electralarge.tsv', sep='\t', index=False, header=False)
print("SS_no4_SR_option2dCEFR_efl_weigthed_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2dCEFR_efl_weigthed_electralarge'\n")

SS_no4_SR_option2dCEFR_efl_weigthed_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2dCEFR_efl_weigthed_electralarge'



python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no4_SR_option2dCEFR_efl_weigthed_electralarge.tsv --output_file ./output/trial/SS_no4_SR_option2dCEFR_efl_weigthed_electralarge.tsv

### all CEFR datasets combined: unique word-postag groups with duplicate CEFR scores have been averaged (SR_option2e):
The code also computes the average number of substitutes per row that were not found in the combined CEFR database (i.e. that were assigned level 7).

In [12]:
# Substitute Ranking (SR_option2e) for the electra-large predictions, using the combined
# CEFR list. Each substitute is lemmatized (spaCy), POS-tagged in the sentence where it
# replaces the complex word (NLTK), and looked up in the CEFR list. Substitutes without a
# matching (lemma, POS) entry get level 7 so they are ranked last; they are also counted.

# read the CEFR levels file into a dataframe (the third column is used as a number as-is)
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# build a (word, pos) -> weighted level lookup once instead of scanning the dataframe per
# substitute. Keying on the pair makes a word that is listed under several POS tags match on
# ANY of its rows, not only on the first one; drop_duplicates keeps the first row of each pair.
cefr_lookup = (
    cefr_df.drop_duplicates(subset=['word', 'pos'])
    .set_index(['word', 'pos'])['Weighted CEFR']
    .to_dict()
)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
count_7 = 0  # counter of substitutes with level 7
total_rows = 0  # counter of total number of rows
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # map each substitute to its weighted CEFR level, or to a high number if it doesn't have one
    substitutes_cefr = []
    for substitute in substitutes:
        # lemma of the substitute parsed in isolation; fall back to the surface form
        # when spaCy does not keep the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS of the substitute in the sentence where it replaces the complex word
        # NOTE(review): this is the NLTK tag of the inflected form; if cefr_all_combined.tsv
        # lists lemmas under base-form tags only, inflected substitutes can never match --
        # confirm against the file.
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # a level is used only if cefr_all_combined.tsv has a row for this exact (lemma, pos) pair
        cefr_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if cefr_level is None or pd.isna(cefr_level):
            cefr_level = 7  # high value: no CEFR level for this (lemma, pos), so rank it last
            count_7 += 1  # add 1 to the counter per substitute with level 7
        substitutes_cefr.append((substitute, cefr_level))

    total_rows += 1  # add 1 to the total number of rows

    # sort the substitutes based on their weighted CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2eCEFR_all_electralarge.tsv', sep='\t', index=False, header=False)
print("SS_no4_SR_option2eCEFR_all_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2eCEFR_all_electralarge'\n")

# calculate the average number of substitutes with level 7 (0.0 for an empty predictions file)
average_7 = count_7 / total_rows if total_rows else 0.0
# print(f"On average, there were {average_7} substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.")

substitutes_cefr ranked on weighted average: [('free', 1.9211746218154655), ('necessary', 3.097985119380673), ('illegal', 3.154739395588201), ('essential', 3.961447810323761), ('optional', 4.0), ('voluntary', 4.515403311860254), ('mandatory', 5.0), ('required', 7), ('mandated', 7), ('prohibited', 7)]

substitutes_cefr ranked on weighted average: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

substitutes_cefr ranked on weighted average: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

substitutes_cefr ranked on weighted average: [('police', 2.411472933295504), ('monitors', 7), ('diplomats', 7), ('experts', 7), ('observations', 7), ('reporters', 7), ('witnesses', 7), ('officials', 7), ('specialists', 7), ('analysts', 7)]

substitutes_cefr ranked on weighted 

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SS_no4_SR_option2eCEFR_all_electralarge.tsv --output_file ./output/trial/SS_no4_SR_option2eCEFR_all_electralarge.tsv

In [None]:
#averaging cefr levels across all substitutes

In [28]:
# Substitute Ranking (SR_option2e) on the combined CEFR list, plus two summary statistics:
# the average number of substitutes per row that received level 7, and the average CEFR
# level over all substitutes that did receive a level.

# read the CEFR levels file into a dataframe (the third column is used as a number as-is)
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# build a (word, pos) -> weighted level lookup once instead of scanning the dataframe per
# substitute. Keying on the pair makes a word that is listed under several POS tags match on
# ANY of its rows, not only on the first one; drop_duplicates keeps the first row of each pair.
cefr_lookup = (
    cefr_df.drop_duplicates(subset=['word', 'pos'])
    .set_index(['word', 'pos'])['Weighted CEFR']
    .to_dict()
)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
count_7 = 0  # counter of substitutes with level 7
total_rows = 0  # counter of total number of rows

total_cefr_levels = []  # to put all cefr levels across all rows in

for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    cefr_levels = []  # to put the cefr levels for the current row in

    # map each substitute to its weighted CEFR level, or to a high number if it doesn't have one
    substitutes_cefr = []
    for substitute in substitutes:
        # lemma of the substitute parsed in isolation; fall back to the surface form
        # when spaCy does not keep the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS of the substitute in the sentence where it replaces the complex word
        # NOTE(review): this is the NLTK tag of the inflected form; if cefr_all_combined.tsv
        # lists lemmas under base-form tags only, inflected substitutes can never match --
        # confirm against the file.
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        # a level is used only if cefr_all_combined.tsv has a row for this exact (lemma, pos) pair
        cefr_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if cefr_level is None or pd.isna(cefr_level):
            substitutes_cefr.append((substitute, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
            count_7 += 1  # add 1 to the counter per substitute with level 7
        else:
            substitutes_cefr.append((substitute, cefr_level))
            cefr_levels.append(cefr_level)  # only real levels enter the average

    total_rows += 1  # add 1 to the total number of rows

    # sort the substitutes based on their weighted CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])

    print(f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")
    sorted_cefr_levels = sorted(cefr_levels)
    print(f"sorted_cefr_levels: {sorted_cefr_levels}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

    total_cefr_levels.extend(cefr_levels)  # add cefr levels of the current row to the total cefr levels


# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2eCEFR_all_electralarge.tsv', sep='\t', index=False, header=False)
print("SS_no4_SR_option2eCEFR_all_electralarge exported to csv in path './predictions/trial/SS_no4_SR_option2eCEFR_all_electralarge'\n")

# calculate the average number of substitutes with level 7 (0.0 for an empty predictions file)
average_7 = count_7 / total_rows if total_rows else 0.0
print(f"On average, there were {average_7} substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.")

# calculate average CEFR level over all rows
if total_cefr_levels:
    average_cefr_level_total = sum(total_cefr_levels) / len(total_cefr_levels)
else:
    average_cefr_level_total = None

print(f"Average CEFR level over all rows: {average_cefr_level_total}")



substitutes_cefr ranked on weighted average: [('free', 1.9211746218154655), ('necessary', 3.097985119380673), ('illegal', 3.154739395588201), ('essential', 3.961447810323761), ('optional', 4.0), ('voluntary', 4.515403311860254), ('mandatory', 5.0), ('required', 7), ('mandated', 7), ('prohibited', 7)]

sorted_cefr_levels: [1.9211746218154655, 3.097985119380673, 3.154739395588201, 3.961447810323761, 4.0, 4.515403311860254, 5.0]

substitutes_cefr ranked on weighted average: [('infused', 7), ('fed', 7), ('blessed', 7), ('impressed', 7), ('captured', 7), ('reinforced', 7), ('packed', 7), ('stunned', 7), ('surprised', 7), ('cultivated', 7)]

sorted_cefr_levels: []

substitutes_cefr ranked on weighted average: [('freaks', 7), ('thugs', 7), ('nazis', 7), ('monsters', 7), ('devotees', 7), ('minions', 7), ('criminals', 7), ('operators', 7), ('mania', 7), ('machines', 7)]

sorted_cefr_levels: []

substitutes_cefr ranked on weighted average: [('police', 2.411472933295504), ('monitors', 7), ('diplo

In [None]:
#averaging cefr levels across all complex words (however, pos tag not taken into account)

In [35]:
# Substitute Ranking (SR_option2e) on the combined CEFR list, plus the average CEFR level of
# the complex words (looked up by surface form only, POS not taken into account) and of the
# substitutes that received a level (level 7 excluded).

# read the CEFR levels file into a dataframe (the third column is used as a number as-is)
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# (word, pos) -> level lookup for the substitutes. Keying on the pair makes a word that is
# listed under several POS tags match on ANY of its rows, not only on the first one.
cefr_lookup = (
    cefr_df.drop_duplicates(subset=['word', 'pos'])
    .set_index(['word', 'pos'])['Weighted CEFR']
    .to_dict()
)
# word -> level lookup for the complex words: level of the first row listed for the word
word_lookup = (
    cefr_df.drop_duplicates(subset='word')
    .set_index('word')['Weighted CEFR']
    .to_dict()
)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
cefr_levels = []  # list to store CEFR levels of all substitutes (except those with level 7)
complex_word_cefr_levels = []  # list to store CEFR levels of the complex words

for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # get the CEFR level of the complex word and append it to complex_word_cefr_levels
    if complex_word in word_lookup:
        complex_word_cefr = word_lookup[complex_word]
        complex_word_cefr_levels.append(complex_word_cefr)

    # map each substitute to its weighted CEFR level, or to a high number if it doesn't have one
    substitutes_cefr = []
    for substitute in substitutes:
        # lemma of the substitute parsed in isolation; fall back to the surface form
        # when spaCy does not keep the substitute as a single token
        doc_substitute = nlp(substitute)
        if substitute in [token.text for token in doc_substitute]:
            substitute_lemma = doc_substitute[0].lemma_
        else:
            substitute_lemma = substitute

        # POS of the substitute in the sentence where it replaces the complex word
        # NOTE(review): this is the NLTK tag of the inflected form; if cefr_all_combined.tsv
        # lists lemmas under base-form tags only, inflected substitutes can never match --
        # confirm against the file.
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(substitute)

        cefr_level = cefr_lookup.get((substitute_lemma, sub_pos))
        if cefr_level is None or pd.isna(cefr_level):
            cefr_level = 7  # high value: no CEFR level for this (lemma, pos), so rank it last
        substitutes_cefr.append((substitute, cefr_level))

    # sort the substitutes based on their weighted CEFR levels (stable: ties keep the SS order)
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

    # collect the CEFR levels of the substitutes, except those with level 7
    cefr_levels.extend(cefr for sub, cefr in ranked_cefr_subs if cefr < 7)


# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2eCEFR_all_electralarge.tsv', sep='\t', index=False, header=False)

# calculate average CEFR level of the complex words
if complex_word_cefr_levels:
    average_complex_word_cefr = sum(complex_word_cefr_levels) / len(complex_word_cefr_levels)
    print(f"Average CEFR level for complex words: {average_complex_word_cefr}")
else:
    print("No CEFR level found for the complex words.")

# calculate average CEFR level of the substitutes (except those with level 7)
if cefr_levels:
    average_cefr_level = sum(cefr_levels) / len(cefr_levels)
    print(f"Average CEFR level for substitutes (except level 7): {average_cefr_level}")
else:
    print("No CEFR level found for the substitutes.")

Average CEFR level for complex words: 4.75
Average CEFR level for substitutes (except level 7): 3.8730174455392214


In [None]:
# Averaging CEFR levels across all complex words (with the POS tag taken into account)

In [39]:
# Substitute Ranking (SR) by weighted CEFR level, taking the POS tag into account.
# For every row of the predictions file, each substitute is looked up in the CEFR list by
# (lemma, POS tag); the substitutes are re-ranked from lowest to highest level and the
# re-ranked rows are written to a TSV file. The average level of the complex words and of
# the substitutes found in the list is printed at the end.

# level given to substitutes that are not in the CEFR list (or are listed under another POS)
# NOTE(review): assumes every listed 'Weighted CEFR' value is below 7 — confirm against the file
UNKNOWN_CEFR = 7


def _lemma_or_self(word):
    """Return the spaCy lemma of `word`, or `word` unchanged when spaCy does not keep it as one token."""
    doc = nlp(word)  # parse once; the original parsed the same string twice
    if word in [token.text for token in doc]:
        return doc[0].lemma_
    return word


def _nltk_pos(sentence, word):
    """Return the NLTK POS tag of `word` inside `sentence`, or None when `word` is not one of its tokens."""
    # dict() keeps the tag of the last occurrence when the token appears more than once
    return dict(pos_tag(word_tokenize(sentence))).get(word)


# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# Index the CEFR list by (word, pos). Checking only the first row of a word would miss
# words that are listed under several POS tags; setdefault keeps the first row of each pair.
cefr_by_word_pos = {}
for listed_word, listed_pos, listed_level in zip(cefr_df['word'], cefr_df['pos'], cefr_df['Weighted CEFR']):
    cefr_by_word_pos.setdefault((listed_word, listed_pos), listed_level)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/trial/SS_phase2_option3f_BSrobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
cefr_levels = []  # list to store CEFR levels of all substitutes (except those with level 7)
complex_word_cefr_levels = []  # list to store CEFR levels of the complex words

for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    # rows with fewer than 10 substitutes are read with NaN in the empty cells; skip those
    substitutes = row[2:12].dropna()

    # get the CEFR level of the complex word (lemma + POS in the original sentence)
    complex_word_pos = _nltk_pos(sentence, complex_word)
    complex_word_lemma = _lemma_or_self(complex_word)
    complex_word_cefr = cefr_by_word_pos.get((complex_word_lemma, complex_word_pos))
    if complex_word_cefr is not None:
        complex_word_cefr_levels.append(complex_word_cefr)

    # get the CEFR level of every substitute; its POS is tagged in the sentence
    # where the complex word has been replaced by that substitute
    substitutes_cefr = []
    for substitute in substitutes:
        sub_sentence = sentence.replace(complex_word, substitute)
        sub_pos = _nltk_pos(sub_sentence, substitute)
        sub_cefr = cefr_by_word_pos.get((_lemma_or_self(substitute), sub_pos))
        substitutes_cefr.append((substitute, UNKNOWN_CEFR if sub_cefr is None else sub_cefr))

    # sorted() is stable, so substitutes with the same level keep their original order
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

    # collect the CEFR levels of the substitutes, except those with level 7
    cefr_levels.extend(cefr for sub, cefr in ranked_cefr_subs if cefr < UNKNOWN_CEFR)


# create a new dataframe from the new lists and write it to a new tsv file
# NOTE(review): the output name ('SS_no4_..._electralarge') does not mirror the input name
# ('SS_phase2_option3f_BSrobertalarge_electralarge'), and the preceding cell writes to this
# same path — confirm that overwriting it is intended.
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/trial/SS_no4_SR_option2eCEFR_all_electralarge.tsv', sep='\t', index=False, header=False)

# calculate average CEFR level of the complex words
if complex_word_cefr_levels:
    average_complex_word_cefr = sum(complex_word_cefr_levels) / len(complex_word_cefr_levels)
    print(f"Average CEFR level for complex words: {average_complex_word_cefr}")
else:
    print("No CEFR level found for the complex words.")

# calculate average CEFR level of the substitutes (except those with level 7)
if cefr_levels:
    average_cefr_level = sum(cefr_levels) / len(cefr_levels)
    print(f"Average CEFR level for substitutes (except level 7): {average_cefr_level}")
else:
    print("No CEFR level found for the substitutes.")



Average CEFR level for complex words: 4.75
Average CEFR level for substitutes (except level 7): 3.8730174455392214
