## TEST set:

In [16]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

In [17]:
from nltk.corpus import wordnet as wn
import spacy
nlp = spacy.load("en_core_web_sm")

In [18]:
# map spaCy PoS tags to WordNet PoS tags
def map_pos_spacy_wordnet(pos_spacy):
    pos_map = {
        'NOUN': wn.NOUN,
        'VERB': wn.VERB,
        'ADJ': wn.ADJ,
        'ADV': wn.ADV
    }
    return pos_map.get(pos_spacy, wn.NOUN) # default to NOUN if pos_spacy does not exist in the dict



## for CEFR-J dataset:

### for model  SS_bsRobertalarge_robertabase:

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [6]:
# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    #print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_j.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefr_j exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_j.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefr_j exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_j.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_j.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefr_j.tsv

### for model SS_bsRobertalarge_electralarge:


### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the 'cefrj_all_treebank.tsv' file, and
If the POS tag of that word (as listed in 'cefrj_all_treebank.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [7]:
# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefrj/cefrj_all_treebank.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")
    
    

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in cefrj_all_treebank.tsv AND the POS tag of that word (in cefrj_all_treebank.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_j.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefr_j exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_j.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefr_j exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_j.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_j.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefr_j.tsv

## for Uchida et al dataset (CEFR-LS):

### for model SS_bsRobertalarge_robertabase:

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr/uchida_pos.tsv' file, and
If the POS tag of that word (as listed in './cefr/uchida_pos.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [8]:
# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_ls/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    
    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr/uchida_pos.tsv AND the POS tag of that word ('./cefr/uchida_pos.tsv is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match


    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])



# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_ls.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefr_ls exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_ls.tsv'\n")


SS_bsRobertalarge_robertabase_SR_cefr_ls exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_ls.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_ls.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefr_ls.tsv

### for model SS_bsRobertalarge_electralarge:

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr/uchida_pos.tsv' file, and
If the POS tag of that word (as listed in './cefr/uchida_pos.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [9]:


# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_ls/uchida_pos.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")
    


    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr/uchida_pos.tsv AND the POS tag of that word ('./cefr/uchida_pos.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
 
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])




# create a new dataframe from the new lists and write it to a new TSV file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_ls.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefr_ls exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_ls.tsv'\n")


SS_bsRobertalarge_electralarge_SR_cefr_ls exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_ls.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_ls.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefr_ls.tsv

### EFFLEX dataset: based on most frequent CEFR level:

### for model SS_bsRobertalarge_robertabase::

### with pos tag of the substitutes taken into account:


If the lemmatized version of the substitute is found in the ./cefr_efllex/EFLLex_mostfreq.tsv' file, and
If the POS tag of that word (as listed in './cefr_efllex/EFLLex_mostfreq.tsv) matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [10]:


# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")
    

    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in'./cefr_efllex/EFLLex_mostfreq AND the POS tag of that word ('./cefr_efllex/EFLLex_mostfreq) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_mostfreq.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefr_efl_mostfreq exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_mostfreq.tsv'\n")

SS_bsRobertalarge_robertabase_SR_cefr_efl_mostfreq exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_mostfreq.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_mostfreq.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_mostfreq.tsv

### for model SS_bsRobertalarge_electralarge:

#### with pos tag of substitutes taken into account:


If the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_mostfreq.tsv' file, and
If the POS tag of that word (as listed in ./cefr_efllex/EFLLex_mostfreq.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

In [11]:

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=None, names=['word', 'pos', 'cefr'])

# define a mapping from CEFR levels to numerical values
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# map the CEFR levels in the dataframe to numerical values using the mapping
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping)

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    # map each lemmatized substitute to its CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr_efllex/EFLLex_mostfreq AND the POS tag of that word (in './cefr_efllex/EFLLex_mostfreq) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['cefr'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

         
   

    # sort the substitutes based on their CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print(f"ranked_cefr_subs: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_mostfreq.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefr_efl_mostfreq exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_mostfreq.tsv'\n")

SS_bsRobertalarge_electralarge_SR_cefr_efl_mostfreq exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_mostfreq.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_mostfreq.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_mostfreq.tsv

### EFFLEX dataset: based on weighted average across CEFR levels:

### with pos tag of the substitutes taken into account:

If the lemmatized version of the substitute is found in the './cefr_efllex/EFLLex_weighted.tsv' file, and
If the POS tag of that word (as listed in './cefr_efllex/EFLLex.tsv') matches the POS tag of the original substitute word (as determined by parsing the sentence where the complex word is replaced by the original substitute).

### for model SS_bsRobertalarge_robertabase:

In [19]:

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr_efllex/EFLLex_weighted.tsv AND the POS tag of that word ('./cefr_efllex/EFLLex_weighted.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_weighted.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefr_efl_weighted exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_weighted'\n")


SS_bsRobertalarge_robertabase_SR_cefr_efl_weighted exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_weighted'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_weighted.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefr_efl_weighted.tsv

### for model SS_bsRobertalarge_electralarge:

In [13]:

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr_efllex/EFLLex_weighted.tsv AND the POS tag of that word (in './cefr_efllex/EFLLex_weighted.tsv) is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_weighted.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefr_efl_weighted exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_weighted.tsv'\n")


SS_bsRobertalarge_electralarge_SR_cefr_efl_weighted exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_weighted.tsv'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_weighted.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefr_efl_weighted.tsv

### all CEFR datasets combined: (unique word-postag groups with duplicate CEFR scores have been averaged):  

### for model SS_bsRobertalarge_robertabase:

In [14]:


# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

   
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")

    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr_all/cefr_all_combined.tsv' AND the POS tag of that word (in './cefr_all/cefr_all_combined.tsv') is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_all.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_all'\n")

SS_bsRobertalarge_robertabase_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_all'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_all.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefr_all.tsv

### code updated to present averages of substitutes not found in the combined CEFR database

In [20]:

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
count_7 = 0  # counter of substitutes with level 7
total_rows = 0  # counter of total number of rows
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr_all/cefr_all_combined.tsv' AND the POS tag of that word (in './cefr_all/cefr_all_combined.tsv') is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
            count_7 += 1  # add 1 to the counter per substitute with level 7

    total_rows += 1  # add 1 to the total number of rows

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    #print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_all.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_all'\n")

# calculate the average number of substitutes with level 7
average_7 = count_7 / total_rows
print(f"On average, there were {average_7} substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.")

SS_bsRobertalarge_robertabase_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefr_all'

On average, there were 6.088709677419355 substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.


### for model SS_bsRobertalarge_electralarge:

In [15]:

# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]
   
    # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr_all/cefr_all_combined.tsv' AND the POS tag of that word (in './cefr_all/cefr_all_combined.tsv') is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_all.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_all'\n")

SS_bsRobertalarge_electralarge_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_all'



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_all.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefr_all.tsv

### code updated to present averages of substitutes not found in the combined CEFR database

In [22]:


# read the CEFR levels file into a dataframe
cefr_df = pd.read_csv('./cefr_all/cefr_all_combined.tsv', sep='\t', header=None, names=['word', 'pos', 'Weighted CEFR'])

# read the predictions file into a dataframe
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None)

# for each row in the predictions dataframe, map each substitute to its weighted CEFR level, sort them, and save them into a new list
predictions_cefr = []
count_7 = 0  # counter of substitutes with level 7
total_rows = 0  # counter of total number of rows
for index, row in pred_df.iterrows():
    sentence = row[0]
    complex_word = row[1]
    substitutes = row[2:12]

    
   # replace the complex word in the sentence with the substitute, and parse it to get the pos tag of the substitute
    substitute_pos = []
    for substitute in substitutes:
        replaced_sentence = sentence.replace(complex_word, substitute)
        doc = nlp(replaced_sentence)
        pos = [token.pos_ for token in doc if token.text == substitute][0]
        substitute_pos.append((substitute, pos))
    
    # get the lemma of the substitute based on its pos tag
    substitutes_lemmas = []
    for sub_pos in substitute_pos:
        substitute, pos_spacy = sub_pos
        pos_substitute_wordnet = map_pos_spacy_wordnet(pos_spacy)
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_ if substitute in [token.text for token in doc_substitute] else substitute
        substitutes_lemmas.append((substitute, substitute_lemma))
    # print(f"Substitutes with their lemmas: {substitutes_lemmas}\n")


    # map each lemmatized substitute to its weighted CEFR level, or to a high number if it doesn't have a CEFR level
    substitutes_cefr = []
    for original, lemmatized in substitutes_lemmas:
        # get the pos of the original substitute by parsing the sentence where the complex word is replaced by the substitute
        sub_sentence = sentence.replace(complex_word, original)
        sub_pos = dict(pos_tag(word_tokenize(sub_sentence))).get(original)
        # if the lemmatized substitute equals a word that is found in './cefr_all/cefr_all_combined.tsv' AND the POS tag of that word (in './cefr_all/cefr_all_combined.tsv') is the same as the POS tag of the substitute:
        if lemmatized in cefr_df['word'].values and cefr_df[cefr_df['word'] == lemmatized]['pos'].values[0] == sub_pos:
            substitutes_cefr.append((original, cefr_df[cefr_df['word'] == lemmatized]['Weighted CEFR'].values[0]))
        else:
            substitutes_cefr.append((original, 7))  # assign a high value if it doesn't have a CEFR level or if pos don't match
            count_7 += 1  # add 1 to the counter per substitute with level 7

    total_rows += 1  # add 1 to the total number of rows

    # sort the substitutes based on their weighted CEFR levels
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda x: x[1])
    # print (f"substitutes_cefr ranked on weighted average: {ranked_cefr_subs}\n")

    # append the sorted list of substitutes to the new lists, keeping original form
    predictions_cefr.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(predictions_cefr)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_all.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_all'\n")

# calculate the average number of substitutes with level 7
average_7 = count_7 / total_rows
print(f"On average, there were {average_7} substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.")

SS_bsRobertalarge_electralarge_SR_cefr_all exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefr_all'

On average, there were 5.830645161290323 substitutes of the provided 10 substitutes that had not been found in the combined CEFR dataset.
