## TEST set:

## for CEFR-J dataset:

### for model SG_MA_SS_bsRobertalarge_robertabase:

### lemmatized: with spaCy

In [3]:
import pandas as pd
import spacy

# Re-rank the substitutes predicted by SG_MA_SS_bsRobertalarge_robertabase using the
# CEFR-J word list: each substitute is lemmatized with spaCy, looked up in the list, and
# the substitutes of every row are stably sorted from lowest to highest CEFR rank.

# Load the English spaCy model (only the lemma of the first token is used below)
nlp = spacy.load('en_core_web_sm')

# mapping from cefr levels to numerical ranks; words without a usable level get UNKNOWN_CEFR_RANK
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}
UNKNOWN_CEFR_RANK = 7

# text -> lemma cache: the same substitute shows up in many rows, and spaCy returns the
# same lemma for the same isolated string, so every distinct string is analysed only once
_lemma_cache = {}


def _first_token_lemma(text):
    """Return the spaCy lemma of the first token of `text`.

    Missing values (anything that is not a string) and blank strings yield '' instead of
    raising, so short prediction rows or empty cells cannot crash the ranking.
    """
    if not isinstance(text, str) or not text.strip():
        return ''
    if text not in _lemma_cache:
        doc = nlp(text)
        _lemma_cache[text] = doc[0].lemma_ if len(doc) else ''
    return _lemma_cache[text]


# read the cefr levels file into a dataframe
# dtype=str + keep_default_na=False keep literal words such as "null", "nan" or "NA" as
# strings instead of letting pandas turn them into NaN
cefr_df = pd.read_csv('./cefrj/cefrj_all.tsv', sep='\t', header=None, names=['word', 'cefr'],
                      dtype=str, keep_default_na=False)

# map the cefr levels to numerical ranks; labels missing from the mapping would become NaN
# (and NaN sort keys make the ordering unreliable), so they get the "unknown" rank instead
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping).fillna(UNKNOWN_CEFR_RANK).astype(int)

# lemmatize the words and then convert the cefr dataframe into a dictionary for efficient lookups
# NOTE: when several entries share a lemma, the last one in the file wins
cefr_df['word'] = cefr_df['word'].apply(_first_token_lemma)
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()
cefr_dict.pop('', None)  # a blank/missing word must never match a substitute

# read the predictions file into a dataframe (same NA handling as above; empty cells become '')
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None,
                      dtype=str, keep_default_na=False)

# for each row: column 0 is the sentence, column 1 the complex word, columns 2-11 the substitutes
new_lists = []
for _, row in pred_df.iterrows():
    sentence = row.iloc[0]
    complex_word = row.iloc[1]
    substitutes = row.iloc[2:12]

    # pair every substitute (original form) with the cefr rank of its lemma,
    # or with the "unknown" rank if the lemma is not in the cefr dictionary
    substitutes_cefr = [(sub, cefr_dict.get(_first_token_lemma(sub), UNKNOWN_CEFR_RANK)) for sub in substitutes]

    # sorted() is stable, so substitutes with the same rank keep their original order
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda pair: pair[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lemSpacy.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefrj_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lemSpacy.tsv'\n")




SS_bsRobertalarge_robertabase_SR_cefrj_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lemSpacy.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefrj_lemSpacy.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefrj_lemSpacy.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:


### lemmatized: with spaCy

In [4]:
import pandas as pd
import spacy

# Re-rank the substitutes predicted by SG_MA_SS_bsRobertalarge_electralarge using the
# CEFR-J word list: each substitute is lemmatized with spaCy, looked up in the list, and
# the substitutes of every row are stably sorted from lowest to highest CEFR rank.

# Load the English spaCy model (only the lemma of the first token is used below)
nlp = spacy.load('en_core_web_sm')

# mapping from cefr levels to numerical ranks; words without a usable level get UNKNOWN_CEFR_RANK
cefr_level_mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}
UNKNOWN_CEFR_RANK = 7

# text -> lemma cache: the same substitute shows up in many rows, and spaCy returns the
# same lemma for the same isolated string, so every distinct string is analysed only once
_lemma_cache = {}


def _first_token_lemma(text):
    """Return the spaCy lemma of the first token of `text`.

    Missing values (anything that is not a string) and blank strings yield '' instead of
    raising, so short prediction rows or empty cells cannot crash the ranking.
    """
    if not isinstance(text, str) or not text.strip():
        return ''
    if text not in _lemma_cache:
        doc = nlp(text)
        _lemma_cache[text] = doc[0].lemma_ if len(doc) else ''
    return _lemma_cache[text]


# read the cefr levels file into a dataframe
# dtype=str + keep_default_na=False keep literal words such as "null", "nan" or "NA" as
# strings instead of letting pandas turn them into NaN
cefr_df = pd.read_csv('./cefrj/cefrj_all.tsv', sep='\t', header=None, names=['word', 'cefr'],
                      dtype=str, keep_default_na=False)

# map the cefr levels to numerical ranks; labels missing from the mapping would become NaN
# (and NaN sort keys make the ordering unreliable), so they get the "unknown" rank instead
cefr_df['cefr'] = cefr_df['cefr'].map(cefr_level_mapping).fillna(UNKNOWN_CEFR_RANK).astype(int)

# lemmatize the words and then convert the cefr dataframe into a dictionary for efficient lookups
# NOTE: when several entries share a lemma, the last one in the file wins
cefr_df['word'] = cefr_df['word'].apply(_first_token_lemma)
cefr_dict = cefr_df.set_index('word')['cefr'].to_dict()
cefr_dict.pop('', None)  # a blank/missing word must never match a substitute

# read the predictions file into a dataframe (same NA handling as above; empty cells become '')
pred_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None,
                      dtype=str, keep_default_na=False)

# for each row: column 0 is the sentence, column 1 the complex word, columns 2-11 the substitutes
new_lists = []
for _, row in pred_df.iterrows():
    sentence = row.iloc[0]
    complex_word = row.iloc[1]
    substitutes = row.iloc[2:12]

    # pair every substitute (original form) with the cefr rank of its lemma,
    # or with the "unknown" rank if the lemma is not in the cefr dictionary
    substitutes_cefr = [(sub, cefr_dict.get(_first_token_lemma(sub), UNKNOWN_CEFR_RANK)) for sub in substitutes]

    # sorted() is stable, so substitutes with the same rank keep their original order
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda pair: pair[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lemSpacy.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_electralarge_SR_cefrj_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lemSpacy.tsv'\n")


SS_bsRobertalarge_electralarge_SR_cefrj_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lemSpacy.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefrj_lemSpacy.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefrj_lemSpacy.tsv

## for Uchida et al dataset:

### for model SG_MA_SS_bsRobertalarge_robertabase:

### lemmatized:

In [5]:
import pandas as pd
import spacy

# Re-rank the substitutes predicted by SG_MA_SS_bsRobertalarge_robertabase using the CEFR
# levels found in ./cefr/uchida.tsv: each substitute is lemmatized with spaCy, looked up in
# a lemma -> rank dictionary, and the substitutes of every row are stably sorted by rank.

# Load the English spaCy model (only the lemma of the first token is used below)
nlp = spacy.load('en_core_web_sm')

# convert CEFR levels to numerical values for ranking; anything else gets UNKNOWN_CEFR_RANK
cefr_level_mapping = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
UNKNOWN_CEFR_RANK = 7

# text -> lemma cache: the same substitute shows up in many rows, and spaCy returns the
# same lemma for the same isolated string, so every distinct string is analysed only once
_lemma_cache = {}


def _first_token_lemma(text):
    """Return the spaCy lemma of the first token of `text`.

    Missing values (anything that is not a string) and blank strings yield '' instead of
    raising, so short prediction rows or empty cells cannot crash the ranking.
    """
    if not isinstance(text, str) or not text.strip():
        return ''
    if text not in _lemma_cache:
        doc = nlp(text)
        _lemma_cache[text] = doc[0].lemma_ if len(doc) else ''
    return _lemma_cache[text]


# read cefr file (first 21 columns only; default type inference is kept so that the
# numeric comparison `label == 1` below keeps working)
cefr_df = pd.read_csv('./cefr/uchida.tsv', sep='\t', header=None, usecols=range(21))

# read prediction file; dtype=str + keep_default_na=False keep literal words such as "null"
# or "nan" as strings, and empty cells become '' instead of NaN
predictions_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_robertabase.tsv', sep='\t', header=None,
                             dtype=str, keep_default_na=False)

# create a dictionary with substitute (lemmatized) as key and its corresponding CEFR rank as value
# columns 3..20 are read as consecutive (word, cefr level, label) triples
cefr_dict = {}
for i in range(3, 21, 3):
    for word, cefr_level, label in zip(cefr_df[i], cefr_df[i + 1], cefr_df[i + 2]):
        # only consider the substitute if the label is 1; cells that hold no usable
        # word (NaN padding, blanks, non-strings) are skipped instead of crashing spaCy
        if label == 1 and isinstance(word, str) and word.strip():
            # later occurrences of the same lemma overwrite earlier ones
            cefr_dict[_first_token_lemma(word)] = cefr_level_mapping.get(cefr_level, UNKNOWN_CEFR_RANK)

# create a new list to store the sorted substitutes with their sentence and complex word
new_lists = []

# every row is: sentence, complex word, then all remaining columns as substitutes
for sentence, complex_word, *substitutes in predictions_df.values:
    # pair every substitute (original form) with the cefr rank of its lemma,
    # or with the "unknown" rank if the lemma is not in the cefr dictionary
    substitutes_cefr = [(sub, cefr_dict.get(_first_token_lemma(sub), UNKNOWN_CEFR_RANK)) for sub in substitutes]

    # sorted() is stable, so substitutes with the same rank keep their original order
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda pair: pair[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lemSpacy.tsv', sep='\t', index=False, header=False)
print("SS_bsRobertalarge_robertabase_SR_cefruchida_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lemSpacy.tsv'\n")


SS_bsRobertalarge_robertabase_SR_cefruchida_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lemSpacy.tsv'



In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lemSpacy.tsv --output_file ./output/test/SS_bsRobertalarge_robertabase_SR_cefruchida_lemSpacy.tsv

### for model SG_MA_SS_bsRobertalarge_electralarge:

### lemmatized:

In [6]:
import pandas as pd
import spacy

# Re-rank the substitutes predicted by SG_MA_SS_bsRobertalarge_electralarge using the CEFR
# levels found in ./cefr/uchida.tsv: each substitute is lemmatized with spaCy, looked up in
# a lemma -> rank dictionary, and the substitutes of every row are stably sorted by rank.

# Load the English spaCy model (only the lemma of the first token is used below)
nlp = spacy.load('en_core_web_sm')

# convert CEFR levels to numerical values for ranking; anything else gets UNKNOWN_CEFR_RANK
cefr_level_mapping = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
UNKNOWN_CEFR_RANK = 7

# text -> lemma cache: the same substitute shows up in many rows, and spaCy returns the
# same lemma for the same isolated string, so every distinct string is analysed only once
_lemma_cache = {}


def _first_token_lemma(text):
    """Return the spaCy lemma of the first token of `text`.

    Missing values (anything that is not a string) and blank strings yield '' instead of
    raising, so short prediction rows or empty cells cannot crash the ranking.
    """
    if not isinstance(text, str) or not text.strip():
        return ''
    if text not in _lemma_cache:
        doc = nlp(text)
        _lemma_cache[text] = doc[0].lemma_ if len(doc) else ''
    return _lemma_cache[text]


# read cefr file (first 21 columns only; default type inference is kept so that the
# numeric comparison `label == 1` below keeps working)
cefr_df = pd.read_csv('./cefr/uchida.tsv', sep='\t', header=None, usecols=range(21))

# read prediction file; dtype=str + keep_default_na=False keep literal words such as "null"
# or "nan" as strings, and empty cells become '' instead of NaN
predictions_df = pd.read_csv('./predictions/test/SG_MA_SS_bsRobertalarge_electralarge.tsv', sep='\t', header=None,
                             dtype=str, keep_default_na=False)

# create a dictionary with substitute (lemmatized) as key and its corresponding CEFR rank as value
# columns 3..20 are read as consecutive (word, cefr level, label) triples
cefr_dict = {}
for i in range(3, 21, 3):
    for word, cefr_level, label in zip(cefr_df[i], cefr_df[i + 1], cefr_df[i + 2]):
        # only consider the substitute if the label is 1; cells that hold no usable
        # word (NaN padding, blanks, non-strings) are skipped instead of crashing spaCy
        if label == 1 and isinstance(word, str) and word.strip():
            # later occurrences of the same lemma overwrite earlier ones
            cefr_dict[_first_token_lemma(word)] = cefr_level_mapping.get(cefr_level, UNKNOWN_CEFR_RANK)

# create a new list to store the sorted substitutes with their sentence and complex word
new_lists = []

# every row is: sentence, complex word, then all remaining columns as substitutes
for sentence, complex_word, *substitutes in predictions_df.values:
    # pair every substitute (original form) with the cefr rank of its lemma,
    # or with the "unknown" rank if the lemma is not in the cefr dictionary
    substitutes_cefr = [(sub, cefr_dict.get(_first_token_lemma(sub), UNKNOWN_CEFR_RANK)) for sub in substitutes]

    # sorted() is stable, so substitutes with the same rank keep their original order
    ranked_cefr_subs = sorted(substitutes_cefr, key=lambda pair: pair[1])

    # append the sorted list of substitutes to the new lists, keeping original form
    new_lists.append([sentence, complex_word] + [sub for sub, _ in ranked_cefr_subs])

# create a new dataframe from the new lists and write it to a new tsv file
new_df = pd.DataFrame(new_lists)
new_df.to_csv('./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lemSpacy.tsv', sep='\t', index=False, header=False)
# status message (the stray '}' that used to follow the path has been removed)
print("SS_bsRobertalarge_electralarge_SR_cefruchida_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lemSpacy.tsv'\n")


SS_bsRobertalarge_electralarge_SR_cefruchida_lemSpacy exported to csv in path './predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lemSpacy.tsv'}



python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lemSpacy.tsv --output_file ./output/test/SS_bsRobertalarge_electralarge_SR_cefruchida_lemSpacy.tsv