### Datafiles preprocessing

In [80]:
import pandas as pd
import re
import csv
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

### Remove unwanted chars from the trial and test sentences to prevent the code from crashing
and store the results in the files:
- tsar2022_en_trial_none_no_noise.tsv 
- tsar2022_en_trial_gold_no_noise.tsv 
- tsar2022_en_test_none_no_noise.tsv
- tsar2022_en_test_gold_no_noise.tsv

These files will be used in all further processes.

##### trial dataset (for none and gold):

In [1]:
# input: raw trial sentences (no gold labels); output: the cleaned copy
filename = "./data/trial/tsar2022_en_trial_none.tsv"
new_filename = "./data/trial/tsar2022_en_trial_none_no_noise.tsv"

# noise markers look like `#12-3`, optionally followed by ` "`
pattern = r'#\d+-\d+(?: ")?'

# the file has no header row, so the two columns are named here
data = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=["sentence", "complex_word"],
)

# strip every noise marker from the sentence column
data['sentence'] = data['sentence'].str.replace(
    pattern,
    '',
    regex=True,
)

# store the cleaned frame as tsv, without header and without index
data.to_csv(new_filename, sep='\t', header=False, index=False)

In [2]:
# input: trial sentences with gold substitutions; output: the cleaned copy
filename = "./data/trial/tsar2022_en_trial_gold.tsv"
new_filename = "./data/trial/tsar2022_en_trial_gold_no_noise.tsv"

# sentence and complex word, followed by 26 generically named extra columns
col_names = ["sentence", "complex_word", *(f"extra_col{i}" for i in range(1, 27))]

# noise markers look like `#12-3`, optionally followed by ` "`
pattern = r'#\d+-\d+(?: ")?'

# the file has no header row, so the column names are supplied explicitly
data = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=col_names,
)

# strip every noise marker from the sentence column
data['sentence'] = data['sentence'].str.replace(
    pattern,
    '',
    regex=True,
)

# store the cleaned frame as tsv, without header and without index
data.to_csv(new_filename, sep='\t', header=False, index=False)

##### test dataset (for none and gold):

In [3]:
# input: raw test sentences (no gold labels); output: the cleaned copy
filename = "./data/test/tsar2022_en_test_none.tsv"
new_filename = "./data/test/tsar2022_en_test_none_no_noise.tsv"

# noise markers look like `#12-3`, optionally followed by ` "`
pattern = r'#\d+-\d+(?: ")?'

# the file has no header row, so the two columns are named here
data = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=["sentence", "complex_word"],
)

# strip every noise marker from the sentence column
data['sentence'] = data['sentence'].str.replace(
    pattern,
    '',
    regex=True,
)

# store the cleaned frame as tsv, without header and without index
data.to_csv(new_filename, sep='\t', header=False, index=False)

In [4]:
# input: test sentences with gold substitutions; output: the cleaned copy
filename = "./data/test/tsar2022_en_test_gold.tsv"
new_filename = "./data/test/tsar2022_en_test_gold_no_noise.tsv"

# sentence and complex word, followed by 26 generically named extra columns
col_names = ["sentence", "complex_word", *(f"extra_col{i}" for i in range(1, 27))]

# noise markers look like `#12-3`, optionally followed by ` "`
pattern = r'#\d+-\d+(?: ")?'

# the file has no header row, so the column names are supplied explicitly
data = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=col_names,
)

# strip every noise marker from the sentence column
data['sentence'] = data['sentence'].str.replace(
    pattern,
    '',
    regex=True,
)

# store the cleaned frame as tsv, without header and without index
data.to_csv(new_filename, sep='\t', header=False, index=False)

### EFLLEX file

#### remove unwanted headers and columns from EFLLEX file:

In [1]:
# read the original EFLLex table: the header line is skipped and the
# trailing 106 columns of every row are cut off
# NOTE(review): an earlier comment spoke of 105 columns; the slice below
# drops 106 - confirm against the column layout of the source file
with open('./cefr_efllex/EFLLex_NLP4J_ORIG.tsv', 'r', encoding='utf-8') as tsv_file:
    tsv_reader = csv.reader(tsv_file, delimiter='\t')
    next(tsv_reader, None)
    rows = []
    for row in tsv_reader:
        rows.append(row[:-106])


# write the shortened rows to a new tab-separated file
with open('./cefr_efllex/EFLLex_trimmed.tsv', 'w', newline='', encoding='utf-8') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    for trimmed_row in rows:
        tsv_writer.writerow(trimmed_row)

#### Option 1: assign most frequent cefr level to word in EFLLEX dataset:

In [54]:

# define column names
column_names = ['word', 'pos tag', 'A1', 'A2', 'B1', 'B2', 'C1']

# load tsv file (it has no header row, so the names above are applied)
df = pd.read_csv('./cefr_efllex/EFLLex_trimmed.tsv', sep='\t', header=None, names=column_names)

# define CEFR levels
cefr_levels = ['A1', 'A2', 'B1', 'B2', 'C1']

# for every row, take the label of the CEFR column that holds the highest
# value; the column must carry the same name that is selected below,
# otherwise the selection raises a KeyError
df['mostfreq CEFR'] = df[cefr_levels].idxmax(axis=1)

# create a new df with the needed columns
df_new = df[['word', 'pos tag', 'mostfreq CEFR']]


# write the new df to a new tsv (no header, no index)
df_new.to_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header=False, index=False)


#### Option 2: take the weighted average to assign a CEFR level to each word in the EFLLex dataset:

In [55]:

# names for the seven columns of the trimmed EFLLex table
column_names = ['word', 'pos tag', 'A1', 'A2', 'B1', 'B2', 'C1']

# the CEFR level columns and the numerical value assigned to each level
cefr_levels = ['A1', 'A2', 'B1', 'B2', 'C1']
mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# the trimmed file has no header row
df = pd.read_csv(
    './cefr_efllex/EFLLex_trimmed.tsv',
    sep='\t',
    header=None,
    names=column_names,
)

# multiply each level's frequency column by the numerical value of that level
df_weighted = pd.DataFrame(
    {level: df[level] * mapping[level] for level in cefr_levels}
)

# per word: total of the weighted frequencies and total of the raw frequencies
df['Weighted Sum'] = df_weighted.sum(axis=1)
df['Frequency Sum'] = df[cefr_levels].sum(axis=1)

# the weighted average level is the ratio of the two totals
df['Weighted CEFR'] = df['Weighted Sum'].div(df['Frequency Sum'])

# keep word, PoS tag and the weighted level only
df_new = df.loc[:, ['word', 'pos tag', 'Weighted CEFR']]

# store the result as tsv, without header and without index
df_new.to_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=False, index=False)


### CEFRJ file

#### Concatenate cefrj files

In [None]:
# load both comma-separated cefrj files; the first line of each file (its
# header) is skipped and the columns get default integer labels
df1, df2 = (
    pd.read_csv(path, skiprows=1, header=None)
    for path in ('./cefrj/cefrj_1.csv', './cefrj/cefrj_2.csv')
)

In [None]:
# keep only the three leading columns of each frame
df1, df2 = (frame.iloc[:, :3] for frame in (df1, df2))

# stack the second frame below the first one
df = pd.concat([df1, df2])

# store the combined frame as tsv, without index and without header
df.to_csv(
    './cefrj/cefrj_all.tsv',
    sep='\t',
    index=False,
    header=False,
)

#### Map existing pos tags to Treebank tags

In [None]:

# map existing PoS tags to Treebank tags
def map_to_treebank_tag(existing_tag):
    """Translate a spelled-out PoS label into its Treebank tag.

    Args:
        existing_tag: the PoS label to translate; 'adjective', 'verb',
            'noun' and 'adverb' are recognised.

    Returns:
        'JJ', 'VB', 'NN' or 'RB' for a recognised label, otherwise None.
    """
    # every branch tests the function's own parameter; the lookup table
    # replaces an if/elif chain that referred to an undefined name
    treebank_tags = {
        'adjective': 'JJ',  # Adjective
        'verb': 'VB',  # Verb
        'noun': 'NN',  # Noun
        'adverb': 'RB',  # Adverb
    }
    return treebank_tags.get(existing_tag)

# convert the PoS labels of the combined cefrj file to Treebank tags and keep
# only the rows whose label could be mapped.
# Both files are opened with newline='' (as the csv module requires, so that
# no extra blank lines are written on some platforms) and with an explicit
# utf-8 encoding instead of the platform default.
with open('./cefrj/cefrj_all.tsv', 'r', newline='', encoding='utf-8') as infile, \
        open('./cefrj/cefrj_all_treebank.tsv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile, delimiter='\t')
    writer = csv.writer(outfile, delimiter='\t')

    # loop over rows in the existing cefrj file
    for row in reader:
        # the PoS label is deliberately NOT stored in a variable called
        # `pos_tag`: that name is imported from nltk at the top of the file
        # and rebinding it here would break the later `pos_tag(words)` call
        word, cefrj_pos, cefr_level = row[0], row[1], row[2]

        # map the PoS label to the treebank PoS tags
        treebank_pos = map_to_treebank_tag(cefrj_pos)

        # if the tag is one of the four Treebank tags, write to the new file
        if treebank_pos is not None:
            writer.writerow([word, treebank_pos, cefr_level])

#### Map cefr levels to numerical values

In [61]:

# numerical value for every CEFR level, counting upwards from 1
cefr_level_mapping = {
    level: value
    for value, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'), start=1)
}

# the treebank-tagged cefrj file has no header row
df = pd.read_csv(
    './cefrj/cefrj_all_treebank.tsv',
    sep='\t',
    header=None,
)

# replace the level labels in the third column by their numerical values
numeric_levels = df[2].map(cefr_level_mapping)
df[2] = numeric_levels

# store the result as tsv, without header and without index
df.to_csv(
    './cefr_all/cefrj_num.tsv',
    sep='\t',
    header=False,
    index=False,
)


### Uchida file (CEFR_LS)

#### Remove redundant columns from the uchida file

In [77]:

# read only the first 24 columns of the header-less uchida file
df = pd.read_csv(
    './cefr_ls/uchida.tsv',
    sep='\t',
    header=None,
    usecols=range(24),
)

# every third column, from position 5 up to position 23, is redundant
remove_cols = list(range(5, 24, 3))
df = df.drop(columns=df.columns[remove_cols])

# store the remaining columns as tsv, without header and without index
df.to_csv(
    './cefr_ls/uchida_trimmed.tsv',
    sep='\t',
    header=False,
    index=False,
)




#### Parse the sentences to get the pos tags of the words (for example, this is needed in case of same words with different CEFR levels based on their pos in the sentence)

In [78]:

# convert NLTK POS tags 
def convert_pos_tag(treebank_tag):
    """Reduce a PoS tag to one of four coarse tags based on its first letter.

    Tags starting with 'J', 'V', 'N' or 'R' become 'JJ', 'VB', 'NN' or 'RB'
    respectively; every other tag yields an empty string.
    """
    prefix_to_tag = (('J', 'JJ'), ('V', 'VB'), ('N', 'NN'), ('R', 'RB'))
    for prefix, coarse_tag in prefix_to_tag:
        if treebank_tag.startswith(prefix):
            return coarse_tag
    return ''

# Purpose of this cell: PoS-tag every sentence of the trimmed uchida file,
# record the coarse PoS tag of the target word, drop rows for which no tag
# was found, and write a table in which a copy of that tag follows every
# third column.

# read the tsv file (no header row, so the columns are labelled 0, 1, 2, ...)
df = pd.read_csv('./cefr_ls/uchida_trimmed.tsv', sep='\t', header=None)

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# define a new column (label 17) for the POS tag; '' means "no tag found"
df[17] = ''

for index, row in df.iterrows():
    # column 0 is used as the sentence, column 1 as the target word
    sentence = row[0]
    target_word = row[1]

    # tokenize the sentence
    words = word_tokenize(sentence)

    # get PoS tags for the words
    tagged_words = pos_tag(words)

    # lemmatize words and compare with the target word
    for word, tag in tagged_words:
        tag = convert_pos_tag(tag)  # reduce the tag to JJ/VB/NN/RB, or '' for anything else
        # every token is lemmatized as a noun ('n'), whatever tag it received
        # NOTE(review): inflected verbs/adjectives are therefore not reduced to
        # their base form, and the comparison below is case-sensitive - confirm
        # that this is intended
        lemma = lemmatizer.lemmatize(word, 'n')
        if lemma == target_word and tag:
            # there is no break: if several tokens match, the tag of the last
            # matching token is the one that is kept
            df.at[index, 17] = tag

# filter the df to only include rows where the new POS column is not empty
df = df[df[17] != '']

# remove the first column (label 0, the sentence)
df = df.drop(columns=[0])

# insert the values from the last column to a new column after the first one
df.insert(1, 'POS', df[17])

# insert a copy of the POS column at each of the listed positions; the
# positions refer to the layout after all earlier inserts, so the order of
# these statements matters
insert_positions = [4, 7, 10, 13, 16, 19, 22]
for i, pos in enumerate(insert_positions):
    df.insert(pos, f'POS_copy_{i+1}', df['POS'])

# remove the last column (label 17), whose values now live in the POS columns
df = df.drop(columns=[17])

# write the df to a new tsv (no header, no index)
df.to_csv('./cefr_ls/uchida_trimmed_parsed.tsv', sep='\t', header=False, index=False)

#### Standardize format into three columns: word, pos, cefr level, and remove duplicates

In [79]:

# read the tsv file (no header row, so the columns are labelled 0, 1, 2, ...)
df = pd.read_csv('./cefr_ls/uchida_trimmed_parsed.tsv', sep='\t', header=None)

# cut the wide table into consecutive groups of three columns and give each
# group the same column labels 0, 1, 2 (word, pos, cefr level)
threecol_frames = []
for i in range(0, df.shape[1], 3):
    threecols_df = df.iloc[:, i:i+3]
    threecols_df.columns = range(3)
    threecol_frames.append(threecols_df)

# stack all groups below each other in one step and renumber the rows
final_df = pd.concat(threecol_frames, ignore_index=True)

# remove rows in which the word or the CEFR level is missing (a word without
# a CEFR level or the other way around is useless). A threshold of two filled
# cells cannot express this, because the pos column counts as a filled cell;
# the check is therefore restricted to the word and level columns.
final_df = final_df.dropna(subset=[0, 2])

# remove duplicates based on the first and second column (the first
# occurrence of every word/pos pair is kept)
final_df = final_df.drop_duplicates(subset=[0, 1])

# write the final df to a new tsv (no header, no index)
final_df.to_csv('./cefr_ls/uchida_pos.tsv', sep='\t', header=False, index=False)


#### Map cefr levels to numerical values

In [62]:

# numerical value for every CEFR level, counting upwards from 1
cefr_level_mapping = {
    level: value
    for value, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'), start=1)
}

# the standardized uchida file has no header row
df = pd.read_csv(
    './cefr_ls/uchida_pos.tsv',
    sep='\t',
    header=None,
)

# replace the level labels in the third column by their numerical values
numeric_levels = df[2].map(cefr_level_mapping)
df[2] = numeric_levels

# store the result as tsv, without header and without index
df.to_csv(
    './cefr_all/uchida_num.tsv',
    sep='\t',
    header=False,
    index=False,
)


### Concatenate all CEFR datasets to get a higher coverage

In [66]:

# load the three header-less word / pos / level tables
df1 = pd.read_csv('./cefr_all/uchida_num.tsv', sep='\t', header=None)
df2 = pd.read_csv('./cefr_all/cefrj_num.tsv', sep='\t', header=None)
# the weighted EFLLex table is read from ./cefr_efllex/, the directory the
# weighted-average step writes it to
df3 = pd.read_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header=None)

# concatenate the df's and renumber the rows
result_df = pd.concat([df1, df2, df3], ignore_index=True)

# save the concatenated df to a new tsv (no index, no header)
result_df.to_csv('./cefr_all/cefr_all.tsv', sep='\t', index=False, header=False)




#### Remove 'full' duplicate rows

In [67]:

# the combined file has no header row
df = pd.read_csv(
    './cefr_all/cefr_all.tsv',
    sep='\t',
    header=None,
)

# keep the first occurrence of every row and discard its exact repeats
is_repeat = df.duplicated()
df = df[~is_repeat]

# store the result as tsv, without index and without header
df.to_csv(
    './cefr_all/cefr_all_no_duplicates.tsv',
    sep='\t',
    index=False,
    header=False,
)


#### Group the data by the 'word' and 'pos' columns, and then calculate the average of the cefr_level_mapping column for each group

In [74]:

# the de-duplicated file has no header row
df = pd.read_csv(
    './cefr_all/cefr_all_no_duplicates.tsv',
    sep='\t',
    header=None,
)

# collect the values of the third column per combination of the first two
# columns (word and pos), then average them within every group
levels_per_word_pos = df.groupby([0, 1])[2]
mean_levels = levels_per_word_pos.mean()

# turn the group keys back into ordinary columns
df_grouped = mean_levels.reset_index()

# store the result as tsv, without index and without header
df_grouped.to_csv(
    './cefr_all/cefr_all_combined.tsv',
    sep='\t',
    index=False,
    header=False,
)
