### Datafiles preprocessing

#### Removing unwanted chars from the trial and test sentences to prevent the code from crashing
and storing the results in the files:
- tsar2022_en_trial_none_no_noise.tsv 
- tsar2022_en_trial_gold_no_noise.tsv 
- tsar2022_en_test_none_no_noise.tsv
- tsar2022_en_test_gold_no_noise.tsv

##### trial dataset (for none and gold):

In [1]:
import pandas as pd
import re

# input (trial sentences, "none" variant) and the cleaned copy to produce
filename = "./data/trial/tsar2022_en_trial_none.tsv"
new_filename = "./data/trial/tsar2022_en_trial_none_no_noise.tsv"

# noise marker: '#<digits>-<digits>', optionally followed by a space and a double quote
pattern = r'#\d+-\d+(?: ")?'

# header=None: the first line is treated as data and the two columns are named here;
# every marker occurrence is stripped from the sentence column right after loading
data = pd.read_csv(
    filename, sep='\t', header=None, names=["sentence", "complex_word"]
).assign(sentence=lambda frame: frame['sentence'].str.replace(pattern, '', regex=True))

# write the cleaned table without header row or index column
data.to_csv(new_filename, sep='\t', header=False, index=False)

In [2]:
import pandas as pd
import re

# input (trial sentences, "gold" variant) and the cleaned copy to produce
filename = "./data/trial/tsar2022_en_trial_gold.tsv"
new_filename = "./data/trial/tsar2022_en_trial_gold_no_noise.tsv"

# column labels: sentence, complex word, then extra_col1 .. extra_col26.
# Adjust this list depending on the actual number of columns in your file
col_names = ["sentence", "complex_word", *(f"extra_col{i}" for i in range(1, 27))]

# noise marker: '#<digits>-<digits>', optionally followed by a space and a double quote
pattern = r'#\d+-\d+(?: ")?'

# header=None: the first line is treated as data; the markers are stripped
# from the sentence column right after loading
data = pd.read_csv(
    filename, sep='\t', header=None, names=col_names
).assign(sentence=lambda frame: frame['sentence'].str.replace(pattern, '', regex=True))

# write the cleaned table without header row or index column
data.to_csv(new_filename, sep='\t', header=False, index=False)

##### test dataset (for none and gold):

In [3]:
import pandas as pd
import re

# input (test sentences, "none" variant) and the cleaned copy to produce
filename = "./data/test/tsar2022_en_test_none.tsv"
new_filename = "./data/test/tsar2022_en_test_none_no_noise.tsv"

# noise marker: '#<digits>-<digits>', optionally followed by a space and a double quote
pattern = r'#\d+-\d+(?: ")?'

# header=None: the first line is treated as data and the two columns are named here;
# every marker occurrence is stripped from the sentence column right after loading
data = pd.read_csv(
    filename, sep='\t', header=None, names=["sentence", "complex_word"]
).assign(sentence=lambda frame: frame['sentence'].str.replace(pattern, '', regex=True))

# write the cleaned table without header row or index column
data.to_csv(new_filename, sep='\t', header=False, index=False)

In [4]:
import pandas as pd
import re

# input (test sentences, "gold" variant) and the cleaned copy to produce
filename = "./data/test/tsar2022_en_test_gold.tsv"
new_filename = "./data/test/tsar2022_en_test_gold_no_noise.tsv"

# column labels: sentence, complex word, then extra_col1 .. extra_col26.
# Adjust this list depending on the actual number of columns in your file
col_names = ["sentence", "complex_word", *(f"extra_col{i}" for i in range(1, 27))]

# noise marker: '#<digits>-<digits>', optionally followed by a space and a double quote
pattern = r'#\d+-\d+(?: ")?'

# header=None: the first line is treated as data; the markers are stripped
# from the sentence column right after loading
data = pd.read_csv(
    filename, sep='\t', header=None, names=col_names
).assign(sentence=lambda frame: frame['sentence'].str.replace(pattern, '', regex=True))

# write the cleaned table without header row or index column
data.to_csv(new_filename, sep='\t', header=False, index=False)

### EFLLEX file

#### remove unwanted headers and columns from EFLLEX file:

In [1]:
import csv

# Read the original EFLLex table, skip its header line and keep every column
# except the trailing 106. The slice below is authoritative: the earlier note
# said 105, but 106 are dropped, and the later CEFR-level cells read the
# trimmed file with 7 column names.
# newline='' is what the csv module asks for on any file handed to
# csv.reader / csv.writer.
with open('./cefr_efllex/EFLLex_NLP4J_ORIG.tsv', 'r', newline='', encoding='utf-8') as tsv_file:
    tsv_reader = csv.reader(tsv_file, delimiter='\t')
    next(tsv_reader, None)  # skip the headers
    rows = [row[:-106] for row in tsv_reader]  # remove the last 106 columns


# Write the trimmed rows back out as a tab-separated file.
with open('./cefr_efllex/EFLLex_trimmed.tsv', 'w', newline='', encoding='utf-8') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    tsv_writer.writerows(rows)

#### Option 1: assign the most frequent CEFR level to each word in the EFLLex dataset:

In [54]:
import pandas as pd
import numpy as np

# the five CEFR frequency columns, lowest level first
cefr_levels = ['A1', 'A2', 'B1', 'B2', 'C1']

# labels for the trimmed file: word, POS tag, then one column per level
column_names = ['word', 'pos tag'] + cefr_levels

# the trimmed file carries no header line, so the labels are supplied here
df = pd.read_csv('./cefr_efllex/EFLLex_trimmed.tsv', sep='\t', header=None, names=column_names)

# for every row, store the label of the level column holding the largest value
# (idxmax reports the first such column when several are tied)
df['Highest CEFR'] = df.loc[:, cefr_levels].idxmax(axis=1)

# keep only word, POS tag and the chosen level
df_new = df.loc[:, ['word', 'pos tag', 'Highest CEFR']]

# store the result as a headerless tsv file
df_new.to_csv('./cefr_efllex/EFLLex_mostfreq.tsv', sep='\t', header = False, index=False)


#### Option 2: assign each word in the EFLLex dataset a CEFR level computed as a frequency-weighted average:

In [55]:
import pandas as pd

# numeric value given to each CEFR level (A1 = 1 ... C1 = 5)
mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5}

# the level columns, in the order they appear in the file
cefr_levels = ['A1', 'A2', 'B1', 'B2', 'C1']

# labels for the headerless trimmed file
column_names = ['word', 'pos tag', 'A1', 'A2', 'B1', 'B2', 'C1']

df = pd.read_csv('./cefr_efllex/EFLLex_trimmed.tsv', sep='\t', header=None, names=column_names)

# each level's frequency column scaled by that level's numeric value
df_weighted = pd.DataFrame({level: df[level] * mapping[level] for level in cefr_levels})

# per-word totals: the scaled frequencies and the plain frequencies
df['Weighted Sum'] = df_weighted.sum(axis=1)
df['Frequency Sum'] = df[cefr_levels].sum(axis=1)

# weighted average level = scaled total divided by plain total
df['Weighted CEFR'] = df['Weighted Sum'].div(df['Frequency Sum'])

# keep only word, POS tag and the averaged level
df_new = df.loc[:, ['word', 'pos tag', 'Weighted CEFR']]

# store the result as a headerless tsv file
df_new.to_csv('./cefr_efllex/EFLLex_weighted.tsv', sep='\t', header = False, index=False)


### CEFRJ file

#### concatenate cefrj files

In [None]:
import pandas as pd

In [None]:
# load both CEFR-J csv parts; skiprows=1 drops each file's first line and
# header=None leaves the remaining columns with integer labels
df1, df2 = (
    pd.read_csv(part, skiprows=1, header=None)
    for part in ('./cefrj/cefrj_1.csv', './cefrj/cefrj_2.csv')
)

In [None]:
# keep only the three leading columns of each part
df1, df2 = df1.iloc[:, :3], df2.iloc[:, :3]

# stack the second part underneath the first
df = pd.concat([df1, df2], axis=0)

# store the combined rows as a tab-separated file without header or index
df.to_csv('./cefrj/cefrj_all.tsv', sep='\t', index=False, header=False)

#### map existing pos tags to Treebank tags

In [None]:
import csv

# map simple POS tags to Treebank tags
def map_to_treebank_tag(simple_tag):
    """Translate a plain POS name into its base Treebank tag.

    Only 'adjective', 'verb', 'noun' and 'adverb' are recognised (exact,
    case-sensitive match); any other value yields None.
    """
    treebank_by_name = {
        'adjective': 'JJ',
        'verb': 'VB',
        'noun': 'NN',
        'adverb': 'RB',
    }
    return treebank_by_name.get(simple_tag)

# open input file
# Rewrite the combined CEFR-J list with Treebank POS tags, keeping only rows
# whose POS name is one of the four that map_to_treebank_tag recognises.
# Both files are opened explicitly as UTF-8 (the input was written by pandas
# to_csv, which defaults to UTF-8) and with newline='' as the csv module
# requires; without it the writer emits an extra '\r' per line on platforms
# that translate '\n' to '\r\n'.
with open('./cefrj/cefrj_all.tsv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile, delimiter='\t')

    with open('./cefrj/cefrj_all_treebank.tsv', 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')

        # each input row is expected to start with: word, POS name, CEFR level
        for row in reader:
            word = row[0]
            # named simple_pos rather than pos_tag so that the nltk pos_tag
            # function imported elsewhere in this notebook is not overwritten
            simple_pos = row[1]
            cefr_level = row[2]

            # map the POS name to the Treebank format
            treebank_pos = map_to_treebank_tag(simple_pos)

            # rows with any other POS name are dropped from the output
            if treebank_pos is not None:
                writer.writerow([word, treebank_pos, cefr_level])

### Uchida file

#### remove the redundant columns from the uchida file

In [49]:
import pandas as pd

# load the dataframe
# positions of the columns to discard: every third column from 5 through 23
remove_cols = [5, 8, 11, 14, 17, 20, 23]

# only the first 24 columns of the headerless file are loaded
df = pd.read_csv('./cefr/uchida.tsv', sep='\t', header=None, usecols=range(24))

# translate the positions into column labels and drop them in place
df.drop(columns=df.columns[remove_cols], inplace=True)

# store the remaining columns as a headerless tsv file
df.to_csv('./cefr/uchida_no_removed.tsv', sep='\t', header=False, index=False)




#### parse the sentences to get the pos tags of the words (for example, this is needed in case of same words with different CEFR levels based on their pos in the sentence)

In [50]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

# convert NLTK POS tags 
def convert_pos_tag(treebank_tag):
    """Collapse a fine-grained tag to one of four base tags by its first letter.

    Tags starting with J, V, N or R become 'JJ', 'VB', 'NN' or 'RB'
    respectively (so any R-initial tag, e.g. 'RP', also becomes 'RB');
    every other tag yields the empty string.
    """
    prefix_to_base = (('J', 'JJ'), ('V', 'VB'), ('N', 'NN'), ('R', 'RB'))
    for prefix, base_tag in prefix_to_base:
        if treebank_tag.startswith(prefix):
            return base_tag
    return ''

# read the tsv file (no header line, so the columns get integer labels)
df = pd.read_csv('./cefr/uchida_no_removed.tsv', sep='\t', header=None)

# initialize a lemmatizer
lemmatizer = WordNetLemmatizer()

# define a new column for the POS tag
# NOTE(review): label 17 is only a fresh column if the file has exactly 17
# columns (0-16), which is what the column-removal step writes (24 read, 7 dropped)
df[17] = ''

# column 0 is used as the sentence and column 1 as the target word
for index, row in df.iterrows():
    sentence = row[0]
    target_word = row[1]

    # tokenize the sentence
    words = word_tokenize(sentence)

    # get POS tags for the words
    tagged_words = pos_tag(words)

    # lemmatize words and compare with the target word
    # (exact string comparison; there is no break, so when several tokens
    # match, the tag of the last matching token is the one that is kept)
    for word, tag in tagged_words:
        tag = convert_pos_tag(tag)  # convert the POS tag
        lemma = lemmatizer.lemmatize(word, 'n')  # every token is lemmatized as a noun ('n'), whatever tag it received
        if lemma == target_word and tag:
            df.at[index, 17] = tag

# filter the DataFrame to only include rows where the new POS column is not empty
# (rows where no token matched the target word with one of the four base tags are dropped)
df = df[df[17] != '']

# remove the first column (the sentence)
df = df.drop(columns=[0])

# insert the values from the last column to a new column after the first one
df.insert(1, 'POS', df[17])

# iterate through the positions and insert POS columns
# (positions ascend, so earlier inserts are not shifted: afterwards positions
# 1, 4, 7, ..., 22 all hold the POS tag, i.e. the middle of each group of three)
insert_positions = [4, 7, 10, 13, 16, 19, 22]
for i, pos in enumerate(insert_positions):
    df.insert(pos, f'POS_copy_{i+1}', df['POS'])

# remove the last column (the temporary tag column 17; its values now live in 'POS')
df = df.drop(columns=[17])

# write the df to a new file
df.to_csv('./cefr/uchida_no_removed_parsed.tsv', sep='\t', header=False, index=False)

#### standardize format into three columns: word, pos, cefr level, and remove duplicates

In [51]:
import pandas as pd
import numpy as np

# read the parsed file; it has no header line, so columns get integer labels
df = pd.read_csv('./cefr/uchida_no_removed_parsed.tsv', sep='\t', header=None)

# cut the wide table into consecutive groups of three columns
# (word, pos, cefr level) and relabel each group 0, 1, 2 so they can be stacked
triples = []
for i in range(0, df.shape[1], 3):
    threecols_df = df.iloc[:, i:i+3].copy()
    threecols_df.columns = range(3)
    triples.append(threecols_df)

# stack all groups in one concat call with a fresh 0..n-1 index
# (instead of growing a frame by repeatedly concatenating onto an empty one)
final_df = pd.concat(triples, ignore_index=True)

# keep only complete rows. The pos cell is filled in every group by the
# parsing step, so the previous thresh=2 only removed rows where BOTH the word
# and the CEFR level were missing; a word without a level (or a level without
# a word) still passed although such rows are useless. Requiring all three
# cells removes them.
final_df = final_df.dropna(how='any')

# remove duplicates based on the first and second column (word and pos);
# the first occurrence is the one that is kept
final_df = final_df.drop_duplicates(subset=[0, 1])

# write the final DataFrame to a new TSV file
final_df.to_csv('./cefr/uchida_pos.tsv', sep='\t', header=False, index=False)


### EVP file

In [None]:
#convert to tsv

import csv

# Convert the EVP csv export to a tab-separated file: the header line is
# skipped and, for every row, the second column and the last three columns
# are left out.
# newline='' on the input follows the csv module's guidance, so line endings
# inside quoted fields reach the reader untouched.
with open('./cefr_evp/evp_american.csv', 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # remove the headers
    rows = [row[:1] + row[2:-3] for row in csv_reader]  # remove the second column and last three columns


# Write the remaining columns out with tabs as the delimiter.
with open('./cefr_evp/evp_american.tsv', 'w', newline='', encoding='utf-8') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    tsv_writer.writerows(rows)
    
   

#### map evp pos tags to Treebank tags

In [10]:
import csv

# map simple POS tags to Treebank tags
def map_to_treebank_tag(simple_tag):
    """Translate a plain POS name into its base Treebank tag.

    Only 'adjective', 'verb', 'noun' and 'adverb' are recognised (exact,
    case-sensitive match); any other value yields None.
    """
    treebank_by_name = {
        'adjective': 'JJ',
        'verb': 'VB',
        'noun': 'NN',
        'adverb': 'RB',
    }
    return treebank_by_name.get(simple_tag)

# open input file
# Rewrite the EVP list with Treebank POS tags, keeping only rows whose POS
# name is one of the four that map_to_treebank_tag recognises.
# Both files are opened explicitly as UTF-8 (the conversion cell writes
# evp_american.tsv as UTF-8) and with newline='' as the csv module requires;
# without it the writer emits an extra '\r' per line on platforms that
# translate '\n' to '\r\n'.
with open('./cefr_evp/evp_american.tsv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile, delimiter='\t')

    with open('./cefr_evp/evp_american_treebank.tsv', 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')

        # each input row is expected to start with: word, POS name, CEFR level
        for row in reader:
            word = row[0]
            # named simple_pos rather than pos_tag so that the nltk pos_tag
            # function imported elsewhere in this notebook is not overwritten
            simple_pos = row[1]
            cefr_level = row[2]

            # map the POS name to the Treebank format
            treebank_pos = map_to_treebank_tag(simple_pos)

            # rows with any other POS name are dropped from the output
            if treebank_pos is not None:
                writer.writerow([word, treebank_pos, cefr_level])