#### Import necessary libraries 
##### and also the cefr-j wordlist (list of tuples) and lists of special tokens from .py files

In [2]:
import re
import pandas as pd
from spacy.attrs import ORTH
import spacy
from cj_tuples import cj_wordlist
from special_tokens import compounds, hyphens, plurals

##### Load the spaCy model

In [3]:
# Load the spaCy pipeline package "en_core_web_sm". The tokenizer special cases,
# the attribute-ruler patterns and the text processing below all go through
# this single `nlp` object.
nlp = spacy.load("en_core_web_sm")

#### Add special cases to the tokenizer
##### to tokenize compounds, hyphenated words and plural countable compound nouns (from the compounds and hyphenated words)
##### words with periods and most of the apostrophe words (except driver's license) were tokenized by spaCy anyway

In [4]:
# Every phrase below is registered as a tokenizer special case whose single
# ORTH entry is the phrase itself, so the tokenizer emits it as one token.
# Registration order is: "driver's license", compounds, hyphenated words,
# then plural compound nouns.
single_token_phrases = ["driver's license", *compounds, *hyphens, *plurals]

for phrase in single_token_phrases:
    nlp.tokenizer.add_special_case(phrase, [{ORTH: phrase}])

#### Specify the POS tag for compound words
##### without this several of the compound words were given the wrong tag (e.g. PROPN)
##### so I specified that all occurrences of a special token should have the POS tag specified in the CEFR-j list
##### there are 7 tokens that have 2 POS tags: 'all right', 'upside down', 'full-time', 'grown-up', 'half-price', 'part-time', 'second-hand'
##### sometimes they will be given the wrong tag, but it won't affect the lexical coverage, as both POS entries for each of these tokens are at the same CEFR level
##### (e.g. ('all right', 'ADJ', 'a1') and ('all right', 'ADV', 'a1'))


In [5]:
ruler = nlp.get_pipe("attribute_ruler")

# Force the POS of each compound / hyphenated token to the POS recorded for it
# in the CEFR-J list. Only POS is overridden, so the fine-grained tag stays as
# spaCy assigned it; that is acceptable here because only POS is used
# downstream, and it leaves a trace of which tokens had their POS amended.
# An entry listed with more than one POS gets one rule per listing.
# Compounds are handled first, then hyphenated words.
for term in [*compounds, *hyphens]:
    for entry in cj_wordlist:
        if entry[0] != term:
            continue
        ruler.add(patterns=[[{"LOWER": term}]], attrs={"POS": entry[1]})

# Plural compound nouns are always tagged as nouns.
for plural_form in plurals:
    ruler.add(patterns=[[{"LOWER": plural_form}]], attrs={"POS": "NOUN"})

# Any number-like token is tagged NUM.
ruler.add(patterns=[[{"LIKE_NUM": True}]], attrs={"POS": "NUM"})

#### Specify the channel for the transcripts you want to assess the lexical coverage of
##### it's better to do it in batches like this, because the processing time is long

In [6]:
# Channel label exactly as it appears in the 'channel' column of the input CSV.
channel_in_df = "WION (IN)"
# Short channel name used to build the output file names.
channel_for_output = "WION"

In [7]:
# Read every transcript, then keep only the rows of the selected channel.
# The filtered frame keeps the row labels of the full CSV (they are not reset).
df = pd.read_csv('world_news_720_df.csv')
is_selected_channel = df['channel'] == channel_in_df
df = df.loc[is_selected_channel]
# For a quick trial run, uncomment the next line to keep only the first rows:
# df = df[0:6]
texts = df['text']
texts

7566    let's begin with turkey where another economic...
7567    and here's the story for science buffs the hea...
7568    our next story is from iran which continues to...
7569    let's shift our attention over to south korea ...
7570    ramayana and mahabharata two of the greatest e...
                              ...                        
8281    u.s authorities arrested the wife of jailed me...
8282    and countries across the globe have been react...
8283    archaeologists have hailed a latest discovery ...
8284    and the u.s troops withdrawal from afghanistan...
8285    and in a bid to accelerate the vaccination rol...
Name: text, Length: 720, dtype: object

#### Apply the spaCy pipeline to the text (tokenizer, POS tagger, lemmatizer...)
##### also create a no punctuation version of the doc after applying the pipeline
##### this is to calculate the lexical coverage (to avoid counting punctuation as tokens)
##### and print the tokens with POS tags and lemmas

#### Assign the tokens to lists
##### all_level_dicts is for an open list of dictionaries
##### for each text, create a dictionary with an open list for each level (propn_num, A1, A2...)
##### then append each dictionary to all_level_dicts
##### proper nouns and numbers are added to the propn_num list first
##### Check if the token and pos match a tuple in the cefr-j list of tuples, and if they do, add it to the appropriate cefr level list
##### If none of the conditions are met, append the 'others' list
##### including 'break' was necessary to append others to the others list with the else statement
### Also added in extra lists for pos tags


In [65]:
# Index the CEFR-J list once so that each token is classified with a single
# dictionary lookup instead of a scan over the whole list.
# Key: (lemma, POS); value: the name of the result list for that CEFR level.
# setdefault keeps the FIRST matching entry in list order, which is the entry
# a front-to-back scan of cj_wordlist would have stopped on. Entries whose
# level is not a1/a2/b1/b2 are ignored.
level_columns = {'a1': 'A1', 'a2': 'A2', 'b1': 'B1', 'b2': 'B2'}
cefr_lookup = {}
for entry in cj_wordlist:
    column = level_columns.get(entry[2])
    if column is not None:
        cefr_lookup.setdefault((entry[0], entry[1]), column)

all_level_dicts = []

for i, text in enumerate(texts):
    # One result dictionary per transcript. 'Text' and 'doc_length' are
    # one-element lists; the other keys collect lemmas (and, for the CEFR
    # levels and 'others', the matching POS tags in parallel '*_pos' lists).
    level_dict = {'Text': [], 'doc_length': [], 'PROPN': [], 'NUM': [], 'A1': [], 'A1_pos': [], 'A2': [], 'A2_pos': [],
                  'B1': [], 'B1_pos': [], 'B2': [], 'B2_pos': [], 'others': [], 'others_pos': []}
    level_dict['Text'].append(i)
    doc = nlp(text)
    # Punctuation is excluded so it is not counted towards lexical coverage.
    no_punc_doc = [token for token in doc if not token.is_punct]
    level_dict['doc_length'].append(len(no_punc_doc))

    for token in no_punc_doc:
        pos = token.pos_
        lemma = token.lemma_

        # Proper nouns and numbers are set aside first and never looked up in
        # the word list (this also holds when the word list is empty).
        if pos in ('PROPN', 'NUM'):
            level_dict[pos].append(lemma)
            continue

        # Anything whose (lemma, POS) pair is not in the A1-B2 list is 'others'.
        column = cefr_lookup.get((lemma, pos), 'others')
        level_dict[column].append(lemma)
        level_dict[column + '_pos'].append(pos)

    all_level_dicts.append(level_dict)

#### Create a dataframe with the channel name, video_id, doc_length, lists of PROPN and NUM
#### Then lists of tuples (word, POS) for A1-B2 words and others

In [66]:
dict_df = pd.DataFrame(all_level_dicts)
dict_df.insert(1, 'channel', channel_in_df)
# Add the video_id column from the original filtered df (from this channel).
# The values are inserted positionally (as an array): the filtered df keeps the
# row labels of the full CSV while dict_df is labelled 0..n-1, so inserting the
# Series itself would align on labels and fill the column with NaN.
dict_df.insert(2, 'video_id', df['video_id'].to_numpy())
dict_df2 = dict_df.drop(columns=['Text'])

# For each level, pair every lemma with its POS tag: [(word, POS), ...].
level_names = ['A1', 'A2', 'B1', 'B2', 'others']
for level in level_names:
    dict_df2[level + '+pos'] = [
        list(zip(words, tags))
        for words, tags in zip(dict_df2[level], dict_df2[level + '_pos'])
    ]

# The separate lemma / POS columns are now redundant.
redundant_columns = [name for level in level_names for name in (level, level + '_pos')]
dict_df3 = dict_df2.drop(columns=redundant_columns)

dict_df3

dict_df3.to_csv('LC_Output/words/'+channel_for_output+'_cefr_words.csv', encoding='utf8', index=False)

#### Calculate the cumulative lexical coverage
##### convert the results to a pandas dataframe and display it
##### output the file to csv

In [67]:
# Each 'doc_length' entry is a one-element list; flatten to one count per text.
doc_lengths = [item['doc_length'] for item in all_level_dicts]
doc_length_flat = [item for sublist in doc_lengths for item in sublist]

# Output column -> key in each per-text dictionary, listed in the order in
# which the shares are accumulated below.
share_columns = {'propn%': 'PROPN', 'num%': 'NUM', 'A1%': 'A1', 'A2%': 'A2',
                 'B1%': 'B1', 'B2%': 'B2', 'others%': 'others'}

# Percentage of each text's (non-punctuation) tokens that fall in each category.
# A text with no tokens has no defined coverage, so it gets NaN instead of
# raising ZeroDivisionError.
data = {}
for column, key in share_columns.items():
    data[column] = [
        (len(level_dict[key]) / length * 100) if length else float('nan')
        for level_dict, length in zip(all_level_dicts, doc_length_flat)
    ]

percent_df = pd.DataFrame(data)

percent_df['channel'] = channel_in_df
# Add the video_id column from the original filtered df (from this channel).
# Assign the values positionally: the filtered df keeps the row labels of the
# full CSV while percent_df is labelled 0..n-1, so assigning the Series itself
# would align on labels and fill the column with NaN.
percent_df['video_id'] = df['video_id'].to_numpy()

# Cumulative coverage: each '<name>_cum' column is the running total of the
# shares up to and including that category (propn, num, A1, A2, B1, B2, others).
running_total = None
for column in share_columns:
    if running_total is None:
        running_total = percent_df[column]
    else:
        running_total = running_total + percent_df[column]
    percent_df[column + '_cum'] = running_total

# Keep only channel, video_id and the cumulative columns.
percent_df2 = percent_df.drop(columns=list(share_columns))

percent_df2

percent_df2.to_csv('LC_Output/'+channel_for_output+'_cefr_LC.csv', encoding='utf8', index=False)