#### Import necessary libraries 
##### and also the cefr-j wordlist (list of tuples) and lists of special tokens from .py files

In [14]:
import pandas as pd
from spacy.attrs import ORTH
import spacy
from cj_tuples import cj_wordlist
from special_tokens import compounds, hyphens, plurals

##### Load the spaCy model

In [4]:
# Load the "en_core_web_sm" spaCy pipeline; the cells below customise its
# tokenizer and its attribute ruler before any text is processed.
nlp = spacy.load("en_core_web_sm")

#### Add special cases to the tokenizer
##### to tokenize compounds, hyphenated words and plural countable compound nouns (from the compounds and hyphenated words)
##### words with periods and most of the apostrophe words (except driver's license) were tokenized by spaCy anyway

In [5]:
# Register every expression that must survive tokenization as ONE token.
# "driver's license" is listed by hand; the remaining expressions come from
# the imported compounds / hyphens / plurals lists. Registration order is:
# the hand-written case first, then compounds, hyphens and plurals.
special_groups = (["driver's license"], compounds, hyphens, plurals)

for group in special_groups:
    for phrase in group:
        # A special case maps the exact string to a single token whose ORTH
        # is the whole string.
        nlp.tokenizer.add_special_case(phrase, [{ORTH: phrase}])

#### Specify the POS tag for compound words
##### without this several of the compound words were given the wrong tag (e.g. PROPN)
##### so I specified that all occurrences of a special token should have the POS tag specified in the CEFR-J list
##### there are 7 tokens that have 2 POS tags 'all right', 'upside down', 'full-time', 'grown-up', 'half-price', 'part-time', 'second-hand'
##### sometimes they will be given the wrong tag, but it won't affect the lexical coverage, as these tokens' POS tags are both at the same CEFR level 
##### (e.g. ('all right', 'ADJ', 'a1') and ('all right', 'ADV', 'a1'))
##### 
##### I also added an extra attribute ruler to tag ordinal numbers (e.g. 22nd) as NUM: before they were tagged as NOUN

In [6]:
ruler = nlp.get_pipe("attribute_ruler")

# Force the coarse POS of each compound / hyphenated word to the POS given in
# the CEFR-J list. Only POS is overwritten: the fine-grained tag stays whatever
# spaCy assigned, which is acceptable here because only POS is used downstream
# and the untouched tag still reveals which tokens had their POS amended.
# A word listed with two POS values gets two rules, added in wordlist order.
for word_group in (compounds, hyphens):
    for word in word_group:
        for entry in cj_wordlist:
            if entry[0] == word:
                ruler.add(patterns=[[{"LOWER": word}]], attrs={"POS": entry[1]})

# Plural forms of the countable compound nouns are always nouns.
for plural_form in plurals:
    ruler.add(patterns=[[{"LOWER": plural_form}]], attrs={"POS": "NOUN"})

# Anything that looks like a number (including ordinals such as 22nd, which
# were previously tagged NOUN) is tagged NUM.
ruler.add(patterns=[[{"LIKE_NUM": True}]], attrs={"POS": "NUM"})

#### Read in a document from the 'world_news_720_df.csv' dataset (my YouTube Short News Corpus)
##### change the index in `texts[...]` (a number up to 8286) to choose which text is assigned to the text1 variable

In [15]:
# Read the corpus and select one transcript by its row label.
# (text1 may instead be set to any string to analyse a custom text.)
df = pd.read_csv("world_news_720_df.csv")
texts = df.loc[:, "text"]
text1 = texts.loc[2520]
text1

"people in iran have been voting for a new president the country is facing big challenges from the pandemic and an economic crisis to soaring inflation and continuing u.s sanctions the winner is expected to be a hardline candidate who has close links to the country's supreme leader ayatollah khamenei ibrahim raichi is head of the judiciary and a religious conservative who has led crackdowns on popular protest and demands for reform many more moderate candidates have been barred from standing the election comes at a delicate time with hopes that iran will return to the deal that restricted its nuclear programme our middle east editor jeremy bowen reports elections in iran are not free or fair but they're a window into an opaque country with a repressive regime any resemblance to democracy is coincidental candidates are vetted in advance millions of frustrated iranians have stopped hoping that voting will improve their lives in the city of shiraz he was pulling down every election poster

#### Apply the spaCy pipeline to the text (tokenizer, POS tagger, lemmatizer...)
##### also create a no punctuation version of the doc after applying the pipeline
##### this is to calculate the lexical coverage (to avoid counting punctuation as tokens)
##### and print the tokens with POS tags and lemmas

In [16]:
# Run the customised pipeline, then keep only the non-punctuation tokens so
# that punctuation is not counted when lexical coverage is computed.
doc = nlp(text1)
no_punc_doc = list(filter(lambda tok: not tok.is_punct, doc))

# One line per token: text, coarse POS, lemma, fine-grained tag.
for tok in no_punc_doc:
    print(tok, tok.pos_, tok.lemma_, tok.tag_)

people NOUN people NNS
in ADP in IN
iran PROPN iran NNP
have AUX have VBP
been AUX be VBN
voting VERB vote VBG
for ADP for IN
a DET a DT
new ADJ new JJ
president NOUN president NN
the DET the DT
country NOUN country NN
is AUX be VBZ
facing VERB face VBG
big ADJ big JJ
challenges NOUN challenge NNS
from ADP from IN
the DET the DT
pandemic ADJ pandemic JJ
and CCONJ and CC
an DET an DT
economic ADJ economic JJ
crisis NOUN crisis NN
to ADP to IN
soaring VERB soar VBG
inflation NOUN inflation NN
and CCONJ and CC
continuing VERB continue VBG
u.s PROPN u.s NNP
sanctions NOUN sanction NNS
the DET the DT
winner NOUN winner NN
is AUX be VBZ
expected VERB expect VBN
to PART to TO
be VERB be VB
a DET a DT
hardline ADJ hardline JJ
candidate NOUN candidate NN
who PRON who WP
has VERB have VBZ
close ADJ close JJ
links NOUN link NNS
to ADP to IN
the DET the DT
country NOUN country NN
's PART 's POS
supreme PROPN supreme NNP
leader PROPN leader NNP
ayatollah PROPN ayatollah NNP
khamenei PROPN khamenei 

#### Assign the tokens to lists
##### Create an empty list to store each level of word
##### If the token is a proper noun, add it to the propn list; if it is a number, add it to the num list
##### Check if the token and pos match a tuple in the cefr-j list of tuples, and if they do, add it to the appropriate cefr level list
##### If none of the conditions are met, append it to the 'others' list


In [17]:
# Result containers. Proper nouns and numbers only keep the lemma; every CEFR
# level (and the "others" bucket) keeps the lemma and, in a parallel list,
# the POS it was matched with.
propn = []
num = []
a1_words = []
a1_words_pos = []
a2_words = []
a2_words_pos = []
b1_words = []
b1_words_pos = []
b2_words = []
b2_words_pos = []
others = []
others_pos = []

# Dispatch table: CEFR level -> (lemma list, POS list).
level_lists = {
    'a1': (a1_words, a1_words_pos),
    'a2': (a2_words, a2_words_pos),
    'b1': (b1_words, b1_words_pos),
    'b2': (b2_words, b2_words_pos),
}

# Build a (headword, POS) -> level lookup once instead of rescanning the whole
# wordlist for every token. setdefault keeps the FIRST entry in wordlist order,
# so a (headword, POS) pair listed more than once resolves exactly as a
# first-match linear scan would. Entries whose level is not a1/a2/b1/b2 are
# ignored, so such tokens still fall through to "others".
cj_lookup = {}
for item in cj_wordlist:
    if item[2] in level_lists:
        cj_lookup.setdefault((item[0], item[1]), item[2])

for token in no_punc_doc:
    # Proper nouns and numbers are bucketed on POS alone; the wordlist is not
    # consulted (this also holds when the wordlist is empty).
    if token.pos_ == "PROPN":
        propn.append(token.lemma_)
    elif token.pos_ == "NUM":
        num.append(token.lemma_)
    else:
        level = cj_lookup.get((token.lemma_, token.pos_))
        if level is None:
            # No (lemma, POS) match in the CEFR-J list.
            others.append(token.lemma_)
            others_pos.append(token.pos_)
        else:
            level_words, level_pos = level_lists[level]
            level_words.append(token.lemma_)
            level_pos.append(token.pos_)

#### Calculate the cumulative lexical coverage
##### convert the results to a pandas dataframe and display it

In [18]:
# Number of non-punctuation tokens; the denominator of every percentage.
total_tokens = len(no_punc_doc)


def _percent_of_tokens(bucket):
    """Return the share of non-punctuation tokens held in `bucket`, in percent.

    Returns 0.0 when the document has no non-punctuation tokens (for example
    an empty text) instead of raising ZeroDivisionError.
    """
    if total_tokens == 0:
        return 0.0
    return len(bucket) / total_tokens * 100


# Each figure is CUMULATIVE: it adds the bucket's own share to the running
# total of all previous buckets, in the order PROPN, NUM, A1, A2, B1, B2, others.
propn_percent = _percent_of_tokens(propn)
num_percent = _percent_of_tokens(num) + propn_percent
a1_percent = _percent_of_tokens(a1_words) + num_percent
a2_percent = _percent_of_tokens(a2_words) + a1_percent
b1_percent = _percent_of_tokens(b1_words) + a2_percent
b2_percent = _percent_of_tokens(b2_words) + b1_percent
others_percent = _percent_of_tokens(others) + b2_percent

# Single-row table, one column per bucket.
data = {'PROPN': [propn_percent], 'NUM': [num_percent], 'A1': [a1_percent], 'A2': [a2_percent], 'B1': [b1_percent],
        'B2': [b2_percent], 'others': [others_percent]}

df2 = pd.DataFrame(data)
df2

Unnamed: 0,PROPN,NUM,A1,A2,B1,B2,others
0,9.73236,10.462287,66.423358,76.642336,87.347932,92.70073,100.0


#### Check which lemmas were appended to which list

In [19]:
# Display the lemmas of all tokens whose POS was PROPN.
propn

['iran',
 'u.s',
 'supreme',
 'leader',
 'ayatollah',
 'khamenei',
 'ibrahim',
 'raichi',
 'iran',
 'middle',
 'east',
 'editor',
 'jeremy',
 'bowen',
 'iran',
 'iranians',
 'shiraz',
 'amir',
 'hussein',
 'ibrahim',
 'iran',
 'ayatollah',
 'ali',
 'khamenei',
 'supreme',
 'leader',
 'abu',
 'nasa',
 'hemati',
 'iran',
 'iran',
 'ibrahim',
 'risi',
 'iran',
 'iran',
 'president',
 'jeremy',
 'bowen',
 'bbc',
 'news']

In [20]:
# Display the lemmas of all tokens whose POS was NUM.
num

['2019', 'one', '82']

In [21]:
# Pair each A1 lemma with the POS it was matched on.
[(lemma, pos) for lemma, pos in zip(a1_words, a1_words_pos)]

[('people', 'NOUN'),
 ('in', 'ADP'),
 ('have', 'AUX'),
 ('be', 'AUX'),
 ('for', 'ADP'),
 ('a', 'DET'),
 ('new', 'ADJ'),
 ('the', 'DET'),
 ('be', 'AUX'),
 ('big', 'ADJ'),
 ('from', 'ADP'),
 ('the', 'DET'),
 ('and', 'CCONJ'),
 ('an', 'DET'),
 ('to', 'ADP'),
 ('and', 'CCONJ'),
 ('the', 'DET'),
 ('be', 'AUX'),
 ('to', 'PART'),
 ('be', 'VERB'),
 ('a', 'DET'),
 ('who', 'PRON'),
 ('have', 'VERB'),
 ('close', 'ADJ'),
 ('to', 'ADP'),
 ('the', 'DET'),
 ("'s", 'PART'),
 ('be', 'VERB'),
 ('head', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('and', 'CCONJ'),
 ('a', 'DET'),
 ('who', 'PRON'),
 ('have', 'AUX'),
 ('on', 'ADP'),
 ('and', 'CCONJ'),
 ('for', 'ADP'),
 ('many', 'ADJ'),
 ('have', 'AUX'),
 ('be', 'AUX'),
 ('from', 'ADP'),
 ('the', 'DET'),
 ('come', 'VERB'),
 ('at', 'ADP'),
 ('a', 'DET'),
 ('time', 'NOUN'),
 ('with', 'ADP'),
 ('hope', 'NOUN'),
 ('that', 'SCONJ'),
 ('will', 'AUX'),
 ('to', 'ADP'),
 ('the', 'DET'),
 ('that', 'DET'),
 ('its', 'PRON'),
 ('programme', 'NOUN'),
 ('our', 'PRON'),
 ('i

In [34]:
# Pair each A2 lemma with the POS it was matched on.
# NOTE(review): the saved output of this cell has a much later execution count
# than the neighbouring cells and appears to come from a different text —
# re-run the notebook top to bottom to refresh it.
[(lemma, pos) for lemma, pos in zip(a2_words, a2_words_pos)]

[('understand', 'VERB'),
 ('importance', 'NOUN'),
 ('consequence', 'NOUN'),
 ('international', 'ADJ'),
 ('drill', 'NOUN'),
 ('honor', 'NOUN'),
 ('pilot', 'NOUN'),
 ('captain', 'NOUN'),
 ('explain', 'VERB'),
 ('exactly', 'ADV'),
 ('purpose', 'NOUN'),
 ('drill', 'NOUN'),
 ('chance', 'NOUN'),
 ('exactly', 'ADV'),
 ('yeah', 'INTJ'),
 ('advantage', 'NOUN'),
 ('pilot', 'NOUN'),
 ('mistake', 'NOUN'),
 ('disadvantage', 'NOUN'),
 ('pro', 'NOUN'),
 ('yeah', 'INTJ'),
 ('understand', 'VERB'),
 ('next', 'ADJ'),
 ('generation', 'NOUN'),
 ('pilot', 'NOUN'),
 ('all', 'ADV'),
 ('country', 'NOUN'),
 ('yeah', 'INTJ'),
 ('pilot', 'NOUN'),
 ('understand', 'VERB'),
 ('train', 'VERB'),
 ('explain', 'VERB'),
 ('advantage', 'NOUN'),
 ('strike', 'NOUN'),
 ('attack', 'NOUN'),
 ('such', 'ADJ'),
 ('advantage', 'NOUN'),
 ('advanced', 'ADJ'),
 ('face', 'VERB'),
 ('face', 'VERB'),
 ('battery', 'NOUN'),
 ('probably', 'ADV'),
 ('face', 'VERB'),
 ('training', 'NOUN'),
 ('exactly', 'ADV'),
 ('training', 'NOUN'),
 ('train

In [35]:
# Pair each B1 lemma with the POS it was matched on.
# NOTE(review): the saved output of this cell has a much later execution count
# than the neighbouring cells and appears to come from a different text —
# re-run the notebook top to bottom to refresh it.
[(lemma, pos) for lemma, pos in zip(b1_words, b1_words_pos)]

[('responsibility', 'NOUN'),
 ('strike', 'VERB'),
 ('various', 'ADJ'),
 ('strategic', 'ADJ'),
 ('threat', 'NOUN'),
 ('air force', 'NOUN'),
 ('responsibility', 'NOUN'),
 ('prepared', 'ADJ'),
 ('capable', 'ADJ'),
 ('mission', 'NOUN'),
 ('require', 'VERB'),
 ('main', 'ADJ'),
 ('more', 'ADV'),
 ('entire', 'ADJ'),
 ('maximum', 'ADJ'),
 ('anti', 'ADJ'),
 ('anti', 'ADJ'),
 ('aware', 'ADJ'),
 ('surround', 'VERB'),
 ('awareness', 'NOUN'),
 ('threat', 'NOUN'),
 ('northern', 'ADJ'),
 ('region', 'NOUN'),
 ('border', 'NOUN'),
 ('border', 'NOUN'),
 ('react', 'VERB'),
 ('enemy', 'NOUN'),
 ('used to', 'AUX'),
 ('air force', 'NOUN'),
 ('prepared', 'ADJ'),
 ('enemy', 'NOUN')]

In [36]:
# Pair each B2 lemma with the POS it was matched on.
# NOTE(review): the saved output of this cell has a much later execution count
# than the neighbouring cells and appears to come from a different text —
# re-run the notebook top to bottom to refresh it.
[(lemma, pos) for lemma, pos in zip(b2_words, b2_words_pos)]

[('privilege', 'NOUN'),
 ('capability', 'NOUN'),
 ('aircraft', 'NOUN'),
 ('mine', 'NOUN'),
 ('official', 'NOUN'),
 ('privilege', 'NOUN'),
 ('learning', 'NOUN'),
 ('politically', 'ADV')]

In [22]:
# Pair each unmatched lemma with its POS (tokens not found in the CEFR-J list).
[(lemma, pos) for lemma, pos in zip(others, others_pos)]

[('sanction', 'NOUN'),
 ('hardline', 'ADJ'),
 ('judiciary', 'NOUN'),
 ('conservative', 'NOUN'),
 ('crackdown', 'NOUN'),
 ('opaque', 'ADJ'),
 ('repressive', 'ADJ'),
 ('resemblance', 'NOUN'),
 ('coincidental', 'ADJ'),
 ('vet', 'VERB'),
 ('million', 'NOUN'),
 ('voting', 'NOUN'),
 ('plea', 'NOUN'),
 ('boycott', 'VERB'),
 ('hardliner', 'NOUN'),
 ('raisi', 'VERB'),
 ('judiciary', 'NOUN'),
 ('past', 'NOUN'),
 ('bleak', 'ADJ'),
 ('iranian', 'ADJ'),
 ('reformist', 'NOUN'),
 ('1980', 'NOUN'),
 ('raisi', 'NOUN'),
 ('prosecutor', 'NOUN'),
 ('pinnacle', 'NOUN'),
 ('hardliner', 'NOUN'),
 ('reformist', 'NOUN'),
 ('sanction', 'NOUN'),
 ('cue', 'NOUN'),
 ('iranian', 'ADJ')]