#### Import the CEFR-J wordlist (the format is a list of tuples)
##### and the pandas library

In [2]:
from cj_tuples import cj_wordlist
import pandas as pd

#### For Adjectives (in 3 steps)
#### Read in the CSV

In [3]:
# Load the adjective frequency list; the pasted output shows the columns
# word, frequency, per_million, doc_freq and doc_percentage.
df = pd.read_csv('World_adj_frequency.csv')
# Bare expression: the notebook renders the dataframe as the cell output.
df

Unnamed: 0,word,frequency,per_million,doc_freq,doc_percentage
0,more,7299,2558.110356,4060,48.998310
1,new,4935,1729.589616,2736,33.019551
2,other,4020,1408.905827,2792,33.695390
3,many,3677,1288.693215,2426,29.278301
4,first,3560,1247.687747,2429,29.314506
...,...,...,...,...,...
7321,abortive,1,0.350474,1,0.012069
7322,ir,1,0.350474,1,0.012069
7323,yaer,1,0.350474,1,0.012069
7324,botanical,1,0.350474,1,0.012069


#### Make a list of adjectives at the A1 and A2 level in the CEFR-J wordlist
#### item[0] = the word, item[1] = the pos, item[2] = the CEFR level (in the list of tuples)

In [4]:
# Gather every CEFR-J adjective graded A1 or A2, keeping wordlist order.
# Each wordlist entry is indexed as: [0] word, [1] part of speech, [2] CEFR level.
a1_a2_adj = [
    entry[0]
    for entry in cj_wordlist
    if entry[1] == 'ADJ' and entry[2] in ('a1', 'a2')
]

# Bare expression: the notebook renders the list as the cell output.
a1_a2_adj

['OK',
 'ok',
 'Olympic',
 'olympic',
 'acceptable',
 'additional',
 'adult',
 'advanced',
 'afraid',
 'aged',
 'alive',
 'all right',
 'alone',
 'alright',
 'amused',
 'ancient',
 'angry',
 'annoying',
 'anxious',
 'appropriate',
 'armed',
 'artificial',
 'asleep',
 'assistant',
 'attractive',
 'audio',
 'automatic',
 'average',
 'awake',
 'awful',
 'back',
 'bad',
 'basic',
 'beautiful',
 'best',
 'better',
 'big',
 'black',
 'blank',
 'blonde',
 'blue',
 'boiled',
 'bored',
 'boring',
 'brave',
 'bright',
 'brilliant',
 'broken',
 'brown',
 'busy',
 'careful',
 'certain',
 'cheap',
 'chemical',
 'classic',
 'clean',
 'clear',
 'clever',
 'close',
 'closed',
 'cloudy',
 'cold',
 'colorful',
 'colourful',
 'comfortable',
 'comic',
 'common',
 'complete',
 'confident',
 'confused',
 'convenient',
 'cool',
 'correct',
 'crazy',
 'cream',
 'creative',
 'crowded',
 'cute',
 'daily',
 'dangerous',
 'dark',
 'dead',
 'dear',
 'delicious',
 'developed',
 'different',
 'difficult',
 'direct',

#### Create a new dataframe, deleting any of the A1 and A2 level adjectives

In [15]:
# Remove the A1/A2 adjectives from the frequency list and save the remainder.
#
# Rows with a missing word are dropped explicitly; the previous per-word
# `str.contains(...) == False` filter discarded them implicitly.
df = df.dropna(subset=['word'])

# Match whole words only. A substring/regex match (str.contains) would also
# delete higher-level words that merely contain an A1/A2 word, e.g. 'clear'
# would remove 'nuclear' and 'unclear', and 'ok' would remove 'spoken'.
df = df[~df['word'].isin(a1_a2_adj)]

df.to_csv('World_adj_B1_plus.csv', encoding='utf8')

#### Follow the same process for other parts of speech
##### Nouns:


In [22]:
# Load the noun frequency list.
df = pd.read_csv('World_noun_frequency.csv')

# Gather every CEFR-J noun graded A1 or A2.
# item[0] = the word, item[1] = the pos, item[2] = the CEFR level
a1_a2_noun = [
    item[0]
    for item in cj_wordlist
    if item[1] == 'NOUN' and item[2] in ('a1', 'a2')
]

# Rows with a missing word are dropped explicitly; the previous per-word
# `str.contains(...) == False` filter discarded them implicitly.
df = df.dropna(subset=['word'])

# filter out words that are in the a1_a2 list
# Match whole words only: a substring/regex match (str.contains) would also
# delete higher-level nouns that merely contain an A1/A2 noun.
df = df[~df['word'].isin(a1_a2_noun)]

# for nouns one 'word' was a hyphen, this is to filter out the hyphen
# (applied once; it does not depend on the A1/A2 list)
df = df[df['word'].str.isalpha()]

# filter out 'one' and 'other'
# after manually reviewing the list, they might be difficult to identify as nouns
# by learners using the ShinyConc concordancer.
# Exact match, so nouns such as 'component' or 'zone' are kept.
df = df[~df['word'].isin(['one', 'other'])]

df.to_csv('World_noun_B1_plus.csv', encoding='utf8')

##### Verbs:

In [25]:
# Load the verb frequency list.
df = pd.read_csv('World_verb_frequency.csv')

# Gather every CEFR-J verb graded A1 or A2.
# item[0] = the word, item[1] = the pos, item[2] = the CEFR level
a1_a2_verb = [
    item[0]
    for item in cj_wordlist
    if item[1] == 'VERB' and item[2] in ('a1', 'a2')
]

# Rows with a missing word are dropped explicitly; the previous per-word
# `str.contains(...) == False` filter discarded them implicitly.
df = df.dropna(subset=['word'])

# filter out words that are in the a1_a2 list
# Match whole words only: a substring/regex match (str.contains) would also
# delete higher-level verbs that merely contain an A1/A2 verb.
df = df[~df['word'].isin(a1_a2_verb)]

# for verbs one 'word' was a hyphen, this is to filter out the hyphen
# (applied once; it does not depend on the A1/A2 list)
df = df[df['word'].str.isalpha()]

# Filter 'accord' out (after manually reviewing the list). 'accord' is the lemma of
# 'according', which occurs mainly in 'according to'. Compounds were not tokenized for
# the frequency lists (unlike for lexical coverage), so 'according' was incorrectly
# tagged as a verb when 'according to' should be a preposition (ADP).
df = df[df['word'] != 'accord']

df.to_csv('World_verb_B1_plus.csv', encoding='utf8')