In [1]:
import spacy
import pandas as pd
from pathlib import Path
from itertools import chain
from collections import Counter
import pickle
import os
import requests

In [2]:
curr_dir = Path(os.getcwd()).parent
df_train = pd.read_csv(f"{curr_dir}/Data/train.csv")

In [3]:
"""
install en_core_web_sm model from spacy, if not, run in terminal in current venv
python -m spacy download en_core_web_sm
"""
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
docs_train = [
    [tkn.lemma_.lower() for tkn in nlp(doc) if (not tkn.is_stop) & (tkn.is_alpha)]
    for doc in nlp.pipe(df_train["text"])
]
df_train["token"] = docs_train

In [4]:
token_count_dict = Counter(chain.from_iterable(docs_train))
token_count_dict = {
    k: v for k, v in sorted(token_count_dict.items(), key = lambda item: item[1])
}

In [5]:
for k, v in token_count_dict.items():
    print(k, v)

drinksble 1
drinkabitity 1
gaggingly 1
bulldoze 1
skateboards 1
beerc 1
narke 1
unyielding 1
popskull 1
wrigley 1
murkbro 1
trustee 1
thew 1
whif 1
hombre 1
castillo 1
caduca 1
semana 1
correcta 1
respecto 1
mejore 1
exponente 1
alarmingly 1
lamost 1
kickstart 1
deeeppp 1
verily 1
sheaf 1
presencez 1
omiyage 1
bsd 1
mislabele 1
halen 1
eddie 1
roadie 1
sweeetness 1
rooftop 1
cayman 1
breezes 1
conch 1
linguistically 1
champagney 1
chugalug 1
eulogy 1
proliferate 1
bom 1
flocculent 1
misguided 1
instal 1
hooligans 1
spitfires 1
asahi 1
misheard 1
litel 1
stewy 1
pastilles 1
twister 1
graniteskunk 1
yoke 1
hammondsport 1
comments 1
ottauquechee 1
rt 1
quechee 1
unflattering 1
minny 1
complexed 1
identicalo 1
boch 1
indeterminable 1
handbottle 1
subservient 1
drinklability 1
misshapen 1
docent 1
playboy 1
nickles 1
toastedness 1
deere 1
der 1
alte 1
roatstiness 1
birkinau 1
bourne 1
cinnsanity 1
fpbc 1
snca 1
piddly 1
lemonx 1
versitility 1
merriment 1
refresing 1
skyroc 1
eqsque 1
bcaa 1

We can see the frequencies of the words across the entire corpus; <br>
We trim down some of ultra high/low frequency words so that classfications would be most relevant. 

In [6]:
exclude_word = {
    k for k, v in token_count_dict.items() if v <= 5 or v >= 13_000
}

url = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json"
british_to_american_dict = requests.get(url).json()

def americanize(string):
    for british_spelling, american_spelling in british_to_american_dict.items():
        string = string.replace(british_spelling, american_spelling)
    return string

clean_list = [
    americanize(' '.join([tkn for tkn in doc if tkn not in exclude_word])).split()
    for doc in df_train["token"]
]

In [7]:
output_df = df_train[["label"]]
output_df["token"] = clean_list
output_df['text'] = output_df['token'].apply(lambda tkn: " ".join(tkn))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df["token"] = clean_list


In [8]:
with open(f'{curr_dir}/Data/train_processed', "wb") as f:
    pickle.dump(output_df, f)