In [31]:
# Import relevant libraries
import pandas as pd
import spacy
from collections import Counter
from tqdm.notebook import tqdm

In [32]:
# Read the training reviews from train.csv into a pandas DataFrame
df_train = pd.read_csv(filepath_or_buffer='train.csv')

In [33]:
# Load the large English spaCy pipeline used for all text processing below.
# The large model was picked by the author with the aim of covering as many words as possible.
nlp = spacy.load(name="en_core_web_lg")

In [34]:
# Turn every review into a list of lower-cased lemmas to reduce the complexity of the data.
# Stopwords are dropped and only purely alphabetical tokens are kept.
# nlp.pipe streams the texts through the pipeline in batches, which is spaCy's recommended
# way to process many documents and is considerably faster than calling nlp() once per review.
# tqdm cannot infer the length of the generator returned by nlp.pipe, so the total is passed explicitly.
docs_train = [
    [tkn.lemma_.lower() for tkn in doc if not tkn.is_stop and tkn.is_alpha]
    for doc in tqdm(nlp.pipe(df_train.text), total=len(df_train))
]

  0%|          | 0/21057 [00:00<?, ?it/s]

In [35]:
# Store the token lists in a 'Tokens' column.
# NOTE: df_train_1 is a second name for the same DataFrame object as df_train (no copy is made),
# so the new column is visible through both names; later cells read it via df_train.
df_train['Tokens'] = docs_train
df_train_1 = df_train
df_train_1.head()

Unnamed: 0,label,text,Tokens
0,0,Batch #5\n\nAppearance: Pours a slightly hazy ...,"[batch, appearance, pour, slightly, hazy, aubu..."
1,0,Murky peach color with off-white head. Aroma h...,"[murky, peach, color, white, head, aroma, tart..."
2,0,Can poured into a Spiegelau IPA glass\n\nA: Po...,"[pour, spiegelau, ipa, glass, pour, golden, am..."
3,0,A big thanks to Jeff for this one. 750ml cappe...,"[big, thank, jeff, cap, bottle, brooklyn, brew..."
4,0,On tap into a shaker pint.\n\nAppearance is go...,"[tap, shaker, pint, appearance, golden, amber,..."


In [37]:
# Flatten the per-review token lists into one list holding every token of the corpus
train_list = [tkn for lst in df_train_1['Tokens'] for tkn in lst]

# Count the frequency of each word across the entire corpus
train_count = Counter(train_list)
# Show only the most frequent words; printing the whole Counter floods the cell output
print(train_count.most_common(20))

# Frequency cut-offs for the vocabulary. Words occurring more than MAX_WORD_COUNT times or
# fewer than or equal to MIN_WORD_COUNT times are removed, so that the approaches used to
# classify the reviews only use the most relevant words.
MIN_WORD_COUNT = 5
MAX_WORD_COUNT = 7730

# Words to exclude from the corpus (kept as a list, in first-seen order)
words_to_exclude = [
    word
    for word, count in train_count.items()
    if count <= MIN_WORD_COUNT or count > MAX_WORD_COUNT
]

# A peek into the words being excluded (first 100 only), followed by how many there are in total
print(words_to_exclude[:100])
print(len(words_to_exclude))

['pour', 'head', 'smell', 'taste', 'aroma', 'finish', 'light', 'carbonation', 'sweet', 'hop', 'malt', 'nice', 'beer', 'flavor', 'good', 'drinksble', 'shortbread', 'like', 'fisish', 'ringer', 'dark', 'drinkabitity', 'gaggingly', 'whitehead', 'shovel', 'conquer', 'bulldoze', 'legitimately', 'skateboards', 'beerc', 'reall', 'asset', 'narke', 'unyielding', 'hawk', 'mendocino', 'popskull', 'wrigley', 'murkbro', 'trustee', 'percolate', 'thew', 'tied', 'hermitage', 'jose', 'whif', 'combined', 'landbierparadies', 'viendo', 'hombre', 'castillo', 'caduca', 'semana', 'correcta', 'aunque', 'falta', 'mayor', 'intensidad', 'sabor', 'respecto', 'mejore', 'exponente', 'estilo', 'buen', 'eso', 'si', 'chilling', 'alarmingly', 'lamost', 'uncanny', 'defeat', 'underpowered', 'kickstart', 'resembles', 'chuck', 'mitch', 'deeeppp', 'colma', 'lucid', 'scents', 'ornate', 'iodine', 'verily', 'cuisine', 'sheaf', 'fondness', 'winy', 'presencez', 'popeye', 'omiyage', 'obligatory', 'crjmellor', 'rug', 'spree', 'okto

In [39]:
# Token lists with the excluded words (found in the previous step) removed.
# words_to_exclude is a list, so testing membership against it scans the list for every token.
# Building a set once makes each lookup constant-time without changing the result.
words_to_exclude_set = set(words_to_exclude)
docs_train_2 = [
    [tkn for tkn in doc if tkn not in words_to_exclude_set]
    for doc in tqdm(df_train.Tokens)
]

  0%|          | 0/21057 [00:00<?, ?it/s]

In [42]:
# Prepare the frame that gets exported.
# NOTE: df_train_3 is another name for the same DataFrame object as df_train (no copy is made),
# so the 'Tokens' column is overwritten in place with the filtered token lists.
# 'Token string' holds each token list joined by single spaces; per the author's note it was
# added because of problems faced when importing the list column back from the csv file.
df_train['Tokens'] = docs_train_2
df_train['Token string'] = df_train['Tokens'].apply(" ".join)
df_train_3 = df_train
df_train_3.head()

Unnamed: 0,label,text,Tokens,Token string
0,0,Batch #5\n\nAppearance: Pours a slightly hazy ...,"[batch, appearance, slightly, hazy, auburn, co...",batch appearance slightly hazy auburn color fi...
1,0,Murky peach color with off-white head. Aroma h...,"[murky, peach, color, white, tart, fruit, kind...",murky peach color white tart fruit kind minera...
2,0,Can poured into a Spiegelau IPA glass\n\nA: Po...,"[spiegelau, ipa, glass, golden, amber, kinda, ...",spiegelau ipa glass golden amber kinda creamy ...
3,0,A big thanks to Jeff for this one. 750ml cappe...,"[big, thank, jeff, cap, bottle, brooklyn, brew...",big thank jeff cap bottle brooklyn brewery sni...
4,0,On tap into a shaker pint.\n\nAppearance is go...,"[tap, shaker, pint, appearance, golden, amber,...",tap shaker pint appearance golden amber lot la...


In [43]:
# Export the processed frame to train_processed.csv without writing the DataFrame index
df_train_3.to_csv(path_or_buf='train_processed.csv', index=False)