In [1]:
# Import the relevant libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

import spacy
import gensim.downloader as api
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from collections import Counter

from tqdm.notebook import tqdm

2023-07-20 17:39:24.435136: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Introduction (feel free to change the title)

The data pulled from the file titled *train_processed.csv* contains 4 columns - namely the *label* of each review, the review itself named *text*, *Tokens* obtained from the preprocessing of the text, and *Token string* which concatenates the tokens to form a sentence. The tokens only contain alphabetical words but do not contain stop words. Additionally, words that occur more than 7730 times and those that occur fewer than 6 times were removed from the list in an effort to reduce the complexity of the data to be handled by the methods below. 

The NLP methods implemented here are TFIDF, Doc2Vec, and Word2Vec, along with a combination of the three. The machine learning models used for classification are Naive Bayes, Logistic Regression, and Support Vector Classifier

In [2]:
# Read the csv with the processed data
df_train = pd.read_csv('train_processed.csv')
df_train.head()

Unnamed: 0,label,text,Tokens,Token string
0,0,Batch #5\n\nAppearance: Pours a slightly hazy ...,"['batch', 'appearance', 'slightly', 'hazy', 'a...",batch appearance slightly hazy auburn color fi...
1,0,Murky peach color with off-white head. Aroma h...,"['murky', 'peach', 'color', 'white', 'tart', '...",murky peach color white tart fruit kind minera...
2,0,Can poured into a Spiegelau IPA glass\n\nA: Po...,"['spiegelau', 'ipa', 'glass', 'golden', 'amber...",spiegelau ipa glass golden amber kinda creamy ...
3,0,A big thanks to Jeff for this one. 750ml cappe...,"['big', 'thank', 'jeff', 'cap', 'bottle', 'bro...",big thank jeff cap bottle brooklyn brewery sni...
4,0,On tap into a shaker pint.\n\nAppearance is go...,"['tap', 'shaker', 'pint', 'appearance', 'golde...",tap shaker pint appearance golden amber lot la...


After the preprocessing, we can see that 37 reviews did not meet the criteria laid out above and were hence, removed from the list

In [4]:
# Info of the processed data
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21057 entries, 0 to 21056
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         21057 non-null  int64 
 1   text          21057 non-null  object
 2   Tokens        21057 non-null  object
 3   Token string  21020 non-null  object
dtypes: int64(1), object(3)
memory usage: 658.2+ KB
None


In [5]:
# Removing rows with NA values
df_train.dropna(ignore_index=True, inplace=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21020 entries, 0 to 21019
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         21020 non-null  int64 
 1   text          21020 non-null  object
 2   Tokens        21020 non-null  object
 3   Token string  21020 non-null  object
dtypes: int64(1), object(3)
memory usage: 657.0+ KB


In [5]:
nlp = spacy.load("en_core_web_lg")

### For Cheng
The 2 code cells below are to obtain the typos/words that aren't recognised by spacy. The counts of these unique words can be found in the files called *train_typos.csv* and *test_typos.csv*

In [None]:
# Count of typos in the train dataset
typos_train = [[tkn.text for tkn in nlp(doc) if (tkn.is_oov) & (tkn.is_alpha)] for doc in tqdm(df_train.text)]

train_typos_list = []
for lst in typos_train:
    for tkn in lst:
        train_typos_list.append(tkn)

print(len(train_typos_list))

train_typo_count = Counter(train_typos_list)
print(train_typo_count)

df_train_typos = pd.DataFrame.from_dict(data=train_typo_count, orient='index', columns=['Count'])
df_train_typos.to_csv('train_typos.csv')

  0%|          | 0/21057 [00:00<?, ?it/s]

In [None]:
# Count of typos in the test dataset
typos_test = [[tkn.text for tkn in nlp(doc) if (tkn.is_oov) & (tkn.is_alpha)] for doc in tqdm(df_test.text)]

test_typos_list = []
for lst in typos_test:
    for tkn in lst:
        test_typos_list.append(tkn)

print(len(test_typos_list))

test_typo_count = Counter(test_typos_list)
print(test_typo_count)

df_test_typos = pd.DataFrame.from_dict(data=test_typo_count, orient='index', columns=['Count'])
df_test_typos.to_csv('test_typos.csv')

  0%|          | 0/8943 [00:00<?, ?it/s]

In [3]:
# Displaying list of pre-trained language models that can be used for the word2vec algorithm
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

**Note**: Not sure if this should come in the conclusion section. I'll leave that up to you

The *word2vec-google-news-300* model was used as it is very comprehensive, however, we could look into using models from glove or fasttext in order to compare the results 

In [6]:
# loading the appropriate pre-trained language model
wv = api.load('word2vec-google-news-300')

With the Word2Vec approach, we calculated the average value of the vector across a review

In [8]:
# Function to calculate average value of document vector
def word_vectoriser(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size) # initialise vector

    counter = 1
    for word in sent:
        if word in wv:
            counter += 1 # increase counter
            wv_res += wv[word] # sum of all word vectors within a review

    wv_res = wv_res/counter # compute average value of vector
    return wv_res # return average value of vector

In [9]:
# Function to split data into train and validation data
def split_data(X, y):
    # applying train_test_split with the following parameters
    # stratify is used to ensure all labels have the same density as the original dataset
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    return X_train, X_valid, y_train, y_valid # return split data

In [10]:
# Function to run the naive bayes classifier
def naive_bayes(X_train, X_valid, y_train, y_valid): 
    nb = MultinomialNB() # mulinomial naive bayes model is initialised
    nb.fit(X_train, y_train) # data is fit to the model
    y_pred_nb = nb.predict(X_valid) # prediction is made using validation data
    print(f'Accuracy of Naive Bayes: {accuracy_score(y_valid, y_pred_nb)}') # accuracy of the model in classifying
    print(f'Classification report:\n{classification_report(y_valid, y_pred_nb)}') # summary of the classification

In [11]:
# Function to run the support vector classifier
def svc(X_train, X_valid, y_train, y_valid):
    svc = SVC(random_state=42) # initialised with random state of 42
    svc.fit(X_train, y_train) # data is fit to the model
    y_pred_svc = svc.predict(X_valid) # prediction is made using validation data
    print(f'Accuracy of SVC: {accuracy_score(y_valid, y_pred_svc)}') # accuracy of the model in classifying
    print(f'Classification report:\n{classification_report(y_valid, y_pred_svc)}') # summary of the classification

In [12]:
# Function to run logistic regression
def lr(X_train, X_valid, y_train, y_valid):
    lr = LogisticRegression(random_state=42, solver='newton-cg') # initialised with random state of 42 and choice of solver
    lr.fit(X_train, y_train) # data is fit to the model
    y_pred_lr = lr.predict(X_valid) # prediction is made using validation data
    print(f'Accuracy of Logistic Regression: {accuracy_score(y_valid, y_pred_lr)}') # accuracy of the model in classifying
    print(f'Classification report:\n{classification_report(y_valid, y_pred_lr)}') # summary of the classification

## Results

### TFIDF

The TFIDF vectoriser provided results of 58% accuracy by the Naive Bayes classifier, 60.27% accuracy by the Support Vector Classifier, and 59.77% accuracy by the Logistic Regression classifier. One notable observation is that the first 2 classifiers are not able to accurately classify label 1 accurately enough but is able to do a much better job with labels 0 and 2.

In [13]:
vectorizer = TfidfVectorizer() # initialise the vectoriser
tfidf_X_train, tfidf_X_valid, y_train, y_valid = split_data(df_train['Token string'], df_train.label) # split the data
tfidf_train = vectorizer.fit_transform(tfidf_X_train) # fit the vectoriser to the train data
tfidf_valid = vectorizer.transform(tfidf_X_valid) # vectorise the validation data using the transform method

# running the various classifiers
naive_bayes(tfidf_train, tfidf_valid, y_train, y_valid)
svc(tfidf_train, tfidf_valid, y_train, y_valid)
lr(tfidf_train, tfidf_valid, y_train, y_valid)

Accuracy of Naive Bayes: 0.5801617507136061
Classification report:
              precision    recall  f1-score   support

           0       0.61      0.67      0.64      1407
           1       0.47      0.41      0.44      1393
           2       0.64      0.66      0.65      1404

    accuracy                           0.58      4204
   macro avg       0.57      0.58      0.58      4204
weighted avg       0.57      0.58      0.58      4204

Accuracy of SVC: 0.6027592768791628
Classification report:
              precision    recall  f1-score   support

           0       0.64      0.68      0.66      1407
           1       0.49      0.43      0.46      1393
           2       0.66      0.70      0.68      1404

    accuracy                           0.60      4204
   macro avg       0.60      0.60      0.60      4204
weighted avg       0.60      0.60      0.60      4204

Accuracy of Logistic Regression: 0.5977640342530923
Classification report:
              precision    recall  f1

### Doc2Vec

The Doc2Vec approach requires us to build the vocabulary out of the tagged documents which are obtained from the training portion of the split dataset. We then infer the vectors for the train and validation datasets. The results are poorer compared to the previous approach with a 57.78% accuracy by the Support Vector Classifier, and 57.4% accuracy by the Logistic Regression Classifier. The Naive Bayes classifier cannot be run due to the vectors containing negative values which are not accepted by this classifier as it assumes a multinomial distribution

In [14]:
tokens = [tkn.split() for tkn in df_train['Token string']] # splitting the token string to get the list of words in each review
doc2vec_X_train, doc2vec_X_valid, y_train, y_valid = split_data(tokens, df_train.label) # split the data

# the section below is for the doc2vec algorithm
X_tagged_docs_train = [TaggedDocument(doc, [i]) for i, doc in enumerate(doc2vec_X_train)] # create a list of tagged documents
model = Doc2Vec(X_tagged_docs_train, vector_size=100, window=2, min_count=10, workers=4, epochs=30) # initialising the doc2vec model
model.build_vocab(X_tagged_docs_train) # building the vocabulary based on the train data
model.train(X_tagged_docs_train, total_examples=model.corpus_count, epochs=model.epochs) # training the model
doc2vec_X_train = [model.infer_vector(doc.words) for doc in tqdm(X_tagged_docs_train)] # inferring vectors in the train dataset

X_tagged_docs_valid = [TaggedDocument(doc, [i]) for i, doc in enumerate(doc2vec_X_valid)] # create a list of tagged documents
# inferring the vectors for the validation based on the model that was trained on the train dataset
doc2vec_X_valid = [model.infer_vector(doc.words) for doc in tqdm(X_tagged_docs_valid)]

  0%|          | 0/16816 [00:00<?, ?it/s]

  0%|          | 0/4204 [00:00<?, ?it/s]

In [15]:
# running the various classifiers
svc(doc2vec_X_train, doc2vec_X_valid, y_train, y_valid)
lr(doc2vec_X_train, doc2vec_X_valid, y_train, y_valid)

Accuracy of SVC: 0.5777830637488106
Classification report:
              precision    recall  f1-score   support

           0       0.62      0.64      0.63      1407
           1       0.47      0.37      0.42      1393
           2       0.61      0.72      0.66      1404

    accuracy                           0.58      4204
   macro avg       0.57      0.58      0.57      4204
weighted avg       0.57      0.58      0.57      4204

Accuracy of Logistic Regression: 0.5739771646051379
Classification report:
              precision    recall  f1-score   support

           0       0.61      0.66      0.63      1407
           1       0.47      0.29      0.36      1393
           2       0.59      0.78      0.67      1404

    accuracy                           0.57      4204
   macro avg       0.56      0.57      0.55      4204
weighted avg       0.56      0.57      0.55      4204



### Combining Doc2Vec and TFIDF

Here, we combine the output vectors obtained from the above 2 approaches and stack them in the hopes of obtaining an improvement in the accuracy of classification

We notice that the results don't improve considerably with the accuracy of the Support Vector Classifier being 58.25% and the accuracy of the Logistics Regression Classifier being 60.1%. This is probably because the vectors obtained don't contain additional information that is helpful for the classification. Notice again that both classifiers are not able to accurately classify label 1 as it is for labels 0 and 2.

In [16]:
from scipy.sparse import hstack
# merging the train vectors obtained from the doc2vec and TFIDF algorithms
merged_features_train = hstack((tfidf_train, doc2vec_X_train))

# merging the validation vectors obtained from the doc2vec and TFIDF algorithms
merged_features_valid = hstack((tfidf_valid, doc2vec_X_valid))

In [17]:
# running the various classifiers
svc(merged_features_train, merged_features_valid, y_train, y_valid)
lr(merged_features_train, merged_features_valid, y_train, y_valid)

Accuracy of SVC: 0.5825404376784015
Classification report:
              precision    recall  f1-score   support

           0       0.63      0.65      0.64      1407
           1       0.47      0.38      0.42      1393
           2       0.62      0.72      0.67      1404

    accuracy                           0.58      4204
   macro avg       0.57      0.58      0.57      4204
weighted avg       0.57      0.58      0.57      4204

Accuracy of Logistic Regression: 0.6008563273073264
Classification report:
              precision    recall  f1-score   support

           0       0.64      0.69      0.66      1407
           1       0.49      0.39      0.43      1393
           2       0.65      0.71      0.68      1404

    accuracy                           0.60      4204
   macro avg       0.59      0.60      0.59      4204
weighted avg       0.59      0.60      0.59      4204



### Combining Doc2Vec, TFIDF, and Word2Vec

Here, we combine the output vectors obtained from all 3 approaches and compare the results. We notice once again that the results haven't improved indicating that no new information is being contained in the vectors generated to be helpful for classification. The accuracy of the Support Vector Classifier is at 58% and that for the Logistics Regression Classifier is at 60.1%. Notice once again that both classifiers are not able to accurately classify label 1 as it is for labels 0 and 2.

In [30]:
# splitting the data for the word2vec approach
word2vec_X_train, word2vec_X_valid, y_train, y_valid = split_data(tokens, df_train.label)
word2vec_train = []
for doc_tkn in word2vec_X_train:
    try:
        word2vec_train.append(word_vectoriser(doc_tkn)) # calling the word_vectoriser function to vectorise each review
    except:
        continue

word2vec_valid = []
for doc_tkn in word2vec_X_valid:
    try:
        word2vec_valid.append(word_vectoriser(doc_tkn)) # calling the word_vectoriser function to vectorise each review
    except:
        continue

# combining the output vectors from all three approaches
combined_embeddings_train = hstack((word2vec_train, merged_features_train))
combined_embeddings_valid = hstack((word2vec_valid, merged_features_valid))

In [31]:
# running the various classifiers
svc(combined_embeddings_train, combined_embeddings_valid, y_train, y_valid)
lr(combined_embeddings_train, combined_embeddings_valid, y_train, y_valid)

Accuracy of SVC: 0.5799238820171265
Classification report:
              precision    recall  f1-score   support

           0       0.61      0.66      0.63      1407
           1       0.46      0.36      0.41      1393
           2       0.63      0.72      0.67      1404

    accuracy                           0.58      4204
   macro avg       0.57      0.58      0.57      4204
weighted avg       0.57      0.58      0.57      4204

Accuracy of Logistic Regression: 0.6008563273073264
Classification report:
              precision    recall  f1-score   support

           0       0.63      0.69      0.66      1407
           1       0.49      0.39      0.43      1393
           2       0.66      0.72      0.69      1404

    accuracy                           0.60      4204
   macro avg       0.59      0.60      0.59      4204
weighted avg       0.59      0.60      0.59      4204

