In [20]:
##### Global Variables ######
# Read by bag_of_words(), tf_idf() and word2vec(): when True they (re)build the
# encoded vectors and write them to the pickle cache; when False they load the
# previously cached pickle instead.
create_new_vectors = False
# Number of leading rows of the IMDB dataset kept after loading the CSV.
data_size = 1500

In [2]:
###### Import Libraries ######
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

!python -m spacy download en_core_web_md
import spacy
import en_core_web_md
nlp = en_core_web_md.load()

import pickle

from tensorflow import keras 
from scipy import sparse

import os

print("Imports complete")

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 1.2MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp37-none-any.whl size=98051304 sha256=5252ac2a016a8b671ae3063b5e4d28bb15fa188791515d04d65a26d8ad9621cc
  Stored in directory: /tmp/pip-ephem-wheel-cache-dp0d_mrt/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
Imports complete


In [3]:
####### Data Preprocessing Functions #######
def is_not_br(token):
    """Return True when the token text carries no HTML line-break residue.

    A token is rejected when its text is exactly 'br', or when it contains
    '<' or '/>' anywhere. Those two substring checks also cover the '<br' and
    '/><br' fragments, since each of them contains '<'.

    Args:
        token: any object exposing a string ``text`` attribute.

    Returns:
        bool: False for break-tag fragments, True otherwise.
    """
    text = token.text
    if text == 'br':
        return False
    return '<' not in text and '/>' not in text
def tokenize(text):
    """Split ``text`` into lower-cased lemmas, dropping uninformative tokens.

    The text is run through the module-level ``nlp`` pipeline. A token is
    discarded when it is a stop word, has the '-PRON-' lemma, is punctuation,
    is rejected by ``is_not_br`` or has purely numeric text.

    Args:
        text: raw text to tokenize.

    Returns:
        list[str]: lower-cased lemmas of the tokens that were kept, in order.
    """
    def should_drop(token):
        # Any single match is enough to exclude the token.
        return (
            token.is_stop
            or token.lemma_ == '-PRON-'
            or token.is_punct
            or not is_not_br(token)
            or token.text.isdigit()
        )

    return [token.lemma_.lower() for token in nlp(text) if not should_drop(token)]
# Maps 'positive' to 1 and 'negative' to 0
def convert_y(sentiments):
    """Encode sentiment labels as a NumPy array of 0/1 integers.

    Args:
        sentiments: iterable of labels.

    Returns:
        np.ndarray: 1 where the label equals 'positive', 0 for anything else.
    """
    return np.array([int(label == 'positive') for label in sentiments])
# Normalization function before neural network: rescales all values linearly to the range [-1, 1]
def normalize(arr):
    """Min-max rescale ``arr`` linearly onto the interval [-1, 1].

    The global minimum of the array maps to -1 and the global maximum to 1;
    the scaling uses a single min/max over the whole array, not per row or
    per column.

    Args:
        arr: array-like of numbers.

    Returns:
        np.ndarray: array of the same shape with values in [-1, 1]. When every
        element is identical (zero range) an all-zero float array is returned,
        because the formula would otherwise evaluate 0/0 and produce NaN.

    Raises:
        ValueError: if ``arr`` is empty (raised by ``np.max``).
    """
    arr = np.asarray(arr)
    max_val = np.max(arr)
    min_val = np.min(arr)
    value_range = max_val - min_val
    if value_range == 0:
        # Constant input: avoid dividing by zero and return the midpoint of [-1, 1].
        return np.zeros(arr.shape, dtype=float)
    return 2 * ((arr - min_val) / value_range) - 1

#%%
###### Loading Data ######
!gdown --id 1EQCiEAMrmGZe4eGH990Lf5oyR7HK-z3x
imdb = pd.read_csv('imdbDataset.csv')
imdb = imdb.iloc[:data_size]
pd.set_option("display.max_colwidth", None)
print(f"Length : {len(imdb)}")

print("Data Loading Complete")

#%%
##### Word -> Number Encoding ######
##### Bag Of Words
##### Hyperparams: max_features
def bag_of_words(X, y):
    """Encode reviews as bag-of-words count vectors, cached on disk as a pickle.

    When the module-level ``create_new_vectors`` flag is true, the vectors are
    built with a ``CountVectorizer`` (custom ``tokenize`` analyzer, at most 2000
    features) and written to the cache file. If a cache file already exists the
    user is asked whether to rebuild it. When the flag is false, the cached
    vectors are loaded instead.

    Args:
        X: iterable of raw review texts (each item is passed to ``tokenize``).
        y: iterable of sentiment labels, converted with ``convert_y``.

    Returns:
        tuple: (dense 2-D array of the encoded reviews, label array).

    Raises:
        ValueError: if the answer to the rebuild prompt is neither Y nor N.
        FileNotFoundError: if the cache must be loaded but does not exist.
    """
    cache_dir = "/content/preencoded_embeddings"
    cache_path = "/content/preencoded_embeddings/bow_transformer_vectors.pickle"

    def load_model():
        print("Loading vectorizer...")
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    def create_model(X_data):
        print("Creating vectorizer...")
        bow_transformer = CountVectorizer(analyzer=tokenize, max_features=2000).fit(X_data)
        X_data = bow_transformer.transform(X_data)
        # makedirs(exist_ok=True) creates the cache directory only when missing.
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as cache_file:
            pickle.dump(X_data, cache_file)
        return X_data

    print("Starting Bag of Words Model")
    if create_new_vectors:
        if os.path.isfile(cache_path):
            user_input = input("Found vectorizer. Are you sure you still want to make a new vectorizer? (Y/N)")
            answer = user_input.strip().upper()
            if answer == "N":
                X = load_model()
            elif answer == "Y":
                # Drop this encoder's own stale cache before rebuilding it.
                os.remove(cache_path)
                X = create_model(X)
            else:
                print("Input not in format specified")
                # Continuing would hand raw text to the matrix conversion below.
                raise ValueError(f"Expected 'Y' or 'N', got {user_input!r}")
        else:
            print("Did not find vectorizer.")
            X = create_model(X)
    else:
        print("Vectorizer Found.")
        X = load_model()

    y = convert_y(y)
    print("Bag of Words Model Completed")
    return sparse.lil_matrix(X).toarray(), y

##### TF-IDF Vectorization
##### Hyperparams: max_features
def tf_idf(X, y):
    """Encode reviews as TF-IDF vectors, cached on disk as a pickle.

    When the module-level ``create_new_vectors`` flag is true, the vectors are
    built with a ``TfidfVectorizer`` (custom ``tokenize`` analyzer, at most 2000
    features) and written to the cache file. If a cache file already exists the
    user is asked whether to rebuild it. When the flag is false, the cached
    vectors are loaded instead.

    Args:
        X: iterable of raw review texts (each item is passed to ``tokenize``).
        y: iterable of sentiment labels, converted with ``convert_y``.

    Returns:
        tuple: (dense 2-D array of the encoded reviews, label array).

    Raises:
        ValueError: if the answer to the rebuild prompt is neither Y nor N.
        FileNotFoundError: if the cache must be loaded but does not exist.
    """
    cache_dir = "/content/preencoded_embeddings"
    cache_path = "/content/preencoded_embeddings/tfidf_vectors.pickle"

    def load_model():
        print("Loading vectorizer...")
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    def create_model(X_data):
        print("Creating vectorizer...")
        tfidf_transformer = TfidfVectorizer(analyzer=tokenize, max_features=2000).fit(X_data)
        X_data = tfidf_transformer.transform(X_data)
        # makedirs(exist_ok=True) creates the cache directory only when missing.
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as cache_file:
            pickle.dump(X_data, cache_file)
        return X_data

    print("Starting TF-IDF Model")
    if create_new_vectors:
        # Look for this encoder's own cache file, not another encoder's.
        if os.path.isfile(cache_path):
            user_input = input("Found vectorizer. Are you sure you still want to make a new vectorizer? (Y/N)")
            answer = user_input.strip().upper()
            if answer == "N":
                X = load_model()
            elif answer == "Y":
                os.remove(cache_path)
                X = create_model(X)
            else:
                print("Input not in format specified")
                # Continuing would hand raw text to the matrix conversion below.
                raise ValueError(f"Expected 'Y' or 'N', got {user_input!r}")
        else:
            print("Did not find vectorizer.")
            X = create_model(X)
    else:
        print("Vectorizer Found.")
        X = load_model()

    y = convert_y(y)
    print("TF-IDF Model Created")
    return sparse.lil_matrix(X).toarray(), y

##### Pre-trained Word Embeddings (Word2Vec Twitter Model)
##### Hyperparams: 
def word2vec(X, y):
    """Encode each review as the mean of its token vectors, cached as a pickle.

    Token vectors are read from the module-level ``nlp`` pipeline
    (``nlp(token).vector``). When the module-level ``create_new_vectors`` flag
    is true the review vectors are computed and written to the cache file,
    prompting first if a cache already exists. When the flag is false, the
    cached vectors are loaded instead.

    Args:
        X: iterable of raw review texts (each item is passed to ``tokenize``).
        y: iterable of sentiment labels, converted with ``convert_y``.

    Returns:
        tuple: (review vectors rescaled by ``normalize``, label array).

    Raises:
        ValueError: if the answer to the rebuild prompt is neither Y nor N.
        FileNotFoundError: if the cache must be loaded but does not exist.
    """
    cache_dir = "/content/preencoded_embeddings"
    cache_path = "/content/preencoded_embeddings/word2vec_reviews.pickle"

    def load_model():
        print("Loading word vectors...")
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    def create_model(X):
        vectorized_reviews = []
        for index, review in enumerate(X):
            tokens = tokenize(review)
            # 300 must match the width of nlp(token).vector added below.
            avg_vector = np.zeros(shape=(300))
            for token in tokens:
                avg_vector += nlp(token).vector
            # Average over this review's tokens, not the number of reviews;
            # a review with no surviving tokens keeps the zero vector.
            if tokens:
                avg_vector = avg_vector / len(tokens)
            vectorized_reviews.append(avg_vector)
            print(f"Review {index}: Done")
        # makedirs(exist_ok=True) creates the cache directory only when missing.
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as cache_file:
            pickle.dump(vectorized_reviews, cache_file)
        return vectorized_reviews

    print("Starting Word2Vec Model")
    if create_new_vectors:
        if os.path.isfile(cache_path):
            user_input = input("Found vectorizer. Are you sure you still want to make a new vectorizer? (Y/N)")
            answer = user_input.strip().upper()
            if answer == "N":
                X = load_model()
            elif answer == "Y":
                os.remove(cache_path)
                X = create_model(X)
            else:
                print("Input not in format specified")
                # Continuing would hand raw text to normalize() below.
                raise ValueError(f"Expected 'Y' or 'N', got {user_input!r}")
        else:
            print("Did not find vectorizer")
            X = create_model(X)
    else:
        X = load_model()
    y = convert_y(y)
    print("Word2Vec Model Created")
    return normalize(np.array(X)), y

##### BERT Language Model

##### Model Types ######
##### Logistic Regression
##### Hyperparams: train_test_split, regularization_type, C (inverse of regularization strength), 
def logistic_regression(X, y):
    """Fit a default LogisticRegression and predict on a held-out split.

    The data is split 80/20 with ``random_state=101``; the model is trained on
    the larger part and evaluated on the smaller one.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.

    Returns:
        tuple: (true labels of the test split, predicted labels).
    """
    print("Starting creation of Logistic Regression Model")
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=101
    )
    classifier = LogisticRegression()
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)
    print("Logistic Regression Model created")
    return labels_test, predictions

##### K-Nearest-Neighbours Classifier
##### Hyperparams: n_neighbours, train_test_split
def knn_classifier(X, y, number_neighbors):
    """Fit a k-nearest-neighbours classifier and predict on a held-out split.

    The data is split 80/20 with ``random_state=45``.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.
        number_neighbors: value passed as ``n_neighbors`` to the classifier.

    Returns:
        tuple: (true labels of the test split, predicted labels).
    """
    print("Started creation of KNN Classifier")
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=45
    )
    classifier = KNeighborsClassifier(n_neighbors=number_neighbors)
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)
    print("KNN Classifier created")
    return labels_test, predictions

##### Naive Bayes Classifier
##### Hyperparams: train_test_split
def naive_bayes(X, y):
    """Fit a MultinomialNB classifier and predict on a held-out split.

    The data is split 80/20 with ``random_state=45``.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.

    Returns:
        tuple: (true labels of the test split, predicted labels).
    """
    # NOTE(review): MultinomialNB is documented for non-negative features;
    # inputs rescaled to [-1, 1] may be rejected - confirm before combining.
    print("Started creation of Naive Bayes Classifier")
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=45
    )
    classifier = MultinomialNB()
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)
    print("Naive Bayes Classifier created")
    return labels_test, predictions

##### Neural Network
##### Hyperparams: train_test_split, Architecture, Optimizer: Learning Rate, Batch Size, Initial Weights, Initial Biases
#####              Epochs, Loss Function, Regularization
def neural_network(X, y, architecture_id):
    """Train one of two dense Keras networks and predict on a held-out split.

    The data is split 80/20 with ``random_state=101``. Both architectures end
    in a single sigmoid unit and are compiled with the Adam optimizer and
    binary cross-entropy loss, using a batch size of 64.

    Args:
        X: feature matrix.
        y: binary labels aligned with ``X``.
        architecture_id: 1 for the 300-tanh / 150-sigmoid network trained for
            50 epochs; 2 for the 2000-relu / 200-relu network trained for
            5 epochs.

    Returns:
        tuple: (true labels of the test split, predicted 0/1 labels obtained
        by rounding the sigmoid output).

    Raises:
        ValueError: if ``architecture_id`` is not 1 or 2.
    """
    print("Started creation of Neural Network")
    # Fail fast: an unknown id would otherwise leave `model` unbound and crash
    # with a NameError only after the split has been made.
    if architecture_id not in (1, 2):
        raise ValueError(f"Unknown architecture_id {architecture_id!r}; expected 1 or 2")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

    if architecture_id == 1:
        model = keras.Sequential([
            keras.layers.Dense(
                300,
                activation="tanh",
                kernel_initializer=keras.initializers.RandomUniform(minval=-1, maxval=1),
                bias_initializer=keras.initializers.TruncatedNormal(mean=0, stddev=0.5),
            ),
            keras.layers.Dense(
                150,
                activation="sigmoid",
                kernel_initializer=keras.initializers.RandomUniform(minval=-1, maxval=1),
                bias_initializer=keras.initializers.TruncatedNormal(mean=0, stddev=0.5),
            ),
            keras.layers.Dense(1, activation="sigmoid"),
        ])
        epochs = 50
    else:
        model = keras.Sequential([
            keras.layers.Dense(2000, activation="relu"),
            keras.layers.Dense(200, activation="relu"),
            keras.layers.Dense(1, activation="sigmoid"),
        ])
        epochs = 5

    # Compile and fit are identical for both architectures apart from epochs.
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.fit(X_train, y_train, epochs=epochs, batch_size=64)

    model.summary()
    y_pred = model.predict(X_test)
    # The first column holds the sigmoid probability for each row. np.rint rounds
    # half to even, matching the built-in round() used previously.
    predicted_labels = np.rint(y_pred[:, 0]).astype(int)
    print("Neural Network Created")
    return y_test, predicted_labels

Downloading...
From: https://drive.google.com/uc?id=1EQCiEAMrmGZe4eGH990Lf5oyR7HK-z3x
To: /content/imdbDataset.csv
66.2MB [00:00, 142MB/s] 
Length : 1500
Data Loading Complete


In [24]:
##### Applying Models And Printing Accuracy #####

# Encode the reviews and labels, then train/evaluate the neural network
# using architecture 1.
X, y = word2vec(imdb['review'], imdb['sentiment'])
y_test, y_pred = neural_network(X, y, 1)

# Score the predictions against the true labels of the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Report each metric rounded to two decimal places.
print("Accuracy: ", round(accuracy, 2), ", Precision: ", round(precision, 2), ", Recall: ", round(recall, 2))

##### KNN Classifier Implementation
# max_accuracy = -1
# max_accuracy_neighbors = 0

# for i in range(1, 10):
#     print(f"Number of neighbors: {i}")
#     y_test, y_pred = knn_classifier(X, y, i)

#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)

#     print("Accuracy: ", round(accuracy, 2), ", Precision: ", round(precision, 2), ", Recall ", round(recall, 2))
#     print("\n")

#     if (accuracy > max_accuracy):
#         max_accuracy = accuracy
#         max_accuracy_neighbors = i

# print(f"Maximum Accuracy obtained was {max_accuracy} with {max_accuracy_neighbors}")

Starting Word2Vec Model
Loading word vectors...
Word2Vec Model Created
Started creation of Neural Network
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_28 (Dense) 