# Import dependencies and determine working directory

In [1]:
# Import libraries
import os
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Import topic model 
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Get stop words 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Import NLP vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import word2vec

# Import models 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chriskhoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using TensorFlow backend.


In [2]:
# get current directory
dir = os.path.dirname(os.path.abspath('__file__'))

# Define data sets

## Load pre-processed data

In [3]:
# Load df from a csv - all text to lower case, tokenize into list of strings, remove punctuation and lemmatize
preprocessed_path = os.path.join(dir, '02_processed_data','review_text_stars.csv')
preprocessed_df = pd.read_csv(preprocessed_path, index_col = False)
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 43.9+ MB


## Split train and test data

In [4]:
# Create training and test sets using a fixed seed for reproducibility 
X_train, X_test, y_train, y_test = train_test_split(preprocessed_df.processed_review, preprocessed_df.stars_review, test_size = 0.3, random_state = 42)

## Create mini dataset

In [5]:
# Create a mini data set for feature and model selection (for manageable training times)
__, X_mini, ___, y_mini = train_test_split(X_train, y_train, test_size = 0.01, random_state = 42)
print(len(X_mini))

20136


# Word 2 Vec using mini dataset

In [6]:
# Create corpus of sentences from mini
sentence_corpus_mini = []
for review in X_mini:
    words = review.split("', '")
    words[0] = words[0][2:]
    words[-1] = words[-1][:-2]
    sentence_corpus_mini.append(' '.join(words))

# Create tokenized corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in sentence_corpus_mini]

In [7]:
# Create word 2 vec model 
feature_size = 100
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, window=5, min_count=10, workers=4)
w2v_dictionary = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))

  after removing the cwd from sys.path.


In [8]:
# Define functions to create a feature array
def average_word_vectors(words, model, vocabulary, num_features):   
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.   
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector
    
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [9]:
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
w2v_mini = pd.DataFrame(w2v_feature_array)
print(w2v_mini.shape)

  


(20136, 100)


In [17]:
w2v_mini_train, w2v_mini_test, y_mini_train, y_mini_test = train_test_split(w2v_mini, y_mini, test_size = 0.3, random_state = 42)
print(len(w2v_mini_train))

14095


# Model selection using mini dataset

Using the mini dataset, a variety of models will be trained on a variety of feature sets to identify promising candidates. The promising combinations will then be tuned in the following section and trained on the full training data set. 

It should be noted that to assess model performance, the classification accuracy will be the primary metric. 
A Confusion matrix will be created using the best performing parameters from the cross validation.

In [10]:
# Define model tuning
def cross_validation_tuning(classifier, param_grid, X_trn, y_trn):
    classifier_cv = GridSearchCV(classifier, param_grid, n_jobs=4, cv=3)
    classifier_cv.fit(X_trn, y_trn)
    # Print the optimal parameters and best score
    print("Tuned Classifier Parameters: {}".format(classifier_cv.best_params_))
    print("Tuned Classifier Accuracy: {:.3f}".format(classifier_cv.best_score_))
    # Predict the labels
    pred = classifier_cv.predict(X_trn)
    # Compute accuracy
    score = metrics.accuracy_score(y_trn, pred)
    # Calculate and print the confusion matrix
    cm = metrics.confusion_matrix(y_trn, pred, labels=[1,2,3,4,5])
    print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
    print(cm)
    return classifier_cv

## Define models

In [26]:
logreg_model_1 = LogisticRegression(C= 1, penalty= 'l1')
logreg_model_1.fit(w2v_mini_train, y_mini_train)
w2v_logreg_pred = logreg_model_1.predict(w2v_mini_test)
w2v_logreg_score = metrics.accuracy_score(y_mini_test, w2v_logreg_pred)
w2v_logreg_cm = metrics.confusion_matrix(y_mini_test, w2v_logreg_pred, labels=[1,2,3,4,5])
print(w2v_logreg_score)
print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
print(w2v_logreg_cm)

0.541135573581
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 443   46   41   82   65]
 [ 190   68  118  164   77]
 [ 101   50  194  412  126]
 [  47   19   79  830  691]
 [  46    7   19  392 1734]]


In [29]:
sgd_model_1 = SGDClassifier(random_state= 42, n_jobs=4, max_iter=4, l1_ratio= 0.3, penalty ='elasticnet')
sgd_model_1.fit(w2v_mini_train, y_mini_train)
w2v_sgd_pred = sgd_model_1.predict(w2v_mini_test)
w2v_sgd_score = metrics.accuracy_score(y_mini_test, w2v_sgd_pred)
w2v_sgd_cm = metrics.confusion_matrix(y_mini_test, w2v_sgd_pred, labels=[1,2,3,4,5])
print(w2v_sgd_score)
print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
print(w2v_sgd_cm)

0.429233570601
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[404 140  82  38  13]
 [178 175 171  86   7]
 [ 87 197 283 296  20]
 [ 71 175 299 951 170]
 [102 179 179 958 780]]


## Test model 

In [37]:
logreg_param_grid = {'C': [0.0001, 1, 100], 'penalty': ['l1', 'l2']}
logreg_model_1 = LogisticRegression()
logreg_w2v_cv = GridSearchCV(logreg_model_1, logreg_param_grid, cv=3)
logreg_w2v_cv.fit(w2v_mini_train, y_mini_train)
print("Tuned Classifier Parameters: {}".format(logreg_w2v_cv.best_params_))
print("Tuned Classifier Accuracy: {:.3f}".format(logreg_w2v_cv.best_score_))
logreg_w2v_pred = logreg_w2v_cv.predict(w2v_mini_test)
logreg_w2v_score = metrics.accuracy_score(y_mini_test, logreg_w2v_pred)
logreg_w2v_cm = metrics.confusion_matrix(y_mini_test, logreg_w2v_pred, labels=[1,2,3,4,5])
print(logreg_w2v_score)
print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
print(logreg_w2v_cm)

Tuned Classifier Parameters: {'C': 100, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.552
0.540804502566
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 436   56   44   71   70]
 [ 190   83  112  157   75]
 [  90   62  205  402  124]
 [  50   21   87  814  694]
 [  44    8   21  396 1729]]


In [36]:
sgd_param_grid = {"penalty": ['l1', 'l2', 'elasticnet'],
                  "l1_ratio": [0.1, 0.3, 0.5] }
sgd_model_1 = SGDClassifier(random_state= 42, max_iter=4)
sgd_w2v_cv = GridSearchCV(sgd_model_1, sgd_param_grid, cv=3)
sgd_w2v_cv.fit(w2v_mini_train, y_mini_train)
print("Tuned Classifier Parameters: {}".format(sgd_w2v_cv.best_params_))
print("Tuned Classifier Accuracy: {:.3f}".format(sgd_w2v_cv.best_score_))
sgd_w2v_pred = sgd_w2v_cv.predict(w2v_mini_test)
sgd_w2v_score = metrics.accuracy_score(y_mini_test, sgd_w2v_pred)
sgd_w2v_cm = metrics.confusion_matrix(y_mini_test, sgd_w2v_pred, labels=[1,2,3,4,5])
print(sgd_w2v_score)
print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
print(sgd_w2v_cm)

Tuned Classifier Parameters: {'l1_ratio': 0.1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.457
0.429564641616
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[322 211  87  45  12]
 [124 253 130 106   4]
 [ 45 256 224 341  17]
 [ 33 224 269 973 167]
 [ 42 189 188 956 823]]
