# Import dependencies and determine working directory

In [36]:
# Import libraries
import os
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib

# Get stop words 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Import NLP vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import models 
from sklearn.linear_model import LogisticRegression
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chriskhoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Base directory for all data paths. A notebook has no __file__, so paths are
# anchored at the current working directory.
# NOTE: the name `dir` shadows the builtin; it is kept because later cells use it.
dir = os.getcwd()

# Define data sets

## Load pre-processed data

In [3]:
# Load the pre-processed reviews from CSV. Per the author's note the text was already
# lower-cased, tokenized, stripped of punctuation and lemmatized upstream
# (that step is not visible in this notebook).
# index_col=False tells pandas not to use the first column as the index.
preprocessed_path = os.path.join(dir, '02_processed_data','review_text_stars.csv')
preprocessed_df = pd.read_csv(preprocessed_path, index_col = False)
# Print column names, dtypes and memory usage as a quick sanity check
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 43.9+ MB


## Split train and test data

In [4]:
# Hold out 30% of the reviews for testing; the seed is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_df['processed_review'],
    preprocessed_df['stars_review'],
    test_size=0.3,
    random_state=42,
)

In [5]:
# Report the number of training and test reviews, one per line
print(len(X_train), len(X_test), sep='\n')

2013556
862953


# Feature engineering on full dataset

## Update stop words

In [6]:
# Start from NLTK's English stop words and add restaurant-domain words that are
# treated as neutral
stopWords = set(stopwords.words('english')) | {
    'restaurant', 'place', 'bar', 'service', 'food', 'lunch', 'breakfast',
    'dinner', 'price', 'order', 'ordered',
}

# Take out words that might reflect sentiment (negations, directions, ...) so they
# stay in the text. The result is a list, as expected by the vectorizers below.
# NOTE(review): 'price' is added above and removed again here, so it ends up NOT
# being a stop word -- confirm which of the two was intended.
stopWords = list(stopWords - {
    'above', 'not', 'below', 't', 'off', 'no', 'again', 'against', 'under',
    'hadn', 'up', 'shan', 'more', 'hasn', 'won', 'couldn', 'wasn', 'mustn',
    'out', 'don', 'down', 'haven', 'price', 'mightn', 'isn', 'wouldn', 'needn',
    'shouldn', 'weren', 'aren', 'didn', 'ain', 'doesn',
})

## Vectorize text using unigrams, bigrams and trigrams

In [7]:
# Bag-of-words vectorizer over unigrams, bigrams and trigrams with the customized
# stop words; max_df / min_df bound the document frequency of the terms kept.
count_vectorizer_full = CountVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Learn the vocabulary from the training text only and transform it
count_train_full = count_vectorizer_full.fit_transform(X_train)

# Print the number of features of the count_vectorizer.
# len(vocabulary_) is used instead of get_feature_names(), which was removed in
# scikit-learn 1.2; the fitted vocabulary_ mapping has exactly one entry per feature.
print( len(count_vectorizer_full.vocabulary_) )

# Save sparse matrix 
filename_out__count_train = os.path.join(dir, '02_processed_data','count_train.pkl')
joblib.dump(count_train_full, filename_out__count_train) 

8362


In [10]:
# Persist the fitted count vectorizer next to the processed data so the same
# vocabulary can be reloaded later without refitting
filename_out__count_vectorizer_full = os.path.join(dir, '02_processed_data','count_vectorizer_full.pkl')
joblib.dump(count_vectorizer_full, filename_out__count_vectorizer_full) 

['/Users/chriskhoo/Documents/SpringBoard/Springboard_Capstone1/02_processed_data/count_vectorizer_full.pkl']

In [15]:
# Transform the test data (independent variables) with the vocabulary learned on
# the training set -- transform() only, so nothing is refitted on test text
count_test_full = count_vectorizer_full.transform(X_test)

# Save the resulting sparse test matrix alongside the training matrix
filename_out__count_test = os.path.join(dir, '02_processed_data','count_test.pkl')
joblib.dump(count_test_full, filename_out__count_test) 

['/Users/chriskhoo/Documents/SpringBoard/Springboard_Capstone1/02_processed_data/count_test.pkl']

In [7]:
# TF-IDF vectorizer over unigrams, bigrams and trigrams with the customized
# stop words; max_df / min_df bound the document frequency of the terms kept.
tfidf_vectorizer_full = TfidfVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Learn the vocabulary and IDF weights from the training text only and transform it
tfidf_train_full = tfidf_vectorizer_full.fit_transform(X_train)

# Print the number of features of the tfidf_vectorizer.
# len(vocabulary_) is used instead of get_feature_names(), which was removed in
# scikit-learn 1.2; the fitted vocabulary_ mapping has exactly one entry per feature.
print( len(tfidf_vectorizer_full.vocabulary_) )

# Save sparse matrix 
filename_out__tfidf_train = os.path.join(dir, '02_processed_data','tfidf_train.pkl')
joblib.dump(tfidf_train_full, filename_out__tfidf_train) 

8362


['/Users/chriskhoo/Documents/SpringBoard/Springboard_Capstone1/02_processed_data/tfidf_train.pkl']

In [8]:
# Persist the fitted TF-IDF vectorizer so it can be reloaded later without refitting
filename_out__tfidf_vectorizer_full = os.path.join(dir, '02_processed_data','tfidf_vectorizer_full.pkl')
joblib.dump(tfidf_vectorizer_full, filename_out__tfidf_vectorizer_full) 

# Transform the test data (independent variables) with the vectorizer fitted on
# the training set -- transform() only, so nothing is refitted on test text
tfidf_test_full = tfidf_vectorizer_full.transform(X_test)

# Save the resulting sparse test matrix alongside the training matrix
filename_out__tfidf_test = os.path.join(dir, '02_processed_data','tfidf_test.pkl')
joblib.dump(tfidf_test_full, filename_out__tfidf_test) 

['/Users/chriskhoo/Documents/SpringBoard/Springboard_Capstone1/02_processed_data/tfidf_test.pkl']

# Model using full dataset

Using the full dataset, a variety of models will be trained on a variety of feature sets to identify promising candidates. The promising combinations will then be tuned further in the following section, again using the full training data set.

Classification accuracy is the primary metric used to assess model performance.
A confusion matrix is also produced using the best-performing parameters found by cross-validation.

## Define logistic regression model

In [9]:
# Define model tuning
def cross_validation_tuning(classifier, param_grid, X_trn, y_trn):
    """Grid-search a classifier with 3-fold cross-validation and report the results.

    Prints the best parameter combination, its cross-validated score and a
    confusion matrix. The confusion matrix is built from predictions on X_trn,
    i.e. the same data the search was fitted on, so it describes the fit on the
    training data rather than held-out performance.

    Parameters
    ----------
    classifier : estimator handed to GridSearchCV
    param_grid : dict mapping parameter names to lists of candidate values
    X_trn : training features
    y_trn : training labels; the confusion matrix uses the labels 1 to 5

    Returns
    -------
    The fitted GridSearchCV object.
    """
    classifier_cv = GridSearchCV(classifier, param_grid, cv=3)
    classifier_cv.fit(X_trn, y_trn)
    # Print the optimal parameters and best score
    print("Tuned Classifier Parameters: {}".format(classifier_cv.best_params_))
    print("Tuned Classifier Accuracy: {:.3f}".format(classifier_cv.best_score_))
    # Predict the labels with the refitted best estimator
    pred = classifier_cv.predict(X_trn)
    # Calculate and print the confusion matrix
    cm = metrics.confusion_matrix(y_trn, pred, labels=[1,2,3,4,5])
    print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
    print(cm)
    return classifier_cv

In [10]:
# Define Logistic regression model
def logreg_model(X_trn, y_trn):
    """Tune a logistic regression over C and the L1/L2 penalty via cross-validation.

    The solver is pinned to 'liblinear' because the grid contains the 'l1'
    penalty: liblinear supports both penalties, whereas the default solver of
    scikit-learn >= 0.22 ('lbfgs') rejects 'l1'. liblinear was the default in
    older releases, so results there are unchanged.

    Parameters
    ----------
    X_trn : training features
    y_trn : training labels

    Returns
    -------
    The fitted search object returned by cross_validation_tuning.
    """
    # Candidate regularisation strengths and penalty types
    param_grid = {'C': [0.0001, 0.01, 1, 100, 10000], 'penalty': ['l1', 'l2']}
    logreg_classifier = LogisticRegression(solver='liblinear')
    tuned_logreg_classifier = cross_validation_tuning(logreg_classifier, param_grid, X_trn, y_trn)
    return tuned_logreg_classifier

## Define neural network

In [20]:
# Define neural network architecture
def construct_architecture_1(input_shape):
    """Build and compile a network with two 512-unit ReLU hidden layers.

    Each hidden layer is followed by dropout of 0.3 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(512, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Dense(512, activation='relu'),
        Dropout(0.3),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

def construct_architecture_2(input_shape):
    """Build and compile a network with two 256-unit ReLU hidden layers.

    Each hidden layer is followed by dropout of 0.3 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(256, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

def construct_architecture_3(input_shape):
    """Build and compile a network with three 512-unit ReLU hidden layers.

    The hidden layers are followed by dropout of 0.3, 0.3 and 0.2 respectively
    and the output layer is a 5-way softmax. `input_shape` is forwarded to the
    first Dense layer. The model is compiled with the 'adam' optimizer and
    categorical cross-entropy loss, tracking accuracy.
    """
    layers = [
        Dense(512, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Dense(512, activation='relu'),
        Dropout(0.3),
        Dense(512, activation='relu'),
        Dropout(0.2),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Build model
def dnn_model_variable(X_trn, y_trn, architecture):
    """Build a network with `architecture` and fit it on the given training data.

    The input shape is derived from the number of columns of X_trn and the
    labels are one-hot encoded with pd.get_dummies. Fitting runs for at most
    30 epochs with validation_split=0.2 and an early-stopping callback that
    monitors 'val_acc' with a patience of 2.

    Parameters
    ----------
    X_trn : feature matrix exposing .shape
    y_trn : labels accepted by pd.get_dummies
    architecture : callable taking an input-shape tuple and returning a compiled model

    Returns
    -------
    (model, history) : the fitted model and the object returned by model.fit
    """
    network = architecture((X_trn.shape[1],))
    one_hot_labels = pd.get_dummies(y_trn)
    stopper = EarlyStopping(monitor='val_acc', patience=2)
    fit_history = network.fit(
        X_trn,
        one_hot_labels,
        epochs=30,
        validation_split=0.2,
        callbacks=[stopper],
    )
    return network, fit_history

## Tune hyper parameters

In [19]:
# Baseline accuracy: the share of training reviews that a constant
# "always 5 stars" prediction would get right (5 is taken to be the most common class).
correct_pred = int((y_train == 5).sum())
length = len(y_train)
baseline_accuracy = correct_pred / length
print(baseline_accuracy)

0.37066364183563805


In [21]:
# Feature sets to evaluate, keyed by name; here only the bag-of-words training matrix
count_set = dict(count=count_train_full)

In [12]:
 # Feature sets to evaluate, keyed by name; here only the TF-IDF training matrix.
 # NOTE(review): this cell is indented by one space. It ran in the notebook, but a
 # plain .py export would raise an IndentationError -- consider removing the indent.
 tfidf_set = {'tfidf': tfidf_train_full }

In [14]:
# define test for feature sets
def test_features(model, sets):
    results = defaultdict(float)
    for key, x_values in sets.items():
        print(key)
        model_instance = model(x_values, y_train)
        results[key] = model_instance.best_score_
        print('')
    print('--------------------------')
    print(results)
    return results

In [15]:
# define deep neural net tests for feature sets
def dnn_test_results_variable(sets, architecture):
    """Train one network per feature set and collect the best validation accuracy.

    Parameters
    ----------
    sets : dict mapping a feature-set name to its feature matrix
    architecture : model-building callable forwarded to dnn_model_variable

    Returns
    -------
    defaultdict(float) mapping each feature-set name to the maximum validation
    accuracy recorded during training. The global `y_train` is used as labels.
    The name of each set and the final mapping are also printed.
    """
    results = defaultdict(float)
    for key, x_values in sets.items():
        print(key)
        _, history = dnn_model_variable(x_values, y_train, architecture)
        # Keras before 2.3 logs validation accuracy as 'val_acc'; from 2.3 on the
        # key follows the metric name given at compile time, i.e. 'val_accuracy'.
        val_accuracy = history.history.get('val_acc')
        if val_accuracy is None:
            val_accuracy = history.history['val_accuracy']
        results[key] = max(val_accuracy)
        print('')
    print('--------------------------')
    print(results)
    return results

In [30]:
# Train architecture 1 (two 512-unit ReLU layers, dropout 0.3) on the bag-of-words
# counts and record the best validation accuracy
dnn_count_results = dnn_test_results_variable(count_set, construct_architecture_1)

count
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

--------------------------
defaultdict(<class 'float'>, {'count': 0.65203917439699233})


In [37]:
# Train architecture 2 (two 256-unit ReLU layers, dropout 0.3) on the bag-of-words
# counts and record the best validation accuracy
dnn_count_results_2 = dnn_test_results_variable(count_set, construct_architecture_2)

count
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

--------------------------
defaultdict(<class 'float'>, {'count': 0.65173622837223233})


In [None]:
# Train architecture 3 (three 512-unit ReLU layers, dropout 0.3/0.3/0.2) on the
# bag-of-words counts and record the best validation accuracy
dnn_count_results_3 = dnn_test_results_variable(count_set, construct_architecture_3)

count
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30

--------------------------
defaultdict(<class 'float'>, {'count': 0.65032579113491318})


In [31]:
# Grid-search logistic regression (C and L1/L2 penalty, 3-fold CV) on the
# bag-of-words counts and record the best cross-validated score
logreg_count_results = test_features(logreg_model, count_set) 

count
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.639
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[179779  22872   6896   5494   8328]
 [ 54001  72584  42502  17940   9926]
 [ 13734  26406 117801  99450  28616]
 [  3080   2991  30901 293099 230804]
 [  1854    647   3288  97368 643195]]

--------------------------
defaultdict(<class 'float'>, {'count': 0.6393718376841766})


In [16]:
# Train architecture 1 (two 512-unit ReLU layers, dropout 0.3) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_1 = dnn_test_results_variable(tfidf_set, construct_architecture_1)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.65302995689227039})


In [17]:
# Train architecture 2 (two 256-unit ReLU layers, dropout 0.3) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_2 = dnn_test_results_variable(tfidf_set, construct_architecture_2)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.65541627763761701})


In [18]:
# Train architecture 3 (three 512-unit ReLU layers, dropout 0.3/0.3/0.2) on the
# TF-IDF features and record the best validation accuracy
dnn_tfidf_results_3 = dnn_test_results_variable(tfidf_set, construct_architecture_3)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.65428643795128627})


In [19]:
# Grid-search logistic regression (C and L1/L2 penalty, 3-fold CV) on the TF-IDF
# features and record the best cross-validated score
logreg_tfidf_results = test_features(logreg_model, tfidf_set)

tfidf
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.646
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[179655  26933   7467   4541   4773]
 [ 52441  76430  46032  15717   6333]
 [ 14106  27596 124452  99173  20680]
 [  3705   3392  34226 314541 205011]
 [  2544    851   4115 115232 623610]]

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.6464766810558038})


## Further tuning  

Based on the initial training on the full dataset, the best results came from DNNs trained on the TF-IDF n-gram feature set. The best-performing DNN architecture was the simplest of the three (2 layers, each with 256 nodes).

For this reason, further tuning will be performed on DNN architectures using the TFIDF n-gram feature set. 

In [23]:
def construct_architecture_4(input_shape):
    """Build and compile a network with two 256-unit tanh hidden layers.

    Each hidden layer is followed by dropout of 0.3 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(256, activation='tanh', input_shape=input_shape),
        Dropout(0.3),
        Dense(256, activation='tanh'),
        Dropout(0.3),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [24]:
def construct_architecture_5(input_shape):
    """Build and compile a network with two 256-unit sigmoid hidden layers.

    Each hidden layer is followed by dropout of 0.3 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(256, activation='sigmoid', input_shape=input_shape),
        Dropout(0.3),
        Dense(256, activation='sigmoid'),
        Dropout(0.3),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [25]:
def construct_architecture_6(input_shape):
    """Build and compile a network with two 256-unit ReLU hidden layers.

    Each hidden layer is followed by dropout of 0.2 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(256, activation='relu', input_shape=input_shape),
        Dropout(0.2),
        Dense(256, activation='relu'),
        Dropout(0.2),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [26]:
def construct_architecture_7(input_shape):
    """Build and compile a network with two 128-unit ReLU hidden layers.

    Each hidden layer is followed by dropout of 0.3 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(128, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [30]:
def construct_architecture_8(input_shape):
    """Build and compile a network with two 128-unit sigmoid hidden layers.

    Each hidden layer is followed by dropout of 0.3 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(128, activation='sigmoid', input_shape=input_shape),
        Dropout(0.3),
        Dense(128, activation='sigmoid'),
        Dropout(0.3),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [32]:
def construct_architecture_9(input_shape):
    """Build and compile a network with two 128-unit sigmoid hidden layers.

    Each hidden layer is followed by dropout of 0.4 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(128, activation='sigmoid', input_shape=input_shape),
        Dropout(0.4),
        Dense(128, activation='sigmoid'),
        Dropout(0.4),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [33]:
def construct_architecture_10(input_shape):
    """Build and compile a network with two 256-unit sigmoid hidden layers.

    Each hidden layer is followed by dropout of 0.4 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. The
    model is compiled with the 'adam' optimizer and categorical cross-entropy
    loss, tracking accuracy.
    """
    layers = [
        Dense(256, activation='sigmoid', input_shape=input_shape),
        Dropout(0.4),
        Dense(256, activation='sigmoid'),
        Dropout(0.4),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [37]:
def construct_architecture_11(input_shape):
    """Build a network with two 256-unit sigmoid hidden layers, compiled with SGD.

    Each hidden layer is followed by dropout of 0.4 and the output layer is a
    5-way softmax. `input_shape` is forwarded to the first Dense layer. Unlike
    the other architectures in this notebook, the optimizer is SGD with
    lr=0.01, decay=1e-6, momentum=0.9 and Nesterov momentum enabled; the loss
    is categorical cross-entropy and accuracy is tracked.
    """
    network = Sequential([
        Dense(256, activation='sigmoid', input_shape=input_shape),
        Dropout(0.4),
        Dense(256, activation='sigmoid'),
        Dropout(0.4),
        Dense(5, activation='softmax'),
    ])
    optimizer = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    network.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network

In [27]:
# Train architecture 4 (two 256-unit tanh layers, dropout 0.3) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_4 = dnn_test_results_variable(tfidf_set, construct_architecture_4)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.64959325771211585})


In [28]:
# Train architecture 5 (two 256-unit sigmoid layers, dropout 0.3) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_5 = dnn_test_results_variable(tfidf_set, construct_architecture_5)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.66003992928007715})


In [29]:
# Train architecture 6 (two 256-unit ReLU layers, dropout 0.2) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_6 = dnn_test_results_variable(tfidf_set, construct_architecture_6)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.65372275968871452})


In [21]:
# Train architecture 7 (two 128-unit ReLU layers, dropout 0.3) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_7 = dnn_test_results_variable(tfidf_set, construct_architecture_7)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.65564472873974056})


In [31]:
# Train architecture 8 (two 128-unit sigmoid layers, dropout 0.3) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_8 = dnn_test_results_variable(tfidf_set, construct_architecture_8)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.65998778283298842})


In [34]:
# Train architecture 9 (two 128-unit sigmoid layers, dropout 0.4) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_9 = dnn_test_results_variable(tfidf_set, construct_architecture_9)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.65914350702179614})


In [35]:
# Train architecture 10 (two 256-unit sigmoid layers, dropout 0.4) on the TF-IDF
# features and record the best validation accuracy
dnn_tfidf_results_10 = dnn_test_results_variable(tfidf_set, construct_architecture_10)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.6602658972162776})


In [38]:
# Train architecture 11 (two 256-unit sigmoid layers, dropout 0.4, SGD with
# Nesterov momentum instead of Adam) on the TF-IDF features and record the best
# validation accuracy
dnn_tfidf_results_11 = dnn_test_results_variable(tfidf_set, construct_architecture_11)

tfidf
Train on 1610844 samples, validate on 402712 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

--------------------------
defaultdict(<class 'float'>, {'tfidf': 0.63405858280974203})
