# Import dependencies and determine working directory

In [2]:
# Import libraries
import os
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Import topic model 
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Get stop words 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Import NLP vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import word2vec
import gensim

# Import models 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


Using TensorFlow backend.


In [3]:
# Working directory used as the root for data paths.
# (The parent of the absolute path of the literal name '__file__' is the current working directory.)
dir = os.getcwd()

# Define data sets

## Load pre-processed data

In [4]:
# Read the pre-processed reviews from csv. Per the preprocessing step, the text is
# lower-cased, tokenized into a list of strings, stripped of punctuation and lemmatized.
preprocessed_path = os.path.join(
    dir,
    '02_processed_data',
    'review_text_stars.csv',
)
preprocessed_df = pd.read_csv(preprocessed_path, index_col=False)

# Summarize columns, dtypes and memory usage
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 43.9+ MB


## Split train and test data

In [5]:
# Hold out 30% of the reviews as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_df['processed_review'],
    preprocessed_df['stars_review'],
    test_size=0.3,
    random_state=42,
)

In [6]:
# Number of reviews in the training set, then in the test set (one per line)
print(len(X_train), len(X_test), sep='\n')

2013556
862953


# Feature engineering on full dataset

## Update stop words

In [7]:
# Start from NLTK's English stop words
stopWords = set(stopwords.words('english'))

# Add neutral words related to restaurants to the stop words
stopWords |= {'restaurant', 'place', 'bar', 'service', 'food', 'lunch', 'breakfast', 'dinner', 'price', 'order', 'ordered'}

# Drop stop words that might reflect sentiment so they stay available as features.
# NOTE: 'price' is in both lists, so it is added above and removed again here (net effect: not a stop word).
stopWords = list(stopWords - {'above', 'not', 'below', 't', 'off', 'no', 'again', 'against', 'under', 'hadn', 'up', 'shan', 'more', 'hasn', 'won','couldn', 'wasn', 'mustn', 'out', 'don','down', 'haven', 'price', 'mightn', 'isn', 'wouldn', 'needn', 'shouldn', 'weren', 'aren', 'didn', 'ain', 'doesn'})

## Vectorize text using unigrams, bigrams and trigrams

In [None]:
# Initialize a bag-of-words vectorizer using unigrams, bigrams and trigrams and the customized stop words.
# max_df / min_df filter terms by document frequency (upper bound 0.95, lower bound 0.001).
count_vectorizer_full = CountVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Learn the vocabulary from the training data and transform it (independent variables)
count_full = count_vectorizer_full.fit_transform(X_train)

# Print the number of features of the count_vectorizer.
# The fitted vocabulary_ mapping is used because get_feature_names() was removed in
# scikit-learn 1.2, while vocabulary_ is available in every version.
print( len(count_vectorizer_full.vocabulary_) )

8362


In [None]:
# Initialize a TF-IDF vectorizer using unigrams, bigrams and trigrams and the customized stop words.
# max_df / min_df filter terms by document frequency (upper bound 0.95, lower bound 0.001).
tfidf_vectorizer_full = TfidfVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Learn the vocabulary and idf weights from the training data and transform it (independent variables)
tfidf_full = tfidf_vectorizer_full.fit_transform(X_train)

# Print the number of features of the tfidf_vectorizer.
# The fitted vocabulary_ mapping is used because get_feature_names() was removed in
# scikit-learn 1.2, while vocabulary_ is available in every version.
print( len(tfidf_vectorizer_full.vocabulary_) )

# Model using full dataset

In this section, the candidate models are tuned and trained on feature sets built from the full training data set, following the same approach used on the mini dataset to identify promising combinations of models and feature sets.

Classification accuracy is the primary metric used to assess model performance. 
A confusion matrix is created using the best-performing parameters from cross-validation.

## Define logistic regression model

In [None]:
# Define model tuning
def cross_validation_tuning(classifier, param_grid, X_trn, y_trn):
    """Grid-search a classifier with 3-fold cross validation and report the results.

    Prints the best parameters, the best cross-validation score, the accuracy on
    the training data and a confusion matrix for the star ratings 1-5.

    Parameters
    ----------
    classifier : estimator passed to GridSearchCV.
    param_grid : dict of parameter names to candidate values.
    X_trn : training features.
    y_trn : training labels.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    classifier_cv = GridSearchCV(classifier, param_grid, cv=3)
    classifier_cv.fit(X_trn, y_trn)
    # Print the optimal parameters and best score
    print("Tuned Classifier Parameters: {}".format(classifier_cv.best_params_))
    print("Tuned Classifier Accuracy: {:.3f}".format(classifier_cv.best_score_))
    # Predict the labels of the training data itself, so the figures below
    # describe fit on data the model has already seen, not held-out performance
    pred = classifier_cv.predict(X_trn)
    # Compute and report the training accuracy (previously computed but discarded)
    score = metrics.accuracy_score(y_trn, pred)
    print("Training Set Accuracy: {:.3f}".format(score))
    # Calculate and print the confusion matrix
    cm = metrics.confusion_matrix(y_trn, pred, labels=[1,2,3,4,5])
    print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
    print(cm)
    return classifier_cv

In [None]:
# Define Logistic regression model
def logreg_model(X_trn, y_trn):
    """Tune a logistic regression classifier over C and the penalty type.

    Parameters
    ----------
    X_trn : training features.
    y_trn : training labels.

    Returns
    -------
    The fitted grid-search object returned by cross_validation_tuning.
    """
    # Create parameters
    param_grid = {'C': [0.0001, 0.01, 1, 100, 10000], 'penalty': ['l1', 'l2']} 
    # The grid contains both 'l1' and 'l2'. The solver is pinned to 'liblinear', which
    # supports both penalties; the default solver in newer scikit-learn ('lbfgs') rejects 'l1'.
    logreg_classifier = LogisticRegression(solver='liblinear')
    tuned_logreg_classifier = cross_validation_tuning(logreg_classifier, param_grid, X_trn, y_trn)
    return tuned_logreg_classifier

## Define neural network

In [None]:
# Define neural network architecture
def construct_dnn(input_shape):
    """Build and compile a feed-forward classifier with five softmax outputs.

    Two 512-unit ReLU layers, each followed by 30% dropout, feed a 5-unit
    softmax layer. The model is compiled with the adam optimizer,
    categorical cross-entropy loss and accuracy as the tracked metric.

    Parameters
    ----------
    input_shape : shape tuple passed to the first Dense layer.

    Returns
    -------
    The compiled Sequential model.
    """
    layers = [
        Dense(512, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Dense(512, activation='relu'),
        Dropout(0.3),
        Dense(5, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Build model
def dnn_model(X_trn, y_trn):
    """Construct the neural network and fit it on the given training data.

    The labels are one-hot encoded with pd.get_dummies. Training runs for up
    to 30 epochs with 30% of the data used for validation and an
    EarlyStopping callback with a patience of 2.

    Parameters
    ----------
    X_trn : training features; the number of columns sets the input shape.
    y_trn : training labels.

    Returns
    -------
    (model, history) : the fitted model and the object returned by fit().
    """
    # One input unit per feature column
    network = construct_dnn((X_trn.shape[1],))

    # One-hot encode the labels to match the categorical cross-entropy loss
    one_hot_targets = pd.get_dummies(y_trn)

    stop_early = EarlyStopping(patience=2)
    fit_history = network.fit(
        X_trn,
        one_hot_targets,
        epochs=30,
        validation_split=0.3,
        callbacks=[stop_early],
    )
    return network, fit_history

## Tune hyper parameters

In [None]:
# Calculate baseline
# The baseline always predicts a 5 star rating (taken to be the most common class of data),
# so its accuracy is the share of 5 star reviews in the training labels.
length = len(y_train)
correct_pred = int((y_train == 5).sum())
baseline_accuracy = correct_pred / length
print(baseline_accuracy)

In [None]:
# Feature matrices to compare, keyed by the vectorizer that produced them
feature_sets = dict(count=count_full, tfidf=tfidf_full)

In [None]:
# define test for feature sets
def test_features(model, sets):
    results = defaultdict(float)
    for key, x_values_mini in sets.items():
        print(key)
        model_instance = model(x_values_mini, y_mini)
        results[key] = model_instance.best_score_
        print('')
    print('--------------------------')
    print(results)
    return results

In [None]:
# define deep neural net tests for feature sets
def dnn_test_results(sets, y=None):
    """Train the neural network on every feature set and collect the best validation accuracy.

    Parameters
    ----------
    sets : dict mapping a feature-set name to its feature matrix.
    y : labels matching the rows of every feature matrix. Defaults to the
        notebook's ``y_train``, because the feature sets are built from
        ``X_train``. (The previous code referenced ``y_mini``, which is not
        defined in this notebook.)

    Returns
    -------
    defaultdict mapping each feature-set name to the maximum validation
    accuracy recorded during training.
    """
    if y is None:
        # Resolved at call time so the notebook-level split is picked up
        y = y_train
    results = defaultdict(float)
    for key, x_values in sets.items():
        print(key)
        model_instance, history = dnn_model(x_values, y)
        recorded = history.history
        # Older Keras records the metric as 'val_acc', newer versions as 'val_accuracy'
        val_key = 'val_acc' if 'val_acc' in recorded else 'val_accuracy'
        results[key] = max(recorded[val_key])
        print('')
    print('--------------------------')
    print(results)
    return results

In [None]:
# run tuning for logistic regression
# Calls logreg_model on every entry of feature_sets and keeps the best
# cross-validation score per feature set.
logreg_mini_results = test_features(logreg_model, feature_sets)

In [None]:
# run deep neural nets
# Trains the network on every entry of feature_sets and keeps the highest
# validation accuracy per feature set.
dnn_mini_results = dnn_test_results(feature_sets)