# Import dependencies and determine working directory

In [7]:
# Import libraries
import os
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Import topic model 
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Get stop words 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Import NLP vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import word2vec
import gensim

# Import models 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [2]:
# get current directory
dir = os.path.dirname(os.path.abspath('__file__'))

# Define data sets

## Load pre-processed data

In [3]:
# Load df from a csv - all text to lower case, tokenize into list of strings, remove punctuation and lemmatize
preprocessed_path = os.path.join(dir, '02_processed_data','review_text_stars.csv')
preprocessed_df = pd.read_csv(preprocessed_path, index_col = False)
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 43.9+ MB


## Split train and test data

In [4]:
# Create training and test sets using a fixed seed for reproducibility 
X_train, X_test, y_train, y_test = train_test_split(preprocessed_df.processed_review, preprocessed_df.stars_review, test_size = 0.3, random_state = 42)

## Create mini dataset

In [5]:
# Create a mini data set for feature and model selection (for manageable training times)
__, X_mini, ___, y_mini = train_test_split(X_train, y_train, test_size = 0.01, random_state = 42)
print(len(X_mini))

20136


# Feature selection using mini dataset
Using the mini dataset, various types of feature engineering will be performed and tested on a variety of models in the next stage. 

## Count vectorizer

In [6]:
# Initialize vectorizer using unigrams and remove all standard stopwords 
# Source: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
count_vectorizer_mini = CountVectorizer(analyzer = 'word',
                             stop_words = 'english',
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
count_mini = count_vectorizer_mini.fit_transform(X_mini)

# Print the length of features of the count_vectorizer
print( len(count_vectorizer_mini.get_feature_names()) )

3828


## Tfidf vectorizer (weighted vectorizer)

In [7]:
# Initialize vectorizer using unigrams and remove all standard stopwords 
tfidf_vectorizer_mini = TfidfVectorizer(analyzer = 'word',
                             stop_words = 'english',
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
tfidf_mini = tfidf_vectorizer_mini.fit_transform(X_mini)

# Print the length of features of the tfidf_vectorizer
print( len(tfidf_vectorizer_mini.get_feature_names()) )

3828


## Update stop words

In [8]:
stopWords = set(stopwords.words('english'))

# Add neutral words related to restaurants to list of stop words
stopWords.update(['restaurant', 'place', 'bar', 'service', 'food', 'lunch', 'breakfast', 'dinner', 'price', 'order', 'ordered'])

# Remove stopwords that might reflect sentiment
stopWords = [word for word in stopWords if word not in ['above', 'not', 'below', 't', 'off', 'no', 'again', 'against', 'under', 'hadn', 'up', 'shan', 'more', 'hasn', 'won','couldn', 'wasn', 'mustn', 'out', 'don','down', 'haven', 'price', 'mightn', 'isn', 'wouldn', 'needn', 'shouldn', 'weren', 'aren', 'didn', 'ain', 'doesn']]

## Feature set with new stop words

In [9]:
# Initialize vectorizer using unigrams and customized stopwords 
count_vectorizer_mini__stop = CountVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
count_mini__stop = count_vectorizer_mini__stop.fit_transform(X_mini)

# Print the length of features of the count_vectorizer
print( len(count_vectorizer_mini__stop.get_feature_names()) )

3988


In [10]:
# Initialize vectorizer using unigrams and customized stopwords 
tfidf_vectorizer_mini__stop = TfidfVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
tfidf_mini__stop = tfidf_vectorizer_mini__stop.fit_transform(X_mini)

# Print the length of features of the tfidf_vectorizer
print( len(tfidf_vectorizer_mini__stop.get_feature_names()) )

3988


## Vectorize text using unigrams, bigrams and trigrams

In [11]:
# Initialize vectorizer using unigrams,bigrams and trigrams and customized stopwords 
count_vectorizer_mini__stop_ngram = CountVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
count_mini__stop_ngram = count_vectorizer_mini__stop_ngram.fit_transform(X_mini)

# Print the length of features of the count_vectorizer
print( len(count_vectorizer_mini__stop_ngram.get_feature_names()) )

8437


In [12]:
# Initialize vectorizer using unigrams,bigrams and trigrams and customized stopwords 
tfidf_vectorizer_mini__stop_ngram = TfidfVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
tfidf_mini__stop_ngram = tfidf_vectorizer_mini__stop_ngram.fit_transform(X_mini)

# Print the length of features of the tfidf_vectorizer
print( len(tfidf_vectorizer_mini__stop_ngram.get_feature_names()) )

8437


## Topic modelling 
### Using Latent Dirichlet Allocation (LDA)

In [13]:
# Initialize LDA model
lda = LatentDirichletAllocation(n_components=300,random_state=42) 

# Get topics for training data
lda_mini = lda.fit_transform(count_mini__stop_ngram)

# add topics to count vectorizer ngrams set 
count_mini__stop_ngram_lda = hstack((count_mini__stop_ngram, lda_mini))



In [14]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(lda, tfidf_vectorizer_mini__stop_ngram.get_feature_names(), 10)

Topic 0:
roll beef pho spring egg spring roll soup vietnamese good broth
Topic 1:
buffet dessert group worth course selection large variety went main
Topic 2:
wa server table great wa great server wa party attentive took brought
Topic 3:
made up statement sam pulled pork pit one table didnt much year since liking whats not
Topic 4:
going price reasonable card going back back die city credit forever
Topic 5:
sitting around smothered dish wa almost inside amazing definitely consistent love pizza stuff risk
Topic 6:
ha good well definitely little bland tends go wa not offer ha not though wa take out pretty empty
Topic 7:
bitter taste bud deserves up get back again content chicken enchilada horseradish think better wa literally
Topic 8:
season price good time took fryer flair part town town good part time
Topic 9:
water soda generous product fountain canned bottled portion generous array pepsi
Topic 10:
friday friday saturday fresher no joke really meal quality saturday cool much
Topic 11:

Topic 288:
wa minute wait came said table got took time seated
Topic 289:
steak must medium cooked oyster rare york new york must try new
Topic 290:
feel like sausage feel like authentic pudding 50 sticky fine draft
Topic 291:
beer 10pm mozzarella stick wa good well wa least nutella way sweet back room get star like go
Topic 292:
pretty late party probably fan night stay pretty good able tempe
Topic 293:
bartender no idea issue wa appears seasoned hall cramped wa recommended again again hop
Topic 294:
salad blue cheese personal often chopped blue cheese grand garden keep up
Topic 295:
wa bit like more though even one lot okay better
Topic 296:
ny chinese style ny style tried ok nothing ive az enjoyed
Topic 297:
price location area better downtown not quality half small little
Topic 298:
choice return broccoli chow mixed darn inedible part meal would return curry chicken
Topic 299:
definitely come back decor wa wa mushy mac cheese lip sorry carne asada si limp one star


### Using Non-negative Matrix Factorization (NMF)

In [15]:
# Initialize NMF model
nmf = NMF(n_components=300, random_state=42)

# Get topics for training data
nmf_mini = nmf.fit_transform(tfidf_mini__stop_ngram)

# add topics to tfidf ngrams set 
tfidf_train_mini__stop_ngram_nmf = hstack((tfidf_mini__stop_ngram, nmf_mini))

In [16]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(nmf, tfidf_vectorizer_mini__stop_ngram.get_feature_names(), 10)

Topic 0:
wa wa great wa not wa really wa nice wa amazing wa delicious wa wa great wa thought
Topic 1:
favorite one favorite new favorite favorite spot favorite go wa favorite favorite dish favorite eat favorite pizza favorite sushi
Topic 2:
great wa great great wa wa great wa always great not great great good great experience good great great drink
Topic 3:
pizza pizza wa slice best pizza good pizza topping great pizza pepperoni garlic crust
Topic 4:
burger burger wa best burger bun shake good burger burger joint burger fry joint patty
Topic 5:
chicken chicken wa fried chicken chicken sandwich chicken wing chicken salad jerk tender dry jerk chicken
Topic 6:
good good good not good always good good price good not good great great good pretty good good time
Topic 7:
sushi best sushi sushi wa ayce sashimi sushi chef good sushi japanese tempura great sushi
Topic 8:
de le et un la est pa pour que très
Topic 9:
great great great great price great beer price great great atmosphere love great 

## Word 2 Vec

In [6]:
# Create corpus of sentences from mini
sentence_corpus_mini = []
for review in X_mini:
    words = review.split("', '")
    words[0] = words[0][2:]
    words[-1] = words[-1][:-2]
    sentence_corpus_mini.append(' '.join(words))

# Create tokenized corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in sentence_corpus_mini]

In [9]:
# Define functions to create a feature array
def average_word_vectors(words, model, vocabulary, num_features):   
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.   
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector
    
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [18]:
# Create word 2 vec model 
feature_size = 100
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, window=5, min_count=10, workers=4)
w2v_dictionary = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))

  after removing the cwd from sys.path.


In [20]:
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
w2v_mini = pd.DataFrame(w2v_feature_array)
print(w2v_mini.shape)

  


(20136, 100)


In [21]:
# Create word 2 vec model with 200 features
feature_size_2 = 200
w2v_model_2 = word2vec.Word2Vec(tokenized_corpus, size=feature_size_2, window=5, min_count=10, workers=4)
w2v_dictionary_2 = dict(zip(w2v_model_2.wv.index2word, w2v_model_2.wv.syn0))

  after removing the cwd from sys.path.


In [22]:
w2v_feature_array_2 = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model_2,
                                             num_features=feature_size_2)
w2v_mini_2 = pd.DataFrame(w2v_feature_array_2)
print(w2v_mini_2.shape)

  


(20136, 200)


In [41]:
# Create word 2 vec model with 1000 features
feature_size_3 = 1000
w2v_model_3 = word2vec.Word2Vec(tokenized_corpus, size=feature_size_3, window=5, min_count=10, workers=4)
w2v_dictionary_3 = dict(zip(w2v_model_3.wv.index2word, w2v_model_3.wv.syn0))
w2v_feature_array_3 = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model_3,
                                             num_features=feature_size_3)
w2v_mini_3 = pd.DataFrame(w2v_feature_array_3)
print(w2v_mini_3.shape)

  after removing the cwd from sys.path.
  


(20136, 1000)


## Load Google news word 2 vec 

In [12]:
word2vec_path = os.path.join(dir, '02_processed_data','GoogleNews-vectors-negative300.bin.gz') 
word2vec_google = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [16]:
def get_average_word2vec(tokens_list, vector, num_features):
    if len(tokens_list)<1:
        return np.zeros(num_features)
    else:
        vectorized = [vector[word] if word in vector else np.zeros(num_features) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(corpus, model):
    num_features=300
    features = [get_average_word2vec(tokenized_sentence, model, num_features) for tokenized_sentence in corpus]
    return np.array(features)

In [17]:
w2v_feature_array_google = get_word2vec_embeddings(corpus=tokenized_corpus, model= word2vec_google)
w2v_mini_google = pd.DataFrame(w2v_feature_array_google)
print(w2v_mini_google.shape)

# Model selection using mini dataset

Using the mini dataset, a variety of models will be trained on a variety of feature sets to identify promising candidates. The promising combinations will then be tuned in the following section and trained on the full training data set. 

It should be noted that to assess model performance, the classification accuracy will be the primary metric. 
A Confusion matrix will be created using the best performing parameters from the cross validation.

In [20]:
# Define model tuning
def cross_validation_tuning(classifier, param_grid, X_trn, y_trn):
    classifier_cv = GridSearchCV(classifier, param_grid, cv=3)
    classifier_cv.fit(X_trn, y_trn)
    # Print the optimal parameters and best score
    print("Tuned Classifier Parameters: {}".format(classifier_cv.best_params_))
    print("Tuned Classifier Accuracy: {:.3f}".format(classifier_cv.best_score_))
    # Predict the labels
    pred = classifier_cv.predict(X_trn)
    # Compute accuracy
    score = metrics.accuracy_score(y_trn, pred)
    # Calculate and print the confusion matrix
    cm = metrics.confusion_matrix(y_trn, pred, labels=[1,2,3,4,5])
    print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
    print(cm)
    return classifier_cv

## Define models

In [21]:
# Define Naive_bayes model
def nb_model(X_trn, y_trn):
    # Create parameters
    param_grid = {'alpha': np.arange(0, 1, 0.333)}
    # Iterate over the alphas and print the corresponding score
    nb_classifier = MultinomialNB()
    tuned_nb_classifier = cross_validation_tuning(nb_classifier, param_grid, X_trn, y_trn)
    return tuned_nb_classifier

In [22]:
# Define Logistic regression model
def logreg_model(X_trn, y_trn):
    # Create parameters
    param_grid = {'C': [0.0001, 1, 100], 'penalty': ['l1', 'l2']} #param_grid = {'C': np.logspace(-5, 8, 15), 'penalty': ['l1', 'l2']}
    logreg_classifier = LogisticRegression()
    tuned_logreg_classifier = cross_validation_tuning(logreg_classifier, param_grid, X_trn, y_trn)
    return tuned_logreg_classifier

In [23]:
# Define SVM model
def svm_model(X_trn, y_trn):
    # Create parameters 
    param_grid = {'kernel': ['rbf', 'linear']} #{'kernel': ['rbf', 'linear'], 'C': [1, 10, 100], 'gamma': [0.1, 0.01]} 
    svm_classifier = SVC()
    tuned_svm_classifier = cross_validation_tuning(svm_classifier, param_grid, X_trn, y_trn)
    return tuned_svm_classifier

In [24]:
# Define Random forest model
def ranforest_model(X_trn, y_trn):
    # Create parameters
    param_grid = {"n_estimators": [150, 300, 500],
                  "min_samples_leaf": [5, 10]}
    # param_grid = {"n_estimators": [2, 10, 100, 300, 1000],"max_depth": [2, 10, 100, 300], "min_samples_split": [2, 10, 100],"min_samples_leaf": [1, 10, 100]}
    ranforest_classifier = RandomForestClassifier()
    tuned_ranforest_classifier = cross_validation_tuning(ranforest_classifier, param_grid, X_trn, y_trn)
    return tuned_ranforest_classifier

In [25]:
# Define SGD model 
def sgd_model(X_trn, y_trn):
    # Create parameters
    param_grid = {"penalty": ['l1', 'l2', 'elasticnet'],
                  "l1_ratio": [0.1, 0.3, 0.5] }
    sgd_classifier = SGDClassifier(random_state= 42, max_iter=4)
    tuned_sgd_classifier = cross_validation_tuning(sgd_classifier, param_grid, X_trn, y_trn)
    return tuned_sgd_classifier

In [26]:
# Define XGBoost model 
def xgb_model(X_trn, y_trn):
    # Create parameters
    param_grid = {'min_child_weight': [3],
                 'max_depth': [4]}
    xgb_classifier = XGBClassifier(learning_rate =0.2, seed=42)
    tuned_xgb_classifier = cross_validation_tuning(xgb_classifier, param_grid, X_trn, y_trn)
    return tuned_xgb_classifier

In [27]:
# Define neural network architecture
def construct_dnn(input_shape):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, activation ='relu', input_shape=input_shape ))
    dnn_model.add(Dropout(0.3))
    dnn_model.add(Dense(512, activation ='relu'))
    dnn_model.add(Dropout(0.3))
#     dnn_model.add(Dense(512, activation ='relu'))
#     dnn_model.add(Dropout(0.2))
    dnn_model.add(Dense(5, activation='softmax'))
    dnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return dnn_model

# Build model
def dnn_model(X_trn, y_trn):
    n_cols = X_trn.shape[1]
    input_shape =(n_cols, )
    model = construct_dnn(input_shape)
    
    # Define early_stopping_monitor
    early_stopping_monitor = EarlyStopping(patience=2)
    # Define fit
    history = model.fit(X_trn, pd.get_dummies(y_trn), epochs=30, validation_split=0.3, callbacks=[early_stopping_monitor])
    return model, history

## Test model 

The baseline assumes review is a 5 star rating (the most common class of data). The corresponding baseline accuracy is ~ 36.7%.

In [31]:
# Calculate baseline
length = len(y_mini)
correct_pred = len(y_mini[y_mini == 5])
baseline_accuracy = correct_pred / length 
print(baseline_accuracy)

0.36730234406038936


In [32]:
feature_sets = {'count': count_mini, 
                'tfidf': tfidf_mini, 
                'count stop': count_mini__stop, 
                'tfidf stop': tfidf_mini__stop, 
                'count stop ngram': count_mini__stop_ngram, 
                'tfidf stop ngram': tfidf_mini__stop_ngram,
                'count stop ngram lda': count_mini__stop_ngram_lda.tocsr(), 
                'tfidf stop ngram nmf': tfidf_train_mini__stop_ngram_nmf.tocsr(),
                'word to vec': w2v_mini, 
                'word to vec 2': w2v_mini_2}
non_negative_feature_sets = feature_sets.copy()
del non_negative_feature_sets['word to vec']
del non_negative_feature_sets['word to vec 2']
w2v_feature_sets = {'word to vec': w2v_mini, 
                   'word to vec 2': w2v_mini_2}

In [42]:
w2v_feature_set_1000 = {'word to vec 3': w2v_mini_3}

In [19]:
w2v_feature_set_google = {'word to vec google': w2v_mini_google} 

In [28]:
# define test for feature sets
def test_features(model, sets):
    results = defaultdict(float)
    for key, x_values_mini in sets.items():
        print(key)
        model_instance = model(x_values_mini, y_mini)
        results[key] = model_instance.best_score_
        print('')
    print('--------------------------')
    print(results)
    return results

In [29]:
# define deep neural net tests for feature sets
def dnn_test_results(sets):
    results = defaultdict(float)
    for key, x_values_mini in sets.items():
        print(key)
        model_instance, history = dnn_model(x_values_mini, y_mini)
        results[key] = max(history.history['val_acc'])
        print('')
    print('--------------------------')
    print(results)
    return results

In [61]:
# run tests for Naive Bayes
NB_mini_results = test_features(nb_model, non_negative_feature_sets)

count


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.99900000000000011}
Tuned Classifier Accuracy: 0.559
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1651  329  118   67   50]
 [ 450  992  345  109   83]
 [ 233  274 1571  584  266]
 [ 143  142  469 2992 1872]
 [ 119   75  146 1087 5969]]

tfidf


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.33300000000000002}
Tuned Classifier Accuracy: 0.527
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1618  117  104  207  169]
 [ 446  517  257  531  228]
 [ 171   56  783 1342  576]
 [  67   11   47 3085 2408]
 [  47    2   17  758 6572]]

count stop


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.99900000000000011}
Tuned Classifier Accuracy: 0.568
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1683  330  113   54   35]
 [ 450 1029  345   92   63]
 [ 230  292 1597  573  236]
 [ 132  148  501 3043 1794]
 [ 122   60  133 1078 6003]]

tfidf stop


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.33300000000000002}
Tuned Classifier Accuracy: 0.536
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1652  129  103  193  138]
 [ 439  554  265  528  193]
 [ 158   54  866 1339  511]
 [  60    9   63 3138 2348]
 [  41    3   14  768 6570]]

count stop ngram


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.99900000000000011}
Tuned Classifier Accuracy: 0.583
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1785  281   99   26   24]
 [ 392 1235  245   60   47]
 [ 200  274 1848  425  181]
 [ 118  125  477 3263 1635]
 [  96   51  133  996 6120]]

tfidf stop ngram


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.33300000000000002}
Tuned Classifier Accuracy: 0.573
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1808  133   93  110   71]
 [ 391  915  312  262   99]
 [ 148   77 1428  951  324]
 [  69   20  110 3560 1859]
 [  39    6   22  763 6566]]

count stop ngram lda


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.99900000000000011}
Tuned Classifier Accuracy: 0.583
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1782  280  101   27   25]
 [ 391 1232  248   60   48]
 [ 201  270 1832  433  192]
 [ 121  121  472 3227 1677]
 [  94   51  136  982 6133]]

tfidf stop ngram nmf


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.33300000000000002}
Tuned Classifier Accuracy: 0.574
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1810  138   94  106   67]
 [ 398  924  311  249   97]
 [ 148   80 1445  931  324]
 [  75   18  113 3572 1840]
 [  41    6   24  804 6521]]

--------------------------
defaultdict(<class 'float'>, {'count': 0.55904847040127137, 'tfidf': 0.52731426301152162, 'count stop': 0.56778903456495833, 'tfidf stop': 0.53615415176797776, 'count stop ngram': 0.58263806118394912, 'tfidf stop ngram': 0.5733015494636472, 'count stop ngram lda': 0.58338299562971796, 'tfidf stop ngram nmf': 0.57354986094557014})


In [33]:
# run tests for logistic regression 
logreg_mini_results = test_features(logreg_model, non_negative_feature_sets)

count
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.540
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1957   78   44   65   71]
 [ 227 1315  195  133  109]
 [  83  130 1853  540  322]
 [  28   40  215 3670 1665]
 [  18   14   72  731 6561]]

tfidf
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.570
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1692  187  100  125  111]
 [ 463  708  366  292  150]
 [ 133  201 1183 1008  403]
 [  54   47  253 3195 2069]
 [  34   20   68  996 6278]]

count stop
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.548
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[2007   66   33   58   51]
 [ 216 1380  178  116   89]
 [  69  127 1926  544  262]
 [  29   44  195 3

In [35]:
logreg_w2v_mini_results = test_features(logreg_model, w2v_feature_sets)

word to vec
Tuned Classifier Parameters: {'C': 100, 'penalty': 'l2'}
Tuned Classifier Accuracy: 0.546
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1477  159  138  207  234]
 [ 583  279  379  503  235]
 [ 250  128  753 1362  435]
 [ 143   43  257 2775 2400]
 [ 130   12   72 1211 5971]]

word to vec 2
Tuned Classifier Parameters: {'C': 100, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.556
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1548  185  122  165  195]
 [ 550  393  395  456  185]
 [ 225  160  836 1307  400]
 [ 120   48  300 2918 2232]
 [ 106   21   61 1243 5965]]

--------------------------
defaultdict(<class 'float'>, {'word to vec': 0.54633492252681759, 'word to vec 2': 0.55572109654350421})


In [43]:
logreg_w2v_mini_results_1000 = test_features(logreg_model, w2v_feature_set_1000)

word to vec 3
Tuned Classifier Parameters: {'C': 100, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.560
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1585  168  120  168  174]
 [ 562  385  405  450  177]
 [ 219  158  859 1308  384]
 [ 105   45  313 2920 2235]
 [  96   22   54 1225 5999]]

--------------------------
defaultdict(<class 'float'>, {'word to vec 3': 0.55954509336511715})


In [30]:
logreg_w2v_mini_results_google = test_features(logreg_model, w2v_feature_set_google)

word to vec google
Tuned Classifier Parameters: {'C': 100, 'penalty': 'l2'}
Tuned Classifier Accuracy: 0.558
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1645  192  130  118  130]
 [ 553  473  419  366  168]
 [ 208  189  822 1301  408]
 [  72   50  279 2924 2293]
 [  67   16   80 1218 6015]]

--------------------------
defaultdict(<class 'float'>, {'word to vec google': 0.55790623758442592})


In [34]:
# run tests for SVM
svm_mini_results = test_features(svm_model, non_negative_feature_sets)

count
Tuned Classifier Parameters: {'kernel': 'linear'}
Tuned Classifier Accuracy: 0.510
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[2191   14    6    3    1]
 [  59 1865   43    7    5]
 [  17   49 2577  214   71]
 [  16   25  181 4328 1068]
 [  10   17   81  608 6680]]

tfidf
Tuned Classifier Parameters: {'kernel': 'linear'}
Tuned Classifier Accuracy: 0.563
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1885  170   78   48   34]
 [ 276 1227  289  132   55]
 [ 108  162 1767  661  230]
 [  41   63  266 3800 1448]
 [  28   24  109 1000 6235]]

count stop
Tuned Classifier Parameters: {'kernel': 'linear'}
Tuned Classifier Accuracy: 0.517
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[2203    9    3    0    0]
 [  41 1903   31    2    2]
 [  11   41 2650  183   43]
 [  16   20  178 4401 1003]
 [  1

In [36]:
svm_w2v_mini_results = test_features(svm_model, w2v_feature_sets)

word to vec
Tuned Classifier Parameters: {'kernel': 'linear'}
Tuned Classifier Accuracy: 0.552
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1462  300  150  140  163]
 [ 555  520  435  351  118]
 [ 241  297  924 1194  272]
 [ 160  110  433 3036 1879]
 [ 149   48  115 1560 5524]]

word to vec 2
Tuned Classifier Parameters: {'kernel': 'linear'}
Tuned Classifier Accuracy: 0.555
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1499  280  150  133  153]
 [ 564  532  437  335  111]
 [ 229  286  963 1194  256]
 [ 155  112  442 3048 1861]
 [ 144   42  118 1585 5507]]

--------------------------
defaultdict(<class 'float'>, {'word to vec': 0.55174811283273739, 'word to vec 2': 0.5553237981724275})


In [44]:
svm_w2v_mini_results_1000 = test_features(svm_model, w2v_feature_set_1000)

word to vec 3
Tuned Classifier Parameters: {'kernel': 'linear'}
Tuned Classifier Accuracy: 0.554
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1492  287  150  141  145]
 [ 562  520  439  346  112]
 [ 233  275  939 1215  266]
 [ 148  115  448 3066 1841]
 [ 135   47  107 1590 5517]]

--------------------------
defaultdict(<class 'float'>, {'word to vec 3': 0.5539332538736591})


In [31]:
svm_w2v_mini_results_google = test_features(svm_model, w2v_feature_set_google)

word to vec google
Tuned Classifier Parameters: {'kernel': 'linear'}
Tuned Classifier Accuracy: 0.547
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1586  231  134  172   92]
 [ 572  395  372  525  115]
 [ 234  174  565 1680  275]
 [ 111   53  175 3394 1885]
 [  81   26   46 1642 5601]]

--------------------------
defaultdict(<class 'float'>, {'word to vec google': 0.54673222089789431})


In [35]:
# run tests for random forest
rf_mini_results = test_features(ranforest_model, non_negative_feature_sets)

count
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 150}
Tuned Classifier Accuracy: 0.510
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1650    8   58  172  327]
 [ 312  622  126  510  409]
 [  89    3 1201  871  764]
 [  20    0    6 3487 2105]
 [  11    1    9  174 7201]]

tfidf
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 300}
Tuned Classifier Accuracy: 0.519
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1787    6   43  131  248]
 [ 256  956   81  359  327]
 [  72    1 1671  572  612]
 [  14    0    4 4009 1591]
 [   7    1    4   77 7307]]

count stop
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 150}
Tuned Classifier Accuracy: 0.511
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1665   10   49  169  322]
 [ 270  698  105 

In [37]:
rf_w2v_mini_results = test_features(ranforest_model, w2v_feature_sets)

word to vec
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 300}
Tuned Classifier Accuracy: 0.505
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[2059    0    7   56   93]
 [  28 1759    0   75  117]
 [  33    1 2697   31  166]
 [  24    1    0 5484  109]
 [  18    0    0    0 7378]]

word to vec 2
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 300}
Tuned Classifier Accuracy: 0.513
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[2089    0    3   51   72]
 [  22 1790    0   69   98]
 [  33    0 2723   22  150]
 [  23    0    0 5493  102]
 [   9    0    1    0 7386]]

--------------------------
defaultdict(<class 'float'>, {'word to vec': 0.50481724274930473, 'word to vec 2': 0.51276321017083826})


In [45]:
rf_w2v_mini_results_1000 = test_features(ranforest_model, w2v_feature_set_1000)

word to vec 3
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 300}
Tuned Classifier Accuracy: 0.518
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[2103    0    4   37   71]
 [  14 1841    0   53   71]
 [  32    0 2757   11  128]
 [  22    0    0 5509   87]
 [  10    0    0    0 7386]]

--------------------------
defaultdict(<class 'float'>, {'word to vec 3': 0.51822606277314265})


In [32]:
rf_w2v_mini_results_google = test_features(ranforest_model, w2v_feature_set_google)

word to vec google
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 500}
Tuned Classifier Accuracy: 0.495
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[2150    0    2    9   54]
 [   6 1877    1   22   73]
 [  10    0 2811    2  105]
 [   9    0    0 5586   23]
 [   1    0    0    0 7395]]

--------------------------
defaultdict(<class 'float'>, {'word to vec google': 0.49543106873261822})


In [45]:
# run tests for sgd
sgd_mini_results = test_features(sgd_model, non_negative_feature_sets)

count
Tuned Classifier Parameters: {'l1_ratio': 0.5, 'penalty': 'elasticnet'}
Tuned Classifier Accuracy: 0.510
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1929   63   56   75   92]
 [ 442  821  315  241  160]
 [ 205  104 1540  674  405]
 [  97   49  393 3275 1804]
 [  72   19  135  942 6228]]

tfidf
Tuned Classifier Parameters: {'l1_ratio': 0.1, 'penalty': 'l2'}
Tuned Classifier Accuracy: 0.555
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1959   49   74   55   78]
 [ 534  777  314  202  152]
 [ 168   94 1460  761  445]
 [  71   27  230 2935 2355]
 [  38   10   76  562 6710]]

count stop
Tuned Classifier Parameters: {'l1_ratio': 0.5, 'penalty': 'elasticnet'}
Tuned Classifier Accuracy: 0.516
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1816  218   46   84   51]
 [ 268 1251  202  180   78]
 [ 

In [38]:
sgd_w2v_mini_results = test_features(sgd_model, w2v_feature_sets)

word to vec
Tuned Classifier Parameters: {'l1_ratio': 0.1, 'penalty': 'l2'}
Tuned Classifier Accuracy: 0.458
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1314  220  134    7  540]
 [ 570  364  449    8  588]
 [ 316  358  913   41 1300]
 [ 174  196  806   83 4359]
 [ 101   67  248   38 6942]]

word to vec 2
Tuned Classifier Parameters: {'l1_ratio': 0.1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.482
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1257   97  207    1  653]
 [ 472  188  642    7  670]
 [ 227  124 1197   46 1334]
 [ 109   71  912  138 4388]
 [  67   22  253   41 7013]]

--------------------------
defaultdict(<class 'float'>, {'word to vec': 0.45783671036948748, 'word to vec 2': 0.48162495033770364})


In [46]:
sgd_w2v_mini_results_1000 = test_features(sgd_model, w2v_feature_set_1000)

word to vec 3
Tuned Classifier Parameters: {'l1_ratio': 0.1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.494
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1536    1  131    8  539]
 [ 756    9  506   22  686]
 [ 390    2  933  156 1447]
 [ 185    1  511  276 4645]
 [ 100    0  109   88 7099]]

--------------------------
defaultdict(<class 'float'>, {'word to vec 3': 0.49359356376638858})


In [33]:
sgd_w2v_mini_results_google = test_features(sgd_model, w2v_feature_set_google)

word to vec google
Tuned Classifier Parameters: {'l1_ratio': 0.1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.516
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1560    3  100    3  549]
 [ 684   14  398    5  878]
 [ 296    3  628   57 1944]
 [  96    2  315   99 5106]
 [  45    0   45   36 7270]]

--------------------------
defaultdict(<class 'float'>, {'word to vec google': 0.51589193484306717})


In [45]:
# run tests for XG Boost
xgb_mini_results = test_features(xgb_model, non_negative_feature_sets)

count
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.547
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1571  121  102  166  255]
 [ 300  849  234  338  258]
 [ 116  116 1279  904  513]
 [  52   19  164 3160 2223]
 [  32    9   46  935 6374]]

tfidf
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.544
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1638  105   89  186  197]
 [ 290  983  190  322  194]
 [ 107   94 1384  872  471]
 [  57   23  113 3423 2002]
 [  31    9   56  915 6385]]

count stop
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.549
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1597  123  103  168  224]
 [ 328  880  247  302  222]
 [ 1

In [39]:
xgb_w2v_mini_results = test_features(xgb_model, w2v_feature_sets)

word to vec
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.526
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1823   99   78  107  108]
 [ 222 1164  178  279  136]
 [ 150  126 1590  760  302]
 [ 105   96  207 3824 1386]
 [  83   44   89  849 6331]]

word to vec 2
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.532
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1867   75   81   98   94]
 [ 200 1251  175  226  127]
 [ 140  114 1674  721  279]
 [  91   75  207 3961 1284]
 [  60   43   96  833 6364]]

--------------------------
defaultdict(<class 'float'>, {'word to vec': 0.52552642034167663, 'word to vec 2': 0.53193285657528799})


In [47]:
xgb_w2v_mini_results_1000 = test_features(xgb_model, w2v_feature_set_1000)

word to vec 3
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.536
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1957   61   56   79   62]
 [ 159 1382  139  203   96]
 [ 112  111 1893  590  222]
 [  79   80  182 4121 1156]
 [  55   37   99  702 6503]]

--------------------------
defaultdict(<class 'float'>, {'word to vec 3': 0.53570719110051646})


In [34]:
xgb_w2v_mini_results_google = test_features(xgb_model, w2v_feature_set_google)

word to vec google
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.534
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[1968   59   61   68   59]
 [ 140 1416  137  191   95]
 [  98  101 1791  679  259]
 [  58   61  147 4191 1161]
 [  40   32   84  645 6595]]

--------------------------
defaultdict(<class 'float'>, {'word to vec google': 0.53446563369090183})


In [118]:
# run tests for deep neural nets
dnn_mini_results = dnn_test_results(non_negative_feature_sets)

count
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

tfidf
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

count stop
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

tfidf stop
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

count stop ngram
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

tfidf stop ngram
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

count stop ngram lda
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

tfidf stop ngram nmf
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

--------------------------
defaultdict(<class 'float'>, {'count': 0.58715444463823951, 'tfidf': 0.5816917728408838, 'count stop': 0.58864426411573623, 'tfidf stop': 0.58169177298395069, 'count stop ngram': 0.60271478241947463,

In [40]:
dnn_w2v_mini_results = dnn_test_results(w2v_feature_sets)

word to vec
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30

word to vec 2
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

--------------------------
defaultdict(<class 'float'>, {'word to vec': 0.55719251775066692, 'word to vec 2': 0.55901340829716317})


In [48]:
dnn_w2v_mini_results_1000 = dnn_test_results(w2v_feature_set_1000)

word to vec 3
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30

--------------------------
defaultdict(<class 'float'>, {'word to vec 3': 0.56017215688326094})


In [35]:
dnn_w2v_mini_results_google = dnn_test_results(w2v_feature_set_google)

word to vec google
Train on 14095 samples, validate on 6041 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

--------------------------
defaultdict(<class 'float'>, {'word to vec google': 0.56977321631050803})
