# Import dependencies and determine working directory

In [1]:
# Import libraries
import os
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Import topic model 
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Get stop words 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Import NLP vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import word2vec

# Import models 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


Using TensorFlow backend.


In [2]:
# get current directory
dir = os.path.dirname(os.path.abspath('__file__'))

# Define data sets

## Load pre-processed data

In [3]:
# Load df from a csv - all text to lower case, tokenize into list of strings, remove punctuation and lemmatize
preprocessed_path = os.path.join(dir, '02_processed_data','review_text_stars.csv')
preprocessed_df = pd.read_csv(preprocessed_path, index_col = False)
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2876509 entries, 0 to 2876508
Data columns (total 2 columns):
stars_review        int64
processed_review    object
dtypes: int64(1), object(1)
memory usage: 43.9+ MB


## Split train and test data

In [4]:
# Create training and test sets using a fixed seed for reproducibility 
X_train, X_test, y_train, y_test = train_test_split(preprocessed_df.processed_review, preprocessed_df.stars_review, test_size = 0.3, random_state = 42)

## Create mini dataset

In [5]:
# Create a mini data set for feature and model selection (for manageable training times)
__, X_mini, ___, y_mini = train_test_split(X_train, y_train, test_size = 0.05, random_state = 42)
print(len(X_mini))

100678


# Feature selection using mini dataset
Using the mini dataset, various types of feature engineering will be performed and tested on a variety of models in the next stage. 

## Count vectorizer

In [9]:
# Initialize vectorizer using unigrams and remove all standard stopwords 
# Source: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
count_vectorizer_mini = CountVectorizer(analyzer = 'word',
                             stop_words = 'english',
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
count_mini = count_vectorizer_mini.fit_transform(X_mini)

# Print the length of features of the count_vectorizer
print( len(count_vectorizer_mini.get_feature_names()) )

3864


## Tfidf vectorizer (weighted vectorizer)

In [10]:
# Initialize vectorizer using unigrams and remove all standard stopwords 
tfidf_vectorizer_mini = TfidfVectorizer(analyzer = 'word',
                             stop_words = 'english',
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
tfidf_mini = tfidf_vectorizer_mini.fit_transform(X_mini)

# Print the length of features of the tfidf_vectorizer
print( len(tfidf_vectorizer_mini.get_feature_names()) )

3864


## Update stop words

In [11]:
stopWords = set(stopwords.words('english'))

# Add neutral words related to restaurants to list of stop words
stopWords.update(['restaurant', 'place', 'bar', 'service', 'food', 'lunch', 'breakfast', 'dinner', 'price', 'order', 'ordered'])

# Remove stopwords that might reflect sentiment
stopWords = [word for word in stopWords if word not in ['above', 'not', 'below', 't', 'off', 'no', 'again', 'against', 'under', 'hadn', 'up', 'shan', 'more', 'hasn', 'won','couldn', 'wasn', 'mustn', 'out', 'don','down', 'haven', 'price', 'mightn', 'isn', 'wouldn', 'needn', 'shouldn', 'weren', 'aren', 'didn', 'ain', 'doesn']]

## Feature set with new stop words

In [12]:
# Initialize vectorizer using unigrams and customized stopwords 
count_vectorizer_mini__stop = CountVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
count_mini__stop = count_vectorizer_mini__stop.fit_transform(X_mini)

# Print the length of features of the count_vectorizer
print( len(count_vectorizer_mini__stop.get_feature_names()) )

4024


In [13]:
# Initialize vectorizer using unigrams and customized stopwords 
tfidf_vectorizer_mini__stop = TfidfVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
tfidf_mini__stop = tfidf_vectorizer_mini__stop.fit_transform(X_mini)

# Print the length of features of the tfidf_vectorizer
print( len(tfidf_vectorizer_mini__stop.get_feature_names()) )

4024


## Vectorize text using unigrams, bigrams and trigrams

In [14]:
# Initialize vectorizer using unigrams,bigrams and trigrams and customized stopwords 
count_vectorizer_mini__stop_ngram = CountVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
count_mini__stop_ngram = count_vectorizer_mini__stop_ngram.fit_transform(X_mini)

# Print the length of features of the count_vectorizer
print( len(count_vectorizer_mini__stop_ngram.get_feature_names()) )

8448


In [15]:
# Initialize vectorizer using unigrams,bigrams and trigrams and customized stopwords 
tfidf_vectorizer_mini__stop_ngram = TfidfVectorizer(analyzer = 'word',
                             stop_words = stopWords,
                             ngram_range = (1,3),
                             max_df=0.95, 
                             min_df=0.001)

# Transform the training data (independent variables)
tfidf_mini__stop_ngram = tfidf_vectorizer_mini__stop_ngram.fit_transform(X_mini)

# Print the length of features of the tfidf_vectorizer
print( len(tfidf_vectorizer_mini__stop_ngram.get_feature_names()) )

8448


## Topic modelling 
### Using Latent Dirichlet Allocation (LDA)

In [16]:
# Initialize LDA model
lda = LatentDirichletAllocation(n_components=300,random_state=42) 

# Get topics for training data
lda_mini = lda.fit_transform(count_mini__stop_ngram)

# add topics to count vectorizer ngrams set 
count_mini__stop_ngram_lda = hstack((count_mini__stop_ngram, lda_mini))



In [17]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(lda, tfidf_vectorizer_mini__stop_ngram.get_feature_names(), 10)

Topic 0:
était get chance genuine genuinely george german gesture get get again get another
Topic 1:
best ever ive one ive ever bartender one best best ive eaten gotten
Topic 2:
était get chance genuine genuinely george german gesture get get again get another
Topic 3:
three average korean although expecting would japanese give star hoping
Topic 4:
était get chance genuine genuinely george german gesture get get again get another
Topic 5:
meat disappointed giving not star meat wa bone mention english wa disappointed
Topic 6:
ground noodle dish didnt give went saturday ground beef crumb wa ok wa pretty quick tasted like wa surprised see
Topic 7:
got far yummy else stopped everything serve calamari cuisine chewy
Topic 8:
again cream ice ice cream back again try though wa back cant vanilla
Topic 9:
wa minute good wife not finish margarita couldnt finish salsa wa not
Topic 10:
était get chance genuine genuinely george german gesture get get again get another
Topic 11:
need oh music might l

### Using Non-negative Matrix Factorization (NMF)

In [18]:
# Initialize NMF model
nmf = NMF(n_components=300, random_state=42)

# Get topics for training data
nmf_mini = nmf.fit_transform(tfidf_mini__stop_ngram)

# add topics to tfidf ngrams set 
tfidf_train_mini__stop_ngram_nmf = hstack((tfidf_mini__stop_ngram, nmf_mini))

In [19]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(nmf, tfidf_vectorizer_mini__stop_ngram.get_feature_names(), 10)

Topic 0:
wa wa not wa nice wa delicious thought wa amazing wa wa wa excellent wa pretty wa also
Topic 1:
always always good always great always get always friendly staff always always fresh ha always time always always come
Topic 2:
great great great great price great atmosphere always great not great good great great staff great experience ha great
Topic 3:
pizza pizza wa slice best pizza good pizza pepperoni great pizza topping pizza good oven
Topic 4:
however however wa good however extremely great however however not overall disappointing rather wa extremely
Topic 5:
burger burger wa bun best burger shake patty good burger burger joint great burger burger fry
Topic 6:
good good good not good good price always good great good pretty good good not good great good time
Topic 7:
sushi sushi wa best sushi ayce sashimi japanese sushi chef great sushi good sushi sushi roll
Topic 8:
night friday late late night friday night night wa last night saturday saturday night date
Topic 9:
taco fis

## Word 2 Vec

In [6]:
# Create corpus of sentences from mini
sentence_corpus_mini = []
for review in X_mini:
    words = review.split("', '")
    words[0] = words[0][2:]
    words[-1] = words[-1][:-2]
    sentence_corpus_mini.append(' '.join(words))

# Create tokenized corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in sentence_corpus_mini]

In [7]:
# Define functions to create a feature array
def average_word_vectors(words, model, vocabulary, num_features):   
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.   
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector
    
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [23]:
# Create word 2 vec model 
feature_size = 100
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, window=5, min_count=10, workers=4)
# w2v_dictionary = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
w2v_mini = pd.DataFrame(w2v_feature_array)
print(w2v_mini.shape)

  after removing the cwd from sys.path.
  


(100678, 100)


In [21]:
# Create word 2 vec model with 200 features
feature_size_2 = 200
w2v_model_2 = word2vec.Word2Vec(tokenized_corpus, size=feature_size_2, window=5, min_count=10, workers=4)
# w2v_dictionary_2 = dict(zip(w2v_model_2.wv.index2word, w2v_model_2.wv.syn0))
w2v_feature_array_2 = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model_2,
                                             num_features=feature_size_2)
w2v_mini_2 = pd.DataFrame(w2v_feature_array_2)
print(w2v_mini_2.shape)

  after removing the cwd from sys.path.
  


(100678, 200)


In [22]:
# Create word 2 vec model with 1000 features
feature_size_3 = 1000
w2v_model_3 = word2vec.Word2Vec(tokenized_corpus, size=feature_size_3, window=5, min_count=10, workers=4)
# w2v_dictionary_3 = dict(zip(w2v_model_3.wv.index2word, w2v_model_3.wv.syn0))
w2v_feature_array_3 = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model_3,
                                             num_features=feature_size_3)
w2v_mini_3 = pd.DataFrame(w2v_feature_array_3)
print(w2v_mini_3.shape)

  after removing the cwd from sys.path.
  


(100678, 1000)


# Model selection using mini dataset

Using the mini dataset, a variety of models will be trained on a variety of feature sets to identify promising candidates. The promising combinations will then be tuned in the following section and trained on the full training data set. 

It should be noted that to assess model performance, the classification accuracy will be the primary metric. 
A Confusion matrix will be created using the best performing parameters from the cross validation.

In [24]:
# Define model tuning
def cross_validation_tuning(classifier, param_grid, X_trn, y_trn):
    classifier_cv = GridSearchCV(classifier, param_grid, cv=3)
    classifier_cv.fit(X_trn, y_trn)
    # Print the optimal parameters and best score
    print("Tuned Classifier Parameters: {}".format(classifier_cv.best_params_))
    print("Tuned Classifier Accuracy: {:.3f}".format(classifier_cv.best_score_))
    # Predict the labels
    pred = classifier_cv.predict(X_trn)
    # Compute accuracy
    score = metrics.accuracy_score(y_trn, pred)
    # Calculate and print the confusion matrix
    cm = metrics.confusion_matrix(y_trn, pred, labels=[1,2,3,4,5])
    print('For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.')
    print(cm)
    return classifier_cv

## Define models

In [25]:
# Define Naive_bayes model
def nb_model(X_trn, y_trn):
    # Create parameters
    param_grid = {'alpha': np.arange(0, 1, 0.333)}
    # Iterate over the alphas and print the corresponding score
    nb_classifier = MultinomialNB()
    tuned_nb_classifier = cross_validation_tuning(nb_classifier, param_grid, X_trn, y_trn)
    return tuned_nb_classifier

In [26]:
# Define Logistic regression model
def logreg_model(X_trn, y_trn):
    # Create parameters
    param_grid = {'C': [0.0001, 1, 100], 'penalty': ['l1', 'l2']} #param_grid = {'C': np.logspace(-5, 8, 15), 'penalty': ['l1', 'l2']}
    logreg_classifier = LogisticRegression()
    tuned_logreg_classifier = cross_validation_tuning(logreg_classifier, param_grid, X_trn, y_trn)
    return tuned_logreg_classifier

In [27]:
# Define SVM model
def svm_model(X_trn, y_trn):
    # Create parameters 
    param_grid = {'kernel': ['rbf', 'linear']} #{'kernel': ['rbf', 'linear'], 'C': [1, 10, 100], 'gamma': [0.1, 0.01]} 
    svm_classifier = SVC()
    tuned_svm_classifier = cross_validation_tuning(svm_classifier, param_grid, X_trn, y_trn)
    return tuned_svm_classifier

In [28]:
# Define Random forest model
def ranforest_model(X_trn, y_trn):
    # Create parameters
    param_grid = {"n_estimators": [150, 300, 500],
                  "min_samples_leaf": [5, 10]}
    # param_grid = {"n_estimators": [2, 10, 100, 300, 1000],"max_depth": [2, 10, 100, 300], "min_samples_split": [2, 10, 100],"min_samples_leaf": [1, 10, 100]}
    ranforest_classifier = RandomForestClassifier()
    tuned_ranforest_classifier = cross_validation_tuning(ranforest_classifier, param_grid, X_trn, y_trn)
    return tuned_ranforest_classifier

In [29]:
# Define SGD model 
def sgd_model(X_trn, y_trn):
    # Create parameters
    param_grid = {"penalty": ['l1', 'l2', 'elasticnet'],
                  "l1_ratio": [0.1, 0.3, 0.5] }
    sgd_classifier = SGDClassifier(random_state= 42, max_iter=4)
    tuned_sgd_classifier = cross_validation_tuning(sgd_classifier, param_grid, X_trn, y_trn)
    return tuned_sgd_classifier

In [30]:
# Define XGBoost model 
def xgb_model(X_trn, y_trn):
    # Create parameters
    param_grid = {'min_child_weight': [3],
                 'max_depth': [4]}
    xgb_classifier = XGBClassifier(learning_rate =0.2, seed=42)
    tuned_xgb_classifier = cross_validation_tuning(xgb_classifier, param_grid, X_trn, y_trn)
    return tuned_xgb_classifier

In [31]:
# Define neural network architecture
def construct_dnn(input_shape):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, activation ='relu', input_shape=input_shape ))
    dnn_model.add(Dropout(0.3))
    dnn_model.add(Dense(512, activation ='relu'))
    dnn_model.add(Dropout(0.3))
#     dnn_model.add(Dense(512, activation ='relu'))
#     dnn_model.add(Dropout(0.2))
    dnn_model.add(Dense(5, activation='softmax'))
    dnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return dnn_model

# Build model
def dnn_model(X_trn, y_trn):
    n_cols = X_trn.shape[1]
    input_shape =(n_cols, )
    model = construct_dnn(input_shape)
    
    # Define early_stopping_monitor
    early_stopping_monitor = EarlyStopping(patience=2)
    # Define fit
    history = model.fit(X_trn, pd.get_dummies(y_trn), epochs=30, validation_split=0.3, callbacks=[early_stopping_monitor])
    return model, history

## Test model 

The baseline assumes review is a 5 star rating (the most common class of data). The corresponding baseline accuracy is ~ 36.7%.

In [30]:
# Calculate baseline
length = len(y_mini)
correct_pred = len(y_mini[y_mini == 5])
baseline_accuracy = correct_pred / length 
print(baseline_accuracy)

0.36932596992391586


In [37]:
feature_sets = {'count': count_mini, 
                'tfidf': tfidf_mini, 
                'count stop': count_mini__stop, 
                'tfidf stop': tfidf_mini__stop, 
                'count stop ngram': count_mini__stop_ngram, 
                'tfidf stop ngram': tfidf_mini__stop_ngram,
                'count stop ngram lda': count_mini__stop_ngram_lda.tocsr(), 
                'tfidf stop ngram nmf': tfidf_train_mini__stop_ngram_nmf.tocsr(),
                'word to vec': w2v_mini, 
                'word to vec 2': w2v_mini_2, 
                'word to vec 3': w2v_mini_3}
non_negative_feature_sets = feature_sets.copy()
del non_negative_feature_sets['word to vec']
del non_negative_feature_sets['word to vec 2']
del non_negative_feature_sets['word to vec 3']

In [34]:
# define test for feature sets
def test_features(model, sets):
    results = defaultdict(float)
    for key, x_values_mini in sets.items():
        print(key)
        model_instance = model(x_values_mini, y_mini)
        results[key] = model_instance.best_score_
        print('')
    print('--------------------------')
    print(results)
    return results

In [33]:
# define deep neural net tests for feature sets
def dnn_test_results(sets):
    results = defaultdict(float)
    for key, x_values_mini in sets.items():
        print(key)
        model_instance, history = dnn_model(x_values_mini, y_mini)
        results[key] = max(history.history['val_acc'])
        print('')
    print('--------------------------')
    print(results)
    return results

In [None]:
# run tests for Naive Bayes
NB_mini_results = test_features(nb_model, non_negative_feature_sets)

count


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.66600000000000004}
Tuned Classifier Accuracy: 0.580
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8099  2080   594   222   276]
 [ 2621  3829  2388   656   361]
 [ 1330  1785  6371  3815  1278]
 [  646   747  3059 13365  9973]
 [  680   325   804  6087 29287]]

tfidf


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.0}
Tuned Classifier Accuracy: 0.555
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 7964   924   582   912   889]
 [ 2505  1861  1832  2622  1035]
 [ 1150   431  2759  7601  2638]
 [  404    76   422 13522 13366]
 [  280    19    66  4593 32225]]

count stop


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.66600000000000004}
Tuned Classifier Accuracy: 0.590
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8176  2114   587   184   210]
 [ 2612  3977  2442   547   277]
 [ 1270  1825  6694  3734  1056]
 [  631   754  3043 13626  9736]
 [  655   295   747  6104 29382]]

tfidf stop


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.0}
Tuned Classifier Accuracy: 0.564
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8078   941   570   896   786]
 [ 2504  1911  1971  2582   887]
 [ 1092   452  3121  7626  2288]
 [  390    69   464 13897 12970]
 [  267    18    62  4629 32207]]

count stop ngram


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.33300000000000002}
Tuned Classifier Accuracy: 0.606
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8402  2129   472   129   139]
 [ 2394  4664  2196   394   207]
 [ 1075  1863  7607  3151   883]
 [  494   656  3242 14355  9043]
 [  511   280   728  6373 29291]]

tfidf stop ngram


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.0}
Tuned Classifier Accuracy: 0.602
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8538  1332   596   450   355]
 [ 2293  3436  2335  1373   418]
 [  952   845  5489  5925  1368]
 [  281   143  1107 15697 10562]
 [  230    41   159  5543 31210]]

count stop ngram lda


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.33300000000000002}
Tuned Classifier Accuracy: 0.606
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8401  2110   468   132   160]
 [ 2407  4628  2188   411   221]
 [ 1078  1855  7556  3153   937]
 [  493   646  3229 14211  9211]
 [  506   282   727  6262 29406]]

tfidf stop ngram nmf


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


Tuned Classifier Parameters: {'alpha': 0.0}
Tuned Classifier Accuracy: 0.603
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8545  1343   593   444   346]
 [ 2296  3461  2334  1352   412]
 [  955   853  5564  5858  1349]
 [  282   148  1136 15707 10517]
 [  231    43   167  5581 31161]]

--------------------------
defaultdict(<class 'float'>, {'count': 0.58025586523371542, 'tfidf': 0.5547289378017044, 'count stop': 0.59016865650886985, 'tfidf stop': 0.56352927153896581, 'count stop ngram': 0.60640855003079119, 'tfidf stop ngram': 0.60210770972804384, 'count stop ngram lda': 0.60632908877808456, 'tfidf stop ngram nmf': 0.60290232225511031})


In [None]:
# run tests for logistic regression 
logreg_mini_results = test_features(logreg_model, feature_sets)

count
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.580
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 9032   912   404   342   581]
 [ 2217  4029  1740  1122   747]
 [  792  1124  6057  4603  2003]
 [  207   242  1558 14362 11421]
 [  127    79   295  4608 32074]]

tfidf
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.596
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8871  1120   475   368   437]
 [ 2520  3401  2133  1200   601]
 [  879  1090  5797  5189  1624]
 [  256   222  1525 15239 10548]
 [  175    67   309  5462 31170]]

count stop
Tuned Classifier Parameters: {'C': 1, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.588
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 9133   921   396   311   510]
 [ 2193  4171  1832

In [34]:
# run tests for deep neural nets
dnn_mini_results = dnn_test_results(feature_sets)

count
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

tfidf
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

count stop
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

tfidf stop
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

count stop ngram
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

tfidf stop ngram
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

count stop ngram lda
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

tfidf stop ngram nmf
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

word to vec
Train on 70474 samples, validate on 30204 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

word to vec 2
Train on 70474 samp

In [35]:
# run tests for sgd
sgd_mini_results = test_features(sgd_model, feature_sets)

count
Tuned Classifier Parameters: {'l1_ratio': 0.5, 'penalty': 'elasticnet'}
Tuned Classifier Accuracy: 0.550
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 7911  1718   370   621   651]
 [ 2077  4489   941  1614   734]
 [  794  2823  2975  6040  1947]
 [  251   808  1010 14984 10737]
 [  161   263   275  6936 29548]]

tfidf
Tuned Classifier Parameters: {'l1_ratio': 0.1, 'penalty': 'l2'}
Tuned Classifier Accuracy: 0.569
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 9879   342   214   260   576]
 [ 4357  2010  1159  1336   993]
 [ 1740   889  3336  5577  3037]
 [  609   222   722  9614 16623]
 [  283    84   164  2128 34524]]

count stop
Tuned Classifier Parameters: {'l1_ratio': 0.3, 'penalty': 'elasticnet'}
Tuned Classifier Accuracy: 0.559
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 7694  

In [36]:
# run tests for random forest
rf_mini_results = test_features(ranforest_model, feature_sets)

count
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 500}
Tuned Classifier Accuracy: 0.540
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 9210    75   286   608  1092]
 [ 2003  3549   728  1998  1577]
 [  692    34  6667  4037  3149]
 [  177     7   100 17765  9741]
 [  117     4    57  1205 35800]]

tfidf
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 500}
Tuned Classifier Accuracy: 0.546
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 9677    41   216   467   870]
 [ 1534  4977   501  1526  1317]
 [  560    30  8463  2898  2628]
 [  140     6    59 20259  7326]
 [   84     3    47   634 36415]]

count stop
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 300}
Tuned Classifier Accuracy: 0.543
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted rati

KeyboardInterrupt: 

In [37]:
# run tests for logistic regression 
logreg_mini_results = test_features(logreg_model, {'word to vec 3': w2v_mini_3})

word to vec 3
Tuned Classifier Parameters: {'C': 100, 'penalty': 'l1'}
Tuned Classifier Accuracy: 0.601
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 8866   999   520   431   455]
 [ 2981  2281  2436  1560   597]
 [  953   895  5082  6177  1472]
 [  314   212  1566 14694 11004]
 [  253    66   271  5927 30666]]

--------------------------
defaultdict(<class 'float'>, {'word to vec 3': 0.60142235642344899})


In [None]:
# run tests for XG Boost
xgb_mini_results = test_features(xgb_model, feature_sets)

count
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.559
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 7495   969   573   745  1489]
 [ 2062  2769  1795  1800  1429]
 [  763   799  4843  5361  2813]
 [  287   198  1250 13492 12563]
 [  201    68   285  5617 31012]]

tfidf
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.561
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 7688   990   573   784  1236]
 [ 2085  3068  1680  1784  1238]
 [  774   814  5196  5258  2537]
 [  295   185  1205 14367 11738]
 [  239    68   281  5855 30740]]

count stop
Tuned Classifier Parameters: {'max_depth': 4, 'min_child_weight': 3}
Tuned Classifier Accuracy: 0.568
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[ 7671  1

In [35]:
# run tests for random forest  
rf_mini_results = test_features(ranforest_model, {'word to vec 2': w2v_mini_2, 'word to vec 3': w2v_mini_3})

word to vec 2
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 500}
Tuned Classifier Accuracy: 0.534
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[10750     0    26   211   284]
 [  132  8948     0   375   400]
 [  209     0 13491   116   763]
 [  118     0     0 27184   488]
 [   81     0     2     0 37100]]

word to vec 3
Tuned Classifier Parameters: {'min_samples_leaf': 5, 'n_estimators': 500}
Tuned Classifier Accuracy: 0.545
For the confusion matrix, rows correspond to actual ratings and the columns correspond to predicted ratings.
[[10907     0    19   136   209]
 [   79  9239     0   236   301]
 [  170     0 13733    69   607]
 [  109     0     0 27298   383]
 [   63     0     3     0 37117]]

--------------------------
defaultdict(<class 'float'>, {'word to vec 2': 0.53430739585609566, 'word to vec 3': 0.5453922406086732})


In [None]:
# run tests for SVM
svm_mini_results = test_features(svm_model, feature_sets)

count
