# Things to do:
* read documentation about what each parameter means in both models, also figure out which model
* consider that many of these choices will likely be very different for large amount of data
* figure out what convergence warning is
* read slides and book and determine reasonable hyperparams to run tests
* read piazza commentary
* throw on AWS cloud
* Swap to unigram and check for random state issue
* use grid for parameter selection

# Submission

1. Names of the two types of classifiers you opt to learn.
* Neural Networks, Random Forest
2. Proper citations for any external code you use. See https://integrity.mit.edu/handbook/writing-code for guidelines.
* Cite Ski-kit documentation
3. Description of your training methodology, with enough details so that another machine learning enthusiast can reproduce the your results.
* Write about how you chose different hyperparameters, maybe make them adapt to the previous one??
4. The final hyperparameter settings you use.
* Neural Networks:
    * Number of layers:
    * Width of layer:
    * Model:
    * Other?
* Random Forest:
    * Number of trees:
    * Other?
5. Training error rates, hold-out or cross-validation error rates, and test error rates for your two final classifiers.
* Neural Networks:
    * Training error rates
    * Hold out rates
    * Test error rates
* Random Forest:
    * Training error rates
    * Hold out rates
    * Test error rates
No need to submit any code.


In [1]:
from __future__ import division
from scipy.io import loadmat
import numpy as np
import scipy as sci
from scipy import stats
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
import time
import pandas as pd
import sklearn as sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [2]:
start_time = time.time()
#read, split and select amount of data to work with
samplerows = 50000 #100000 #1000000
testrows =10000 #50000 #320122
validations = 5
data = pd.read_csv('reviews_tr.csv', nrows=samplerows, iterator=True)
tdata = pd.read_csv('reviews_te.csv', nrows=testrows, iterator=True)

#split data into text and labels
dlabels = np.array(data['label'])
dtext = data['text']
tdlabels = np.array(tdata['label'])
tdtext = tdata['text']

In [3]:
#using unigram representation
#unigram represetation
vectorizer = CountVectorizer(min_df=1)

uni_dtext = vectorizer.fit_transform(dtext)
dictuni = vectorizer.get_feature_names()

#unigram representation
uni_dtext = vectorizer.fit_transform(dtext)
dictuni = vectorizer.get_feature_names()
vectorizerT = CountVectorizer(min_df=1, vocabulary=dictuni)
uni_tdtext = vectorizerT.fit_transform(tdtext)

In [4]:
datatotrain = uni_dtext
labelstotrain = dlabels
datatotest = uni_tdtext
labelstotest = tdlabels

#Random  forest
clf = RandomForestClassifier( criterion='gini', max_depth=None, \
                                 min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, \
                                 max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, \
                                 bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, \
                                 warm_start=False, class_weight=None)

param_grid = [
  {'n_estimators': [500,550,600]}
   ]


grid_search = GridSearchCV(clf, param_grid=param_grid, cv=validations)

grid_search.fit(datatotrain, labelstotrain)


print(grid_search.best_params_)

#print(grid_search.cv_results_['split0_train_score'], grid_search.cv_results_['split1_train_score'], \
      #grid_search.cv_results_['split2_train_score'], grid_search.cv_results_['split3_train_score'], \
      #grid_search.cv_results_['split4_train_score'])
print(grid_search.cv_results_['split0_test_score'], grid_search.cv_results_['split1_test_score'], \
      grid_search.cv_results_['split2_test_score'], grid_search.cv_results_['split3_test_score'], \
      grid_search.cv_results_['split4_test_score'])


clf = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], criterion='gini', max_depth=None, \
                                 min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, \
                                 max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, \
                                 bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, \
                                 warm_start=False, class_weight=None)
clf.fit(datatotrain, labelstotrain)


#check training error
preds = clf.predict(datatotrain)
error = (np.count_nonzero(np.abs(preds-labelstotrain)))/labelstotrain.shape[0]
print("training:", error)

#run test error
preds = clf.predict(datatotest)
#check error 
error = (np.count_nonzero(np.abs(preds-labelstotest)))/labelstotest.shape[0]
print("test:", error)

print("--- %s seconds ---" % (time.time() - start_time))

{'n_estimators': 550}
(array([ 0.8240176 ,  0.8250175 ,  0.82691731]), array([ 0.8295,  0.83  ,  0.8284]), array([ 0.8329,  0.8325,  0.8321]), array([ 0.8299,  0.8326,  0.8318]), array([ 0.82888289,  0.83168317,  0.83048305]))
('training:', 2e-05)
('test:', 0.1635)
--- 14890.3754358 seconds ---


In [6]:
print(grid_search.cv_results_)

{'rank_test_score': array([3, 2, 1], dtype=int32), 'split4_test_score': array([ 0.86168617,  0.87728773,  0.87388739]), 'std_test_score': array([ 0.01299976,  0.00538412,  0.00073667]), 'param_hidden_layer_sizes': masked_array(data = [(10, 100) (20, 100) (100, 100)],
             mask = [False False False],
       fill_value = ?)
, 'split0_test_score': array([ 0.87381262,  0.86181382,  0.87491251]), 'mean_test_score': array([ 0.86372,  0.87178,  0.87388]), 'params': ({'solver': 'lbgfs', 'hidden_layer_sizes': (10, 100)}, {'solver': 'lbgfs', 'hidden_layer_sizes': (20, 100)}, {'solver': 'lbgfs', 'hidden_layer_sizes': (100, 100)}), 'split2_test_score': array([ 0.8751,  0.8741,  0.8736]), 'split3_test_score': array([ 0.8395,  0.8709,  0.8743]), 'param_solver': masked_array(data = ['lbgfs' 'lbgfs' 'lbgfs'],
             mask = [False False False],
       fill_value = ?)
, 'split1_test_score': array([ 0.8685,  0.8748,  0.8727])}


In [5]:
datatotrain = uni_dtext
labelstotrain = dlabels
datatotest = uni_tdtext
labelstotest = tdlabels
nfeatures = uni_dtext.shape[1]
nclasses = 2
av = 100 #int(np.round(nfeatures/nclasses, decimals=0))

#NeuralNet
clf = MLPClassifier(activation='relu', \
                        alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, \
                        power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, \
                        warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, \
                        validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

param_grid = [
  {'hidden_layer_sizes': [(10,av),(20,av),(100,av)], 'solver': ['lbgfs']} 
    #,{'hidden_layer_sizes': [(2,50),(5,25),(10,12)], 'solver': ['adam']}
   ]


grid_search = GridSearchCV(clf, param_grid=param_grid, cv=validations)

grid_search.fit(datatotrain, labelstotrain)


print(grid_search.best_params_)

#print(grid_search.cv_results_['split0_train_score'], grid_search.cv_results_['split1_train_score'], \
      #grid_search.cv_results_['split2_train_score'], grid_search.cv_results_['split3_train_score'], \
      #grid_search.cv_results_['split4_train_score'])
print(grid_search.cv_results_['split0_test_score'], grid_search.cv_results_['split1_test_score'], \
      grid_search.cv_results_['split2_test_score'], grid_search.cv_results_['split3_test_score'], \
      grid_search.cv_results_['split4_test_score'])

clf = MLPClassifier(hidden_layer_sizes=grid_search.best_params_['hidden_layer_sizes'], solver=grid_search.best_params_['solver'], \
                        activation='relu', \
                        alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, \
                        power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, \
                        warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, \
                        validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

clf.fit(datatotrain, labelstotrain)



#check training error
preds = clf.predict(datatotrain)
error = (np.count_nonzero(np.abs(preds-labelstotrain)))/labelstotrain.shape[0]
print("training:", error)

#run test error
preds = clf.predict(datatotest)
#check error 
error = (np.count_nonzero(np.abs(preds-labelstotest)))/labelstotest.shape[0]
print("test:", error)

print("--- %s seconds ---" % (time.time() - start_time))

{'solver': 'lbgfs', 'hidden_layer_sizes': (100, 100)}
(array([ 0.87381262,  0.86181382,  0.87491251]), array([ 0.8685,  0.8748,  0.8727]), array([ 0.8751,  0.8741,  0.8736]), array([ 0.8395,  0.8709,  0.8743]), array([ 0.86168617,  0.87728773,  0.87388739]))
('training:', 0.0796)
('test:', 0.1226)
--- 17674.348228 seconds ---
