# Imports and Set-up

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle

from google.colab import files


from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier




In [0]:
# Mount the user's Google Drive at /content/drive. This is interactive: Colab
# prints an authorization URL and waits for the code to be pasted in.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, which the mount cell above imported as the
# `google.colab.drive` module; after this line `drive` is the PyDrive client.
# NOTE(review): the PyDrive client does not appear to be used in the visible
# part of this notebook (downloads go through `drive_service`) -- confirm
# before removing.
drive = GoogleDrive(gauth)

In [0]:
from googleapiclient.discovery import build
# Google Drive v3 API client; load_file_by_id uses it to fetch file contents.
drive_service = build('drive', 'v3')

import io
import pickle
from googleapiclient.http import MediaIoBaseDownload

# Google Drive file IDs of the pickled datasets, keyed by feature variant.
# Each ID is passed to load_file_by_id and the download is unpickled into
# (X_train, y_train, X_test, y_test).
news_ids = dict(
    news='1wB5BmfXPJd7D9woTyuYz_8ETSI8csFI3',
    news_ngrams='1EX3om0tgqfXrZ1tuT6T4CbSjzTq8blKo',
    news_binary='1IlTt-_EQFmxR_zhByHtJu0JUYygCvnHh',
    news_binary_ngrams='1XmTGxfWCfEk4SvaXHO2QnfrFc29JI_ik',
)
movie_ids = dict(
    movie='1IQQsIjbDt8DIZcc5HEC5G0OepKvcSBUU',
    movie_ngrams='19Ak_ipptGS9nGV4N1eiYNwoHLHuBhIg1',
    movie_binary='1mpktzleFgMS8OkrbsvhRR7Yn8aCIvPg7',
    movie_binary_ngrams='1FiyS0yPgrGh4Awn9ncuMpR9U1sArMoza',
)

# Display names for the 20 news classes; used as confusion-matrix tick labels.
# NOTE(review): presumably ordered to match the integer class ids in y -- confirm
# against the preprocessing that produced the pickles.
news_labels = [
    'alt.atheism',
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x',
    'misc.forsale',
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
    'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
    'soc.religion.christian',
    'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
    'talk.religion.misc',
]

# Display names for the two movie-review classes.
movie_labels = ['negative', 'positive']


def load_file_by_id(file_id):
  """Download a Google Drive file into memory.

  Args:
    file_id: Drive file ID passed to `files().get_media(fileId=...)`.

  Returns:
    An `io.BytesIO` holding the downloaded bytes, rewound to position 0.
  """
  buffer = io.BytesIO()
  media_request = drive_service.files().get_media(fileId=file_id)
  chunk_reader = MediaIoBaseDownload(buffer, media_request)

  finished = False
  while finished is False:
    # next_chunk() returns (progress, done); the progress object is ignored.
    _progress, finished = chunk_reader.next_chunk()

  # Rewind so the caller can read from the start of the buffer.
  buffer.seek(0)
  return buffer

def shuffle(X, y, random_state=None):
  """Shuffle samples and labels with one shared random permutation.

  Args:
    X: Sample container exposing `.shape` whose first axis indexes samples and
      which supports row selection with an integer array (`X[p]`).
    y: Labels, indexed with the same integer array (`y[p]`).
    random_state: Optional seed. When given, the permutation is drawn from a
      dedicated `np.random.RandomState(random_state)` so the shuffle is
      reproducible and the global NumPy RNG is left untouched. When None
      (default) the global NumPy RNG is used, exactly as before.

  Returns:
    Tuple `(X[p], y[p])` for the drawn permutation `p`.

  Raises:
    ValueError: if X and y do not contain the same number of samples.
  """
  n_samples = X.shape[0]
  n_labels = np.shape(y)[0]
  # Without this check a longer y would be silently sub-sampled by y[p].
  if n_labels != n_samples:
    raise ValueError(
        "X and y must have the same number of samples, got %d and %d"
        % (n_samples, n_labels))

  if random_state is None:
    p = np.random.permutation(n_samples)
  else:
    p = np.random.RandomState(random_state).permutation(n_samples)
  return X[p], y[p]


# One (initially empty) sub-dict per feature variant, in the same key order as
# the ID tables.
news = {name: {} for name in news_ids}
movie = {name: {} for name in movie_ids}

# Download every pickled dataset, unpack its four arrays and store shuffled
# train/test splits under the variant's key (news first, then movie).
# NOTE: pickle.load can execute arbitrary code contained in the file; only use
# this with Drive files you trust.
for id_table, store in ((news_ids, news), (movie_ids, movie)):
  for key, file_id in id_table.items():
    X_train, y_train, X_test, y_test = pickle.load(load_file_by_id(file_id))
    split = store[key]
    split['X_train'], split['y_train'] = shuffle(X_train, y_train)
    split['X_test'], split['y_test'] = shuffle(X_test, y_test)




In [0]:
def cv_eval_save(name, grid, X_train, y_train, X_test, y_test, y_labels, avg):
  """Fit a hyper-parameter search, evaluate its best model and pickle the results.

  The results are written to the local file `name` only; downloading that file
  (e.g. with `files.download`) is left to the caller.

  Args:
    name: Path of the pickle file to write.
    grid: Unfitted search object exposing `fit`, `predict`, `cv_results_` and
      `best_index_` (e.g. a GridSearchCV built with refit=True). It is fitted
      in place.
    X_train, y_train: Training data passed to `grid.fit`.
    X_test, y_test: Held-out data used only for the final evaluation.
    y_labels: Class display names forwarded to `evaluate_model`.
    avg: F1 averaging mode forwarded to `evaluate_model`.

  Returns:
    Dict with keys 'grid', 'val_acc', 'val_std', 'train_acc', 'train_std',
    'train_results' and 'test_results'. 'train_acc'/'train_std' are None when
    the search was built without return_train_score=True.
  """
  import copy  # local import keeps this cell self-contained

  grid.fit(X_train, y_train)

  best = grid.best_index_
  cv_results = grid.cv_results_

  # Mean/std of the cross-validation score for the best candidate.
  val_acc = cv_results['mean_test_score'][best]
  val_std = cv_results['std_test_score'][best]

  # Train-fold scores are only recorded when the search was created with
  # return_train_score=True; report None instead of raising KeyError otherwise.
  if 'mean_train_score' in cv_results:
    train_acc = cv_results['mean_train_score'][best]
    train_std = cv_results['std_train_score'][best]
  else:
    train_acc = None
    train_std = None

  train_results = evaluate_model(grid, X_train, y_train, y_labels, avg) # results on whole training set
  test_results = evaluate_model(grid, X_test, y_test, y_labels, avg) # results on test set

  # Store a snapshot of the fitted search. The notebook passes the same search
  # object to several calls; without the copy every earlier results dict would
  # end up pointing at whichever dataset was fitted last.
  results = {'grid': copy.deepcopy(grid), 'val_acc': val_acc, 'val_std': val_std, 'train_acc': train_acc,
            'train_std': train_std, 'train_results': train_results, 'test_results': test_results}

  with open(name, 'wb') as f:
    pickle.dump(results, f)

  return results


def evaluate_model(model, X, y, y_labels, avg):
  """Score a fitted model on (X, y) and plot its confusion matrix.

  Args:
    model: Object with a `predict(X)` method.
    X: Samples to predict.
    y: True labels for X.
    y_labels: Tick labels for both axes of the confusion-matrix heatmap.
    avg: Value passed as `average=` to `f1_score`.

  Returns:
    Dict with keys 'acc', 'f1', 'conf_matrix' and 'conf_matrix_plot' (the
    matplotlib figure holding the annotated heatmap).
  """
  predictions = model.predict(X)

  accuracy = accuracy_score(y, predictions)
  f1 = f1_score(y, predictions, average=avg)
  matrix = confusion_matrix(y, predictions)

  figure = plt.figure(figsize=(10, 8))
  sns.heatmap(matrix, annot=True, cmap="Blues", fmt="d")

  # Heatmap cells are one unit wide, so +0.5 centres each tick on its cell.
  tick_positions = np.arange(matrix.shape[0]) + 0.5
  plt.xticks(tick_positions, y_labels, rotation=90)
  plt.yticks(tick_positions, y_labels, rotation=0)
  plt.xlabel("Predicted Value")
  plt.ylabel("True Value")

  return {
      "acc": accuracy,
      "f1": f1,
      "conf_matrix": matrix,
      "conf_matrix_plot": figure,
  }

# Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression

# Elastic-net logistic regression; "saga" is the solver that supports the
# elastic-net penalty. l1_ratio is not set here -- it is supplied by the
# hyper-parameter grid.
log_rg = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=10000,
    n_jobs=-1,
)

In [0]:
# Hyperparameters over which to optimize: 6 x 6 = 36 candidates.
log_rg_params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
)

In [0]:
# Exhaustive 5-fold search over log_rg_params. return_train_score=True makes
# cv_results_ include the train-fold scores that cv_eval_save reads; refit=True
# refits the best candidate so the search object can be used for prediction.
log_rg_grid = GridSearchCV(
    estimator=log_rg,
    param_grid=log_rg_params,
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

## Movie Reviews

In [0]:
movie_log_rg_results = cv_eval_save("movie_results_log_rg.pkl", log_rg_grid, movie['movie']['X_train'], movie['movie']['y_train'],
             movie['movie']['X_test'], movie['movie']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_results_log_rg.pkl") 

In [0]:
movie_log_rg_results

In [0]:
movie_binary_log_rg_results = cv_eval_save("movie_binary_results_log_rg.pkl", log_rg_grid, movie['movie_binary']['X_train'], movie['movie_binary']['y_train'],
             movie['movie_binary']['X_test'], movie['movie_binary']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_binary_results_log_rg.pkl")

In [0]:
movie_binary_log_rg_results

In [0]:
movie_ngrams_log_rg_results = cv_eval_save("movie_ngrams_results_log_rg.pkl", log_rg_grid, movie['movie_ngrams']['X_train'], movie['movie_ngrams']['y_train'],
             movie['movie_ngrams']['X_test'], movie['movie_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
movie_ngrams_log_rg_results

In [0]:
files.download("movie_ngrams_results_log_rg.pkl")

In [0]:
movie_binary_ngrams_log_rg_results = cv_eval_save("movie_binary_ngrams_results_log_rg.pkl", log_rg_grid, movie['movie_binary_ngrams']['X_train'], movie['movie_binary_ngrams']['y_train'],
             movie['movie_binary_ngrams']['X_test'], movie['movie_binary_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
movie_binary_ngrams_log_rg_results

In [0]:
files.download("movie_binary_ngrams_results_log_rg.pkl")

## News

In [0]:
news_log_rg_results = cv_eval_save("news_results_log_rg.pkl", log_rg_grid, news['news']['X_train'], news['news']['y_train'],
             news['news']['X_test'], news['news']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_results_log_rg.pkl") 

In [0]:
news_log_rg_results

In [0]:
news_binary_log_rg_results = cv_eval_save("news_binary_results_log_rg.pkl", log_rg_grid, news['news_binary']['X_train'], news['news_binary']['y_train'],
             news['news_binary']['X_test'], news['news_binary']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_binary_results_log_rg.pkl") 

In [0]:
news_binary_log_rg_results

In [0]:
news_ngrams_log_rg_results = cv_eval_save("news_ngrams_results_log_rg.pkl", log_rg_grid, news['news_ngrams']['X_train'], news['news_ngrams']['y_train'],
             news['news_ngrams']['X_test'], news['news_ngrams']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_ngrams_results_log_rg.pkl") 

In [0]:
news_ngrams_log_rg_results

In [0]:
news_binary_ngrams_log_rg_results = cv_eval_save("news_binary_ngrams_results_log_rg.pkl", log_rg_grid, news['news_binary_ngrams']['X_train'], news['news_binary_ngrams']['y_train'],
             news['news_binary_ngrams']['X_test'], news['news_binary_ngrams']['y_test'], news_labels, 'micro')

NameError: ignored

In [0]:
files.download("news_binary_ngrams_results_log_rg.pkl") 

In [0]:
news_binary_ngrams_log_rg_results

# Decision Tree


In [0]:
# Decision tree splitting on entropy; the remaining settings are tuned by the grid search.
dec_tree = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
)

In [0]:
# Params for gridsearch: 3 x 3 x 3 x 3 = 81 candidates.
# NOTE(review): the max_depth values are all very large and may never bind in
# practice, which would make the three settings equivalent -- confirm intent.
dec_tree_param = dict(
    min_samples_leaf=[4, 16, 64],
    min_samples_split=[32, 128, 256],
    max_features=[0.2, 0.3, 0.4],
    max_depth=[1000, 10000, 1000000],
)


In [0]:
# Tune parameters: exhaustive 5-fold search over dec_tree_param.
# return_train_score=True makes cv_results_ include the train-fold scores that
# cv_eval_save reads.
dec_tree_grid = GridSearchCV(
    estimator=dec_tree,
    param_grid=dec_tree_param,
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

## Movie


In [0]:
movie_dec_tree_results = cv_eval_save("movie_results_dec_tree.pkl", dec_tree_grid, movie['movie']['X_train'], movie['movie']['y_train'],
             movie['movie']['X_test'], movie['movie']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_results_dec_tree.pkl") 

In [0]:
movie_dec_tree_results

In [0]:
movie_binary_dec_tree_results = cv_eval_save("movie_binary_results_dec_tree.pkl", dec_tree_grid, movie['movie_binary']['X_train'], movie['movie_binary']['y_train'],
             movie['movie_binary']['X_test'], movie['movie_binary']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_binary_results_dec_tree.pkl") 

In [0]:
movie_binary_dec_tree_results

In [0]:
movie_ngrams_dec_tree_results = cv_eval_save("movie_ngrams_results_dec_tree.pkl", dec_tree_grid, movie['movie_ngrams']['X_train'], movie['movie_ngrams']['y_train'],
             movie['movie_ngrams']['X_test'], movie['movie_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_ngrams_results_dec_tree.pkl") 

In [0]:
movie_ngrams_dec_tree_results

In [0]:
movie_binary_ngrams_dec_tree_results = cv_eval_save("movie_binary_ngrams_results_dec_tree.pkl", dec_tree_grid, movie['movie_binary_ngrams']['X_train'], movie['movie_binary_ngrams']['y_train'],
             movie['movie_binary_ngrams']['X_test'], movie['movie_binary_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_binary_ngrams_results_dec_tree.pkl") 

In [0]:
movie_binary_ngrams_dec_tree_results

## News

In [0]:
news_dec_tree_results = cv_eval_save("news_results_dec_tree.pkl", dec_tree_grid, news['news']['X_train'], news['news']['y_train'],
             news['news']['X_test'], news['news']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_results_dec_tree.pkl") 

In [0]:
news_dec_tree_results

In [0]:
news_binary_dec_tree_results = cv_eval_save("news_binary_results_dec_tree.pkl", dec_tree_grid, news['news_binary']['X_train'], news['news_binary']['y_train'],
             news['news_binary']['X_test'], news['news_binary']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_binary_results_dec_tree.pkl") 

In [0]:
news_binary_dec_tree_results

In [0]:
news_ngrams_dec_tree_results = cv_eval_save("news_ngrams_results_dec_tree.pkl", dec_tree_grid, news['news_ngrams']['X_train'], news['news_ngrams']['y_train'],
             news['news_ngrams']['X_test'], news['news_ngrams']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_ngrams_results_dec_tree.pkl") 

In [0]:
news_ngrams_dec_tree_results

In [0]:
news_binary_ngrams_dec_tree_results = cv_eval_save("news_binary_ngrams_results_dec_tree.pkl", dec_tree_grid, news['news_binary_ngrams']['X_train'], news['news_binary_ngrams']['y_train'],
             news['news_binary_ngrams']['X_test'], news['news_binary_ngrams']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_binary_ngrams_results_dec_tree.pkl") 

In [0]:
news_binary_ngrams_dec_tree_results

# Support Vector Machines

In [0]:
# LinearSVC is not imported by the set-up cell (only SVC is), so import it here;
# without this the cell fails with NameError.
from sklearn.svm import LinearSVC

# Linear SVM baseline; C, penalty and loss are overridden by the grid search.
svm = LinearSVC(dual=True, penalty = 'l2', loss='hinge', C = 1, max_iter=10000)

In [0]:
# Params for gridsearch.
# LinearSVC only accepts certain penalty/loss/dual combinations:
#   * penalty='l1' requires loss='squared_hinge' and dual=False
#   * penalty='l2' with loss='hinge' requires dual=True
# A single dict would cross every penalty with every loss under the estimator's
# dual=True, so half of the candidates would fail to fit. A list of sub-grids
# enumerates only the valid combinations (40 + 20 = 60 candidates).
svm_C_values = np.logspace(-1, 1, 20)
SVM_param = [
    {
        'C': svm_C_values,
        'penalty': ['l2'],
        'loss': ['hinge', 'squared_hinge'],
        'dual': [True],
    },
    {
        'C': svm_C_values,
        'penalty': ['l1'],
        'loss': ['squared_hinge'],
        'dual': [False],
    },
]

In [0]:
# Tune parameters: exhaustive 5-fold search over SVM_param.
# return_train_score=True makes cv_results_ include the train-fold scores that
# cv_eval_save reads.
SVM_grid = GridSearchCV(
    estimator=svm,
    param_grid=SVM_param,
    cv=5,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

## Movie

In [0]:
movie_SVM_results = cv_eval_save("movie_results_SVM.pkl", SVM_grid, movie['movie']['X_train'], movie['movie']['y_train'],
             movie['movie']['X_test'], movie['movie']['y_test'], movie_labels,'binary')

In [0]:
files.download("movie_results_SVM.pkl")

In [0]:
movie_SVM_results

In [0]:
movie_binary_SVM_results = cv_eval_save("movie_binary_results_SVM.pkl", SVM_grid, movie['movie_binary']['X_train'], movie['movie_binary']['y_train'],
             movie['movie_binary']['X_test'], movie['movie_binary']['y_test'], movie_labels,'binary')

In [0]:
files.download("movie_binary_results_SVM.pkl")

In [0]:
movie_binary_SVM_results

In [0]:
movie_ngrams_SVM_results = cv_eval_save("movie_ngrams_results_SVM.pkl", SVM_grid, movie['movie_ngrams']['X_train'], movie['movie_ngrams']['y_train'],
             movie['movie_ngrams']['X_test'], movie['movie_ngrams']['y_test'], movie_labels,'binary')

In [0]:
files.download("movie_ngrams_results_SVM.pkl")

In [0]:
movie_ngrams_SVM_results

In [0]:
movie_binary_ngrams_SVM_results = cv_eval_save("movie_binary_ngrams_results_SVM.pkl", SVM_grid, movie['movie_binary_ngrams']['X_train'], movie['movie_binary_ngrams']['y_train'],
             movie['movie_binary_ngrams']['X_test'], movie['movie_binary_ngrams']['y_test'], movie_labels,'binary')

In [0]:
files.download("movie_binary_ngrams_results_SVM.pkl")

In [0]:
movie_binary_ngrams_SVM_results

## News

In [0]:
news_SVM_results = cv_eval_save("news_results_SVM.pkl", SVM_grid, news['news']['X_train'], news['news']['y_train'],
             news['news']['X_test'], news['news']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_results_SVM.pkl")

In [0]:
news_SVM_results

In [0]:
news_binary_SVM_results = cv_eval_save("news_binary_results_SVM.pkl", SVM_grid, news['news_binary']['X_train'], news['news_binary']['y_train'],
             news['news_binary']['X_test'], news['news_binary']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_binary_results_SVM.pkl")

In [0]:
news_binary_SVM_results

In [0]:
news_ngrams_SVM_results = cv_eval_save("news_ngrams_results_SVM.pkl", SVM_grid, news['news_ngrams']['X_train'], news['news_ngrams']['y_train'],
             news['news_ngrams']['X_test'], news['news_ngrams']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_ngrams_results_SVM.pkl")

In [0]:
news_ngrams_SVM_results

In [0]:
news_binary_ngrams_SVM_results = cv_eval_save("news_binary_ngrams_results_SVM.pkl", SVM_grid, news['news_binary_ngrams']['X_train'], news['news_binary_ngrams']['y_train'],
             news['news_binary_ngrams']['X_test'], news['news_binary_ngrams']['y_test'], news_labels, 'micro')

In [0]:
files.download("news_binary_ngrams_results_SVM.pkl")

In [0]:
news_binary_ngrams_SVM_results

# AdaBoost

In [0]:
# AdaBoost with the library's default base learner. The explicit
# `base_estimator=None` was dropped: it only restated the default, and newer
# scikit-learn releases renamed that keyword to `estimator`.
# NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn
# releases -- confirm against the installed version.
ada_b = AdaBoostClassifier(algorithm='SAMME.R')

ada_param = { #Params for gridsearch
    # n_estimators must be integers; float literals such as 1e2 are rejected
    # when the ensemble is fitted, so every candidate is written as an int.
    # NOTE(review): the candidates from 10**5 upwards are unlikely to finish
    # in a Colab session -- consider trimming the list.
    'n_estimators': [10, 50, 100, 1000, 10**5, 10**6, 10**7, 10**8],
    # The original list contained 0.1 twice, which only duplicated fits.
    'learning_rate': [1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.1, 0.5, 1, 5, 10, 100]
}

ada_grid = GridSearchCV(estimator=ada_b, param_grid=ada_param, n_jobs=-1,
                                cv=5, refit=True, verbose=2, return_train_score = True) #Tune Parameters


## Movie

In [0]:
movie_ada_results = cv_eval_save("movie_results_ada.pkl", ada_grid, movie['movie']['X_train'], movie['movie']['y_train'],
             movie['movie']['X_test'], movie['movie']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_results_ada.pkl")

In [0]:
movie_ada_results

In [0]:
movie_binary_ada_results = cv_eval_save("movie_binary_results_ada.pkl", ada_grid, movie['movie_binary']['X_train'], movie['movie_binary']['y_train'],
             movie['movie_binary']['X_test'], movie['movie_binary']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_binary_results_ada.pkl")

In [0]:
movie_binary_ada_results

In [0]:
movie_ngrams_ada_results = cv_eval_save("movie_ngrams_results_ada.pkl", ada_grid, movie['movie_ngrams']['X_train'], movie['movie_ngrams']['y_train'],
             movie['movie_ngrams']['X_test'], movie['movie_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_ngrams_results_ada.pkl")

In [0]:
movie_ngrams_ada_results

In [0]:
movie_binary_ngrams_ada_results = cv_eval_save("movie_binary_ngrams_results_ada.pkl", ada_grid, movie['movie_binary_ngrams']['X_train'], movie['movie_binary_ngrams']['y_train'],
             movie['movie_binary_ngrams']['X_test'], movie['movie_binary_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_binary_ngrams_results_ada.pkl")

In [0]:
movie_binary_ngrams_ada_results

## News

In [0]:
news_ada_results = cv_eval_save("news_results_ada.pkl", ada_grid, news['news']['X_train'], news['news']['y_train'],
             news['news']['X_test'], news['news']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_results_ada.pkl")

In [0]:
news_ada_results

In [0]:
news_binary_ada_results = cv_eval_save("news_binary_results_ada.pkl", ada_grid, news['news_binary']['X_train'], news['news_binary']['y_train'],
             news['news_binary']['X_test'], news['news_binary']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_binary_results_ada.pkl")

In [0]:
news_binary_ada_results

In [0]:
news_ngrams_ada_results = cv_eval_save("news_ngrams_results_ada.pkl", ada_grid, news['news_ngrams']['X_train'], news['news_ngrams']['y_train'],
             news['news_ngrams']['X_test'], news['news_ngrams']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_ngrams_results_ada.pkl")

In [0]:
news_ngrams_ada_results

In [0]:
news_binary_ngrams_ada_results = cv_eval_save("news_binary_ngrams_results_ada.pkl", ada_grid, news['news_binary_ngrams']['X_train'], news['news_binary_ngrams']['y_train'],
             news['news_binary_ngrams']['X_test'], news['news_binary_ngrams']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_binary_ngrams_results_ada.pkl")

In [0]:
news_binary_ngrams_ada_results

# Random Forest

In [0]:
# Random forest; every tree/forest setting below is tuned by the grid search.
rando_F=RandomForestClassifier(n_jobs=-1)


forest_param = { #Params for gridsearch
    'min_samples_leaf': [1, 3, 5, 10, 15, 20],
    'min_samples_split': [2,4,6,8,10],
    'max_features': [None, 0.2, 0.5, 0.7, 'sqrt', 'log2'],
    'max_depth':[None, 10, 20, 40, 80, 150, 300, 500, 1000],
    # n_estimators must be integers; the float literals 1e3 and 1e5 are
    # rejected when the forest is fitted, so they are written as ints.
    'n_estimators': [10, 50, 100, 500, 1000, 100000],
    'bootstrap': [True, False]
}

# NOTE(review): this grid has 6*5*6*9*6*2 = 19,440 candidates (97,200 fits with
# cv=5); the recorded run of the first forest cell was interrupted. Consider a
# smaller grid or RandomizedSearchCV (already imported in the set-up cell).
rando_F_grid = GridSearchCV(estimator=rando_F, param_grid=forest_param, n_jobs=-1,
                                cv=5, refit=True, verbose=2, return_train_score = True) #Tune Parameters


## Movie

In [0]:
movie_RandoF_results = cv_eval_save("movie_results_forest.pkl", rando_F_grid, movie['movie']['X_train'], movie['movie']['y_train'],
             movie['movie']['X_test'], movie['movie']['y_test'], movie_labels, 'binary')

Fitting 5 folds for each of 19440 candidates, totalling 97200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KeyboardInterrupt: ignored

In [0]:
files.download("movie_results_forest.pkl")

In [0]:
movie_RandoF_results

In [0]:
movie_binary_RandoF_results = cv_eval_save("movie_binary_results_forest.pkl", rando_F_grid, movie['movie_binary']['X_train'], movie['movie_binary']['y_train'],
             movie['movie_binary']['X_test'], movie['movie_binary']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_binary_results_forest.pkl")

In [0]:
# Show the results dict returned by cv_eval_save for the 'movie_binary' data.
# (The name previously lacked its trailing "s" and raised NameError.)
movie_binary_RandoF_results

In [0]:
movie_ngrams_RandoF_results = cv_eval_save("movie_ngrams_results_forest.pkl", rando_F_grid, movie['movie_ngrams']['X_train'], movie['movie_ngrams']['y_train'],
             movie['movie_ngrams']['X_test'], movie['movie_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_ngrams_results_forest.pkl")

In [0]:
# Show the results dict returned by cv_eval_save for the 'movie_ngrams' data.
# (The name previously lacked its trailing "s" and raised NameError.)
movie_ngrams_RandoF_results

In [0]:
movie_binary_ngrams_RandoF_results = cv_eval_save("movie_binary_ngrams_results_forest.pkl", rando_F_grid, movie['movie_binary_ngrams']['X_train'], movie['movie_binary_ngrams']['y_train'],
             movie['movie_binary_ngrams']['X_test'], movie['movie_binary_ngrams']['y_test'], movie_labels, 'binary')

In [0]:
files.download("movie_binary_ngrams_results_forest.pkl")

In [0]:
# Show the results dict returned by cv_eval_save for the 'movie_binary_ngrams' data.
# (The name previously lacked its trailing "s" and raised NameError.)
movie_binary_ngrams_RandoF_results

## News

In [0]:
news_RandoF_results = cv_eval_save("news_results_forest.pkl", rando_F_grid, news['news']['X_train'], news['news']['y_train'],
             news['news']['X_test'], news['news']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_results_forest.pkl")

In [0]:
news_RandoF_results

In [0]:
news_binary_RandoF_results = cv_eval_save("news_binary_results_forest.pkl", rando_F_grid, news['news_binary']['X_train'], news['news_binary']['y_train'],
             news['news_binary']['X_test'], news['news_binary']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_binary_results_forest.pkl")

In [0]:
news_binary_RandoF_results

In [0]:
news_ngrams_RandoF_results = cv_eval_save("news_ngrams_results_forest.pkl", rando_F_grid, news['news_ngrams']['X_train'], news['news_ngrams']['y_train'],
             news['news_ngrams']['X_test'], news['news_ngrams']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_ngrams_results_forest.pkl")

In [0]:
news_ngrams_RandoF_results

In [0]:
news_binary_ngrams_RandoF_results = cv_eval_save("news_binary_ngrams_results_forest.pkl", rando_F_grid, news['news_binary_ngrams']['X_train'], news['news_binary_ngrams']['y_train'],
             news['news_binary_ngrams']['X_test'], news['news_binary_ngrams']['y_test'], news_labels, 'weighted')

In [0]:
files.download("news_binary_ngrams_results_forest.pkl")

In [0]:
news_binary_ngrams_RandoF_results

# Naive Bayes

In [0]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes; smoothing strength and class-prior fitting are tuned below.
nb = MultinomialNB() 

nb_params = { #hyperparameters over which to optimize
    # alpha=0 means no smoothing at all. Depending on the scikit-learn version
    # it is either clipped to 1e-10 with a warning or used as-is, which takes
    # log(0) for unseen features. Using the smallest supported value explicitly
    # gives the "effectively unsmoothed" candidate on every version.
    'alpha': [1e-10, 0.001, 0.01, 0.1, 1, 10, 100],
    'fit_prior': [True, False]
}

# Exhaustive 5-fold search; return_train_score=True makes cv_results_ include
# the train-fold scores that cv_eval_save reads.
nb_grid = GridSearchCV(estimator=nb, param_grid=nb_params, n_jobs=-1,
                                cv=5, refit=True, verbose=2, return_train_score = True)