In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import SVC
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from sklearn.grid_search import GridSearchCV

In [4]:
df = pd.read_csv('../../NYT_Reviews_Cleaned.csv')

In [5]:
# update publication date to datetime and set as index
df['publication_date'] = pd.to_datetime(df['publication_date'])
df.set_index('publication_date', inplace=True)

In [6]:
X = df['full_review_text']
y = df['critics_pick']

In [7]:
# check class balance
y.value_counts()/y.count()

0.0    0.794909
1.0    0.205091
Name: critics_pick, dtype: float64

In [9]:
stemmer = PorterStemmer()

def tokenize(s):
    words = word_tokenize(s)
    word_stems = []
    for word in words:
        word_stems.append(stemmer.stem(word))
    return word_stems

In [92]:
pipe = make_pipeline(
    CountVectorizer(strip_accents = 'ascii', analyzer = tokenize, stop_words = 'english', ngram_range=(1,2)), 
    MultinomialNB(alpha = 0.4)
)

In [89]:
param_grid = 
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

In [90]:
grid.fit(X, y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 14.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=<function tokenize at 0x1d4f0b9b0>, binary=False,
        decode_error=u'strict', dtype=<type 'numpy.int64'>,
        encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 2), preproce...      use_idf=True)), ('multinomialnb', MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidftransformer__norm': [None, 'l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [91]:
grid.best_score_

0.58867814450238987

In [64]:
grid.best_params_

{'multinomialnb__alpha': 0.4}

In [93]:
cross_val_score(pipe, X, y, cv=5, verbose = 1, scoring='roc_auc').mean()

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.8min finished


0.69538049265793123

### Try KNN

In [98]:
pipe2 = make_pipeline(
    CountVectorizer(strip_accents = 'ascii', analyzer = tokenize, stop_words = 'english'), 
    KNeighborsClassifier(n_neighbors=30)
)

In [95]:
param_grid = dict(tfidftransformer__norm = [None, 'l1', 'l2'])
grid2 = GridSearchCV(pipe2, param_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

In [96]:
grid2.fit(X,y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 14.6min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=<function tokenize at 0x1d4f0b9b0>, binary=False,
        decode_error=u'strict', dtype=<type 'numpy.int64'>,
        encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...wski',
           metric_params=None, n_jobs=1, n_neighbors=30, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidftransformer__norm': [None, 'l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [97]:
print grid2.best_score_
print grid2.best_params_

0.629179029579
{'tfidftransformer__norm': 'l2'}


In [85]:
cross_val_score(pipe2, X, y, cv=5, verbose = 1, scoring='roc_auc').mean()

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.2min finished


0.63296783082807317

### Try SVM

In [10]:
pipe3 = make_pipeline(
    CountVectorizer(strip_accents = 'ascii', analyzer = tokenize, stop_words = 'english'),
    SVC(kernel = 'rbf', C = 35)
)

In [20]:
param_grid = dict(svc__C = [25,35])
grid3 = GridSearchCV(pipe3, param_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

In [21]:
grid3.fit(X,y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  4.6min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=<function tokenize at 0x11f5156e0>, binary=False,
        decode_error=u'strict', dtype=<type 'numpy.int64'>,
        encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1, param_grid={'svc__C': [25, 35]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [22]:
print grid3.best_score_
print grid3.best_params_

0.784595092651
{'svc__C': 35}
