In [None]:
%cd drive/My\ Drive/moviereviews

/content/drive/My Drive/moviereviews


In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

In [None]:
# Load the saved array from 'clean_df.npy' and label its three columns.
# NOTE: allow_pickle=True unpickles arbitrary Python objects while loading,
# so only point this at a file from a trusted source.
df = pd.DataFrame(
    data=np.load('clean_df.npy', allow_pickle=True),
    columns=['title', 'score', 'text'],
)

In [None]:
# Raw review ratings as floats. Kept under their own name so that `scores`
# only ever refers to the binary labels used for training.
ratings = df['score'].values.astype(float)
print(ratings)
# Binarise: divide by 5 and round to the nearest integer. np.round rounds
# halves to the nearest even value, so a rating of exactly 2.5 becomes 0 and
# ratings above 2.5 (up to 5) become 1.
scores = np.round(ratings / 5).astype(int)
print(scores)

[5.  5.  0.5 ... 3.  3.5 3. ]
[1 1 0 ... 1 1 1]


In [None]:
# Join the elements of each row's 'text' entry with single spaces, giving one
# string per review (the vectoriser below consumes plain strings).
text = df['text'].str.join(" ")
# Preview the first few joined reviews.
text.head()

0            outstanding delightfully clever movie way
1    parasite seduce subtle suggestion path power w...
2    predictable unbelievable film unaware genre be...
3    parasite shorten release despite lack pertinen...
4    listen- like rlly rad film absolute fuck -pron...
Name: text, dtype: object

In [None]:
# Learn the vocabulary / IDF weights and vectorise the corpus in one pass:
# fit_transform is equivalent to fit() followed by transform() but does not
# tokenise every review twice. `tfidf` stays available as the fitted vectoriser.
# NOTE(review): the vectoriser is fitted on every review, i.e. before
# calculate_results splits off its test set, so the IDF weights have seen the
# test documents - confirm this is acceptable for the reported scores.
tfidf = TfidfVectorizer()
# Convert the sparse result to a dense array for the classifiers below.
tfidf_text = tfidf.fit_transform(text.values).toarray()

In [None]:
def calculate_results(X, y, classifier, test_size=0.33, random_state=0):
    """Fit ``classifier`` on a train split of ``(X, y)`` and evaluate it on the rest.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels aligned with the rows of ``X``.
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place and the same object is returned in the result.
    test_size : float or int, default=0.33
        Forwarded to ``train_test_split``.
    random_state : int or None, default=0
        Forwarded to ``train_test_split`` so every classifier is evaluated
        on the same split.

    Returns
    -------
    dict
        ``'test_score'`` - accuracy on the held-out split,
        ``'test_preds'`` - predictions for the held-out split,
        ``'model'`` - the fitted ``classifier``.
    """
    # Imported locally so this cell does not depend on an extra import cell.
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    classifier.fit(X_train, y_train)

    # Predict once and derive the accuracy from those predictions. Calling
    # classifier.score() and then classifier.predict() ran inference on the
    # test split twice, which is costly for slow-to-predict models.
    test_preds = classifier.predict(X_test)
    return {
        'test_score': accuracy_score(y_test, test_preds),
        'test_preds': test_preds,
        'model': classifier,
    }

In [None]:
# Collects one entry per classifier, keyed by a short model name; each value
# is the dict returned by calculate_results.
results = {}

In [None]:
# Multi-layer perceptron with early stopping; seeded so the run is repeatable,
# verbose so the per-iteration loss / validation score is printed.
mlpc = MLPClassifier(
    early_stopping=True,
    verbose=True,
    random_state=42,
)
results['MLPC'] = calculate_results(X=tfidf_text, y=scores, classifier=mlpc)

Iteration 1, loss = 0.59409721
Validation score: 0.831715
Iteration 2, loss = 0.46205617
Validation score: 0.831715
Iteration 3, loss = 0.39292471
Validation score: 0.831715
Iteration 4, loss = 0.33599925
Validation score: 0.846278
Iteration 5, loss = 0.26252121
Validation score: 0.860841
Iteration 6, loss = 0.19822675
Validation score: 0.865696
Iteration 7, loss = 0.15266827
Validation score: 0.872168
Iteration 8, loss = 0.12111427
Validation score: 0.877023
Iteration 9, loss = 0.09810823
Validation score: 0.877023
Iteration 10, loss = 0.08171839
Validation score: 0.878641
Iteration 11, loss = 0.06909371
Validation score: 0.878641
Iteration 12, loss = 0.05938437
Validation score: 0.878641
Iteration 13, loss = 0.05150901
Validation score: 0.877023
Iteration 14, loss = 0.04527777
Validation score: 0.878641
Iteration 15, loss = 0.04009201
Validation score: 0.873786
Iteration 16, loss = 0.03588797
Validation score: 0.872168
Iteration 17, loss = 0.03240211
Validation score: 0.878641
Iterat

In [None]:
# Inspect the MLP entry: fitted model, test-set predictions and test accuracy.
results['MLPC']

{'model': MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=True, epsilon=1e-08,
               hidden_layer_sizes=(100,), learning_rate='constant',
               learning_rate_init=0.001, max_fun=15000, max_iter=200,
               momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
               power_t=0.5, random_state=42, shuffle=True, solver='adam',
               tol=0.0001, validation_fraction=0.1, verbose=True,
               warm_start=False),
 'test_preds': array([0, 1, 1, ..., 1, 1, 1]),
 'test_score': 0.8745072273324573}

In [None]:
# Shallow decision tree (at most three levels), seeded for repeatability.
dtc = DecisionTreeClassifier(
    random_state=42,
    max_depth=3,
)
results['DTC'] = calculate_results(X=tfidf_text, y=scores, classifier=dtc)

In [None]:
# Random forest of 200 trees trained on all available cores.
# random_state is fixed (as for the MLP and decision-tree cells) because the
# forest's bootstrap and feature sampling are random; without a seed the test
# score changes from run to run.
rfc = RandomForestClassifier(
    n_estimators=200, n_jobs=-1, verbose=True, random_state=42
)
results['RFC'] = calculate_results(tfidf_text, scores, rfc)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.8min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    1.7s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    1.7s finished


In [None]:
# AdaBoost with library defaults, plus a fixed seed so the result is
# repeatable and consistent with the other seeded classifiers.
abc = AdaBoostClassifier(random_state=42)
results['ABC'] = calculate_results(tfidf_text, scores, abc)

In [None]:
# Gaussian naive Bayes with library defaults, evaluated on the shared split.
gnb = GaussianNB()
results['GNB'] = calculate_results(
    X=tfidf_text,
    y=scores,
    classifier=gnb,
)

In [None]:
# Complement naive Bayes with library defaults, evaluated on the shared split.
cnb = ComplementNB()
results['CNB'] = calculate_results(
    X=tfidf_text,
    y=scores,
    classifier=cnb,
)

In [None]:
# Quadratic discriminant analysis with library defaults, evaluated on the
# shared split.
qda = QuadraticDiscriminantAnalysis()
results['QDA'] = calculate_results(
    X=tfidf_text,
    y=scores,
    classifier=qda,
)



In [None]:
# Show every collected entry (model, test predictions, test accuracy) at once.
results

{'ABC': {'model': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                     n_estimators=50, random_state=None),
  'test_preds': array([1, 1, 1, ..., 1, 1, 1]),
  'test_score': 0.8597240473061761},
 'CNB': {'model': ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False),
  'test_preds': array([1, 1, 1, ..., 1, 1, 1]),
  'test_score': 0.8406701708278581},
 'DTC': {'model': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                         max_depth=3, max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort='deprecated',
                         random_state=42, splitter='best'),
  'test_preds': array([1, 1, 1, ..., 1, 1, 1]),
  'test_score': 0.8472404730617609},
 'GNB': {'model': GaussianNB(priors=None, var_smo

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the MLP search.
# 'learning_rate' is deliberately not part of this grid: MLPClassifier only
# uses that setting when solver='sgd', so crossing its three values with
# 'lbfgs' / 'adam' tripled the number of candidates (48 instead of 8) without
# changing any fitted model. To tune the schedule, search it in a separate
# grid restricted to solver='sgd'.
parameter_space = {
    'hidden_layer_sizes': [(50, 50, 50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['lbfgs', 'adam'],
}

In [None]:
# Base estimator cloned by the grid search for every candidate / fold.
# Seeded (like the stand-alone MLP above) so that differences between
# candidates come from the hyper-parameters rather than from random weight
# initialisation, and so the search is repeatable.
model = MLPClassifier(verbose=True, early_stopping=True, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over parameter_space using 2-fold cross-validation,
# running on all available cores and logging each fit.
gscv = GridSearchCV(
    estimator=model,
    param_grid=parameter_space,
    cv=2,
    n_jobs=-1,
    verbose=3,
)
gscv.fit(tfidf_text, scores)

Fitting 2 folds for each of 48 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 20.4min


In [None]:
# Best parameter set found by the grid search
print('Best parameters found:\n', gscv.best_params_)

In [None]:


# All results: one line per candidate parameter combination.
# Mean and standard deviation of the cross-validated test score, in the same
# order as gscv.cv_results_['params'].
means = gscv.cv_results_['mean_test_score']
stds = gscv.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gscv.cv_results_['params']):
    # The "+/-" figure printed is two standard deviations (std * 2).
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))