In [None]:
import pandas as pd

In [None]:
# Load the review table; the 'filename' column becomes the row index.
df = pd.read_csv(
    'data/reviews.csv',
    index_col='filename',
)

In [None]:
# Show the first 10 rows as a quick check of the loaded table.
df.head(10)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out half of the reviews for evaluation.
# - random_state pins the shuffle, so the split (and every metric computed from it)
#   is the same after Restart & Run All.
# - .copy() gives each half its own data, so adding columns to df_test afterwards
#   does not raise pandas' SettingWithCopyWarning.
df_train, df_test = (
    part.copy()
    for part in train_test_split(df, test_size=0.5, random_state=42)
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Bag-of-words -> TF-IDF -> random forest. The step names ('vect', 'tfidf', 'clf')
# are what text_clf['<name>'] uses to look a step up.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),  # unigrams only
    ('tfidf', TfidfTransformer()),
    # Fixed seed: the forest is randomised, so without it the feature importances
    # and the reported metrics change on every run.
    ('clf', RandomForestClassifier(random_state=42)),
])

In [None]:
# Name of the df column holding the text variant fed to the pipeline.
# The results log in this notebook records runs for: body_text, alpha_text,
# nostop_text and lemmatized_text.
text_type = 'lemmatized_text'

In [None]:
# Fit the vectorizer, the TF-IDF weighting and the classifier in one pass,
# using only the training half.
text_clf.fit(
    X=df_train[text_type],
    y=df_train['review'],
)

In [None]:
# Peek at ten of the vocabulary terms learned by the 'vect' step.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
text_clf['vect'].get_feature_names_out()[100:110]

In [None]:
# Vocabulary size, i.e. the number of columns the vectorizer produces.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# (requires scikit-learn >= 1.0).
len(text_clf['vect'].get_feature_names_out())

In [None]:
# Inverse-document-frequency weights stored on the fitted 'tfidf' step.
text_clf['tfidf'].idf_

In [None]:
# Display the 'clf' pipeline step (the RandomForestClassifier) to inspect its settings.
text_clf['clf']

In [None]:
# Importance scores exposed by the fitted forest; the next cell pairs them
# one-to-one with the vectorizer's feature names.
text_clf['clf'].feature_importances_

In [None]:
# One row per vocabulary term, paired with the importance the forest assigned to it.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns the same names in the same column order.
feature_df = pd.DataFrame({
    "feature": text_clf['vect'].get_feature_names_out(),
    'importance': text_clf['clf'].feature_importances_,
})

In [None]:
# Rank the terms from most to least important.
features_sorted = feature_df.sort_values('importance', ascending=False)

In [None]:
# Lift the row-display cap so all 100 rows below are rendered.
# The option name must be fully qualified: pandas resolves option names by regex,
# and on pandas >= 1.4 the bare pattern "max_rows" also matches
# "styler.render.max_rows", which raises OptionError.
pd.set_option("display.max_rows", None)
features_sorted.head(100)

In [None]:
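# You can do this to see the word-count vectors, but with this much data you might not want to...
# Use transform(), not fit_transform(): refitting the pipeline's vectorizer in place would change
# its vocabulary and break the already-fitted classifier.
#X_train = text_clf['vect'].transform(df_train[text_type])
#print(X_train[:100])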

In [None]:
# Number of rows in the importance table (one per feature name).
len(features_sorted)

In [None]:
# Predict a label for every held-out review, reading the same text column
# the pipeline was fitted on.
y_pred = text_clf.predict(df_test[text_type])

In [None]:
#y_pred

In [None]:
# Store the predicted label beside each held-out row.
# Note: this adds a column to df_test in place.
df_test['pred'] = y_pred

In [None]:
# Class-membership probabilities for every held-out review (one column per class).
y_proba = text_clf.predict_proba(df_test[text_type])

In [None]:
# Show the raw probability array.
y_proba

In [None]:
# Take column 1 of the probability matrix as a plain list.
# NOTE(review): column 1 corresponds to text_clf.classes_[1]; this is presumably the
# positive label (1) - confirm against the label encoding.
pos_proba = list(y_proba[:, 1])

In [None]:
#pos_proba

In [None]:
# Store the column-1 probability beside each held-out row.
# Note: this adds a column to df_test in place.
df_test['proba'] = pos_proba

In [None]:
# Preview df_test, which now carries the added 'pred' and 'proba' columns.
df_test.head(10)

In [None]:
# Try a few lines of already-cleaned text: what would be retained from these sentences?
# Exactly one document_text assignment should be active; uncomment another to compare.
document_text = """its not especially good but it isnt especially bad either"""
#document_text = """movie was bad acting was poor script was terrible plot was trite"""
#document_text = """movie was great acting was excellent script was magnificent and wonderful plot was exciting"""

In [None]:
# let's try: https://www.rogerebert.com/reviews/marmaduke-2010
# what would be retained from this from our data cleaning pipeline? Does it make a big difference?

In [None]:
# Class probabilities for the single ad-hoc document. It is wrapped in a list
# because the pipeline is given a collection of documents, not a bare string.
text_clf.predict_proba([document_text])

Accuracy:
How often did the predicted value match the actual value?

Precision:
When you predicted a positive result, how often was the actual value positive?

Recall:
When the actual value was positive, how often did you predict a positive value?

Visual:
https://en.wikipedia.org/wiki/Precision_and_recall

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve

In [None]:
# Score the hold-out predictions, treating label 1 as the positive class.
precision, recall, fscore, train_support = precision_recall_fscore_support(
    df_test['review'],
    y_pred,
    pos_label=1,
    average='binary',
)
# Accuracy is the share of predictions that equal the true label.
print(
    f"Precision: {round(precision, 3)} / "
    f"Recall: {round(recall, 3)} / "
    f"Accuracy: {round((y_pred == df_test['review']).sum() / len(y_pred), 3)} / "
    f"fscore: {round(fscore, 3)}"
)

In [None]:
# my results for multiple runs
# body_text - Precision: 0.735 / Recall: 0.742 / Accuracy: 0.745
# alpha_text - Precision: 0.788 / Recall: 0.65 / Accuracy: 0.74
# nostop_text - Precision: 0.799 / Recall: 0.704 / Accuracy: 0.76
# lemmatized_text - Precision: 0.789 / Recall: 0.65 / Accuracy: 0.734

In [None]:
# cross validation
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the pipeline on the held-out half.
# The target must be the TRUE labels: scoring against y_pred (the model's own
# predictions) only measures how well a refit model imitates the first one,
# not how accurate it is.
# text_type replaces the hard-coded column name so the folds use the same text
# variant as the rest of the notebook.
cross_val = cross_val_score(text_clf, df_test[text_type], df_test['review'], cv=5)

In [None]:
# Display the per-fold scores (five values, since cv=5).
cross_val