In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/reviews.csv', index_col='filename')

In [3]:
df.head(10)

Unnamed: 0_level_0,text,review,alpha_text,nostop_text,lemmatized_text
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cv439_tok-13632.txt,"the farrelly brothers' third film , after dumb...",1,the farrelly brothers third film after dumb...,farrelly brothers third film dumb dumber kingp...,farrelly brother third film dumb dumber kingpi...
cv180_tok-20034.txt,more movie views by jamey hughton at : http : ...,1,more movie views by jamey hughton at http ...,movie views jamey hughton http welcome movievi...,movie view jamey hughton http welcome movievie...
cv553_tok-13743.txt,if chris farley had strapped some fake mutton-...,1,if chris farley had strapped some fake mutton ...,chris farley strapped fake mutton chop sidebur...,chris farley strapped fake mutton chop sidebur...
cv026_tok-29622.txt,"in zoolander , the world's most successful , i...",1,in zoolander the world s most successful i...,zoolander world successful influential intelle...,zoolander world successful influential intelle...
cv126_tok-13691.txt,if the current trends of hollywood filmmaking ...,1,if the current trends of hollywood filmmaking ...,current trends hollywood filmmaking continue p...,current trend hollywood filmmaking continue pr...
cv344_tok-29737.txt,"for more reviews and movie trailers , visit ht...",1,for more reviews and movie trailers visit ht...,reviews movie trailers visit http www joblo co...,review movie trailer visit http www joblo com ...
cv345_tok-9908.txt,director : george armitage cast : john cusack ...,1,director george armitage cast john cusack ...,director george armitage cast john cusack minn...,director george armitage cast john cusack minn...
cv165_tok-11425.txt,written by : peter wang and shirley sunstarrin...,1,written by peter wang and shirley sunstarrin...,written peter wang shirley sunstarring peter w...,written peter wang shirley sunstarring peter w...
cv245_tok-19462.txt,if you¹ve been paying attention to the media f...,1,if you ve been paying attention to the media f...,paying attention media frenzy blair witch proj...,paying attention medium frenzy blair witch pro...
cv156_tok-12349.txt,director : john sayles || screenplay : john sa...,1,director john sayles screenplay john sa...,director john sayles screenplay john sayles st...,director john sayles screenplay john sayles st...


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [7]:
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
    ])

In [8]:
text_type = 'lemmatized_text'

In [9]:
text_clf.fit(df_train[text_type], df_train['review'])

In [11]:
text_clf['vect'].get_feature_names_out()[100:110]

array(['accept', 'acceptable', 'acceptance', 'accepted', 'accepting',
       'acception', 'accepts', 'access', 'accessible', 'accessory'],
      dtype=object)

In [12]:
len(text_clf['vect'].get_feature_names_out())

22513

In [13]:
text_clf['tfidf'].idf_

array([5.14457669, 6.84932478, 6.84932478, ..., 6.84932478, 6.84932478,
       5.75071249])

In [14]:
text_clf['clf']

In [15]:
text_clf['clf'].feature_importances_

array([2.17379867e-04, 9.13693872e-05, 0.00000000e+00, ...,
       4.95411712e-05, 0.00000000e+00, 0.00000000e+00])

In [17]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [18]:
features_sorted = feature_df.sort_values(by=['importance'], ascending=False)

In [20]:
features_sorted

Unnamed: 0,feature,importance
1358,bad,0.007734
19301,stupid,0.004430
8555,great,0.003711
21820,wasted,0.003370
15856,quite,0.003172
...,...,...
8530,grateful,0.000000
8529,grate,0.000000
8528,grasshopper,0.000000
8527,grassby,0.000000


In [21]:
pd.set_option("display.max_rows", None)
features_sorted.head(100)

Unnamed: 0,feature,importance
1358,bad,0.007734
19301,stupid,0.00443
8555,great,0.003711
21820,wasted,0.00337
15856,quite,0.003172
21927,well,0.003053
553,always,0.003022
17532,screenplay,0.002995
12319,material,0.002792
1176,attempt,0.002756


In [22]:
# you can do this to see the word distribution vector, but with this much data, you might not want to...
#X_train = text_clf['vect'].fit_transform(df['alpha_text'])
#print(X_train[:100])

In [23]:
len(features_sorted)

22513

In [24]:
y_pred = text_clf.predict(df_test[text_type])

In [25]:
#y_pred

In [26]:
df_test['pred'] = y_pred

In [27]:
y_proba = text_clf.predict_proba(df_test[text_type])

In [28]:
y_proba

array([[0.39, 0.61],
       [0.56, 0.44],
       [0.44, 0.56],
       ...,
       [0.43, 0.57],
       [0.51, 0.49],
       [0.59, 0.41]])

In [29]:
pos_proba = [item[1] for item in y_proba]

In [30]:
#pos_proba

In [31]:
df_test['proba'] = pos_proba

In [32]:
df_test.head(10)

Unnamed: 0_level_0,text,review,alpha_text,nostop_text,lemmatized_text,pred,proba
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cv333_tok-29224.txt,"rat race ( 2001 ) . starring rowan atkinson , ...",1,rat race starring rowan atkinson ...,rat race starring rowan atkinson lanai chapman...,rat race starring rowan atkinson lanai chapman...,1,0.61
cv157_tok-29786.txt,the uncompromising nudity bared throughout pet...,1,the uncompromising nudity bared throughout pet...,uncompromising nudity bared throughout petrice...,uncompromising nudity bared throughout petrice...,0,0.44
cv682_tok-16618.txt,directed by neil jordan . cast : eomann owens ...,0,directed by neil jordan cast eomann owens ...,directed neil jordan cast eomann owens stephen...,directed neil jordan cast eomann owen stephen ...,1,0.56
cv041_tok-17672.txt,"release date : april 9 , 1999 starring : katie...",0,release date april starring katie...,release date april starring katie holmes sarah...,release date april starring katie holmes sarah...,0,0.5
cv389_tok-8969.txt,film review ( c ) 1997 by kevin pattersonconta...,1,film review c by kevin pattersonconta...,film review c kevin pattersoncontact pg direct...,film review c kevin pattersoncontact pg direct...,1,0.62
cv448_tok-10768.txt,seen on 31 december 1997 with tony for free at...,0,seen on december with tony for free at...,seen december tony free loews village viias mo...,seen december tony free loews village viias mo...,0,0.39
cv557_tok-4659.txt,carry on matron is the last great carry-on fil...,1,carry on matron is the last great carry on fil...,carry matron last great carry film opinion mad...,carry matron last great carry film opinion mad...,1,0.57
cv460_tok-5367.txt,"when i arrived in paris in june , 1992 , i was...",0,when i arrived in paris in june i was...,arrived paris june surprised find france plast...,arrived paris june surprised find france plast...,0,0.5
cv441_tok-9002.txt,"new line / 1 : 30 / 1997 / r ( language , viol...",0,new line r language viol...,new line r language violence cast chris tucker...,new line r language violence cast chris tucker...,1,0.57
cv697_tok-12765.txt,"united states , 1998 u . s . release date : 5/...",0,united states u s release date ...,united states u release date wide running leng...,united state u release date wide running lengt...,0,0.36


In [33]:
# try a few lines of cleaned text - what would be retained from these sentences? 
document_text = """its not especially good but it isnt especially bad either"""
#document_text = """movie was bad acting was poor script was terrible plot was trite"""
#document_text = """movie was great acting was excellent script was magnificent and wonderful plot was exciting"""

In [34]:
# let's try: https://www.rogerebert.com/reviews/marmaduke-2010
# what would be retained from this from our data cleaning pipeline? Does it make a big difference?

In [35]:
text_clf.predict_proba([document_text])

array([[0.43, 0.57]])

Accuracy:
How often did the predicted value match the actual value

Precision:
When you predicted a positive result, how often was the actual value positive?

Recall:
When the actual value was positive, how often did you predict a positive value?

Visual:
https://en.wikipedia.org/wiki/Precision_and_recall

In [36]:
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve

In [37]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['review'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['review']).sum()/len(y_pred), 3)), round(fscore, 3)))

Precision: 0.757 / Recall: 0.708 / Accuracy: 0.743 / fscore: 0.732


In [38]:
# my results for multiple runs
# body_text - Precision: 0.735 / Recall: 0.742 / Accuracy: 0.745
# alpha_text - Precision: 0.788 / Recall: 0.65 / Accuracy: 0.74
# nostop_text - Precision: 0.799 / Recall: 0.704 / Accuracy: 0.76
# lemmatized_text - 0.789 / Recall: 0.65 / Accuracy: 0.734

In [39]:
# cross validation
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

from sklearn.model_selection import cross_val_score

In [40]:
cross_val = cross_val_score(text_clf, df_test['lemmatized_text'], y_pred, cv=5)

In [41]:
cross_val

array([0.70503597, 0.62589928, 0.73381295, 0.76086957, 0.70289855])