In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('reviews.csv', index_col='filename')

In [3]:
df.head(10)

Unnamed: 0_level_0,text,review,alpha_text,nostop_text,lemmatized_text
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cv439_tok-13632.txt,"the farrelly brothers' third film , after dumb...",1,the farrelly brothers third film after dumb...,farrelly brothers third film dumb dumber kingp...,farrelly brother third film dumb dumber kingpi...
cv180_tok-20034.txt,more movie views by jamey hughton at : http : ...,1,more movie views by jamey hughton at http ...,movie views jamey hughton http welcome movievi...,movie view jamey hughton http welcome movievie...
cv553_tok-13743.txt,if chris farley had strapped some fake mutton-...,1,if chris farley had strapped some fake mutton ...,chris farley strapped fake mutton chop sidebur...,chris farley strapped fake mutton chop sidebur...
cv026_tok-29622.txt,"in zoolander , the world's most successful , i...",1,in zoolander the world s most successful i...,zoolander world successful influential intelle...,zoolander world successful influential intelle...
cv126_tok-13691.txt,if the current trends of hollywood filmmaking ...,1,if the current trends of hollywood filmmaking ...,current trends hollywood filmmaking continue p...,current trend hollywood filmmaking continue pr...
cv344_tok-29737.txt,"for more reviews and movie trailers , visit ht...",1,for more reviews and movie trailers visit ht...,reviews movie trailers visit http www joblo co...,review movie trailer visit http www joblo com ...
cv345_tok-9908.txt,director : george armitage cast : john cusack ...,1,director george armitage cast john cusack ...,director george armitage cast john cusack minn...,director george armitage cast john cusack minn...
cv165_tok-11425.txt,written by : peter wang and shirley sunstarrin...,1,written by peter wang and shirley sunstarrin...,written peter wang shirley sunstarring peter w...,written peter wang shirley sunstarring peter w...
cv245_tok-19462.txt,if you¹ve been paying attention to the media f...,1,if you ve been paying attention to the media f...,paying attention media frenzy blair witch proj...,paying attention medium frenzy blair witch pro...
cv156_tok-12349.txt,director : john sayles || screenplay : john sa...,1,director john sayles screenplay john sa...,director john sayles screenplay john sayles st...,director john sayles screenplay john sayles st...


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score

In [7]:
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
    ])

In [8]:
text_type = 'lemmatized_text'

In [9]:
text_clf.fit(df_train[text_type], df_train['review'])

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

In [10]:
text_clf['vect'].get_feature_names()[100:110]

['acceptance',
 'accepted',
 'accepting',
 'acception',
 'accepts',
 'access',
 'accessibility',
 'accessible',
 'accident',
 'accidental']

In [11]:
len(text_clf['vect'].get_feature_names())

22639

In [12]:
text_clf['tfidf'].idf_

array([5.46303042, 6.84932478, 6.84932478, ..., 6.84932478, 5.75071249,
       6.84932478])

In [13]:
text_clf['clf']

RandomForestClassifier()

In [14]:
text_clf['clf'].feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [15]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names(), 
                           'importance': text_clf['clf'].feature_importances_})

In [16]:
features_sorted = feature_df.sort_values(by=['importance'], ascending=False)

In [17]:
pd.set_option("max_rows", None)
features_sorted.head(100)

Unnamed: 0,feature,importance
1386,bad,0.006555
2201,boring,0.003712
14771,performance,0.003483
12174,make,0.003091
19593,supposed,0.003029
22319,wonderfully,0.002917
539,also,0.002824
12452,material,0.002811
22034,well,0.002729
21911,wasted,0.002718


In [35]:
# you can do this to see the word distribution vector, but with this much data, you might not want to...
#X_train = text_clf['vect'].fit_transform(df['alpha_text'])
#print(X_train[:100])

In [19]:
len(features_sorted)

22639

In [20]:
y_pred = text_clf.predict(df_test[text_type])

In [21]:
#y_pred

In [22]:
precision, recall, fscore, train_support = score(df_test['review'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==df_test['review']).sum()/len(y_pred), 3)))

Precision: 0.872 / Recall: 0.584 / Accuracy: 0.737


In [23]:
# my results for multiple runs
# body_text - Precision: 0.735 / Recall: 0.742 / Accuracy: 0.745
# alpha_text - Precision: 0.788 / Recall: 0.65 / Accuracy: 0.74
# nostop_text - Precision: 0.799 / Recall: 0.704 / Accuracy: 0.76
# lemmatized_text - Precision: 0.81 / Recall: 0.67 / Accuracy: 0.753

In [24]:
df_test['pred'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['pred'] = y_pred


In [25]:
y_proba = text_clf.predict_proba(df_test[text_type])

In [26]:
y_proba

array([[0.76, 0.24],
       [0.57, 0.43],
       [0.4 , 0.6 ],
       ...,
       [0.49, 0.51],
       [0.65, 0.35],
       [0.5 , 0.5 ]])

In [27]:
pos_proba = [item[1] for item in y_proba]

In [28]:
#pos_proba

In [29]:
df_test['proba'] = pos_proba

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['proba'] = pos_proba


In [30]:
df_test

Unnamed: 0_level_0,text,review,alpha_text,nostop_text,lemmatized_text,pred,proba
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cv495_tok-18551.txt,"for more reviews and movie screensavers , visi...",0,for more reviews and movie screensavers visi...,reviews movie screensavers visit http www jobl...,review movie screensavers visit http www joblo...,0,0.24
cv675_tok-11864.txt,capsule : perversely entertaining hong kong ac...,1,capsule perversely entertaining hong kong ac...,capsule perversely entertaining hong kong acti...,capsule perversely entertaining hong kong acti...,0,0.43
cv506_tok-13396.txt,moon over parador ( 1988 ) 1/4 . produced and ...,1,moon over parador produced and ...,moon parador produced directed paul mazursky w...,moon parador produced directed paul mazursky w...,1,0.6
cv257_tok-10937.txt,capsule : a love story hidden inside a half-se...,1,capsule a love story hidden inside a half se...,capsule love story hidden inside half serious ...,capsule love story hidden inside half serious ...,0,0.37
cv363_tok-21128.txt,"cast : john saxon , ronee blakley , heather la...",0,cast john saxon ronee blakley heather la...,cast john saxon ronee blakley heather langenka...,cast john saxon ronee blakley heather langenka...,1,0.51
cv487_tok-11827.txt,director : harold becker screenwriter : lawren...,0,director harold becker screenwriter lawren...,director harold becker screenwriter lawrence k...,director harold becker screenwriter lawrence k...,1,0.52
cv332_tok-11084.txt,"slavery is bad . after hundreds of years , we'...",1,slavery is bad after hundreds of years we ...,slavery bad hundreds years finally figured one...,slavery bad hundred year finally figured one a...,0,0.49
cv584_tok-14395.txt,"release date : september 11 , 1998 starring : ...",1,release date september starring ...,release date september starring ian michael sm...,release date september starring ian michael sm...,1,0.56
cv448_tok-10768.txt,seen on 31 december 1997 with tony for free at...,0,seen on december with tony for free at...,seen december tony free loews village viias mo...,seen december tony free loews village viias mo...,0,0.45
cv671_tok-10077.txt,a miramax pictures release . starring robin wi...,1,a miramax pictures release starring robin wi...,miramax pictures release starring robin willia...,miramax picture release starring robin william...,1,0.59


In [31]:
# let's try: https://www.rogerebert.com/reviews/marmaduke-2010
document_text = """its not especially good but it isnt especially bad either"""
#document_text = """movie was bad acting was poor script was terrible plot was trite"""
#document_text = """movie was great acting was excellent script was magnificent and wonderful plot was exciting"""

In [32]:
text_clf.predict_proba([document_text])

array([[0.61, 0.39]])