In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv('data/trainReviews.tsv', sep='\t')

In [4]:
df_train.head(20)

Unnamed: 0,id,category,text
0,858,0,burnt money is the perfect festival film it...
1,1762,1,the italian hitchcock and acknowledged mas...
2,235,0,at times you d think edtv would be an entert...
3,712,0,after a marketing windup of striking visuals a...
4,1319,1,john cusack is the kind of actor who seems to ...
5,1488,1,every once in a while a movie comes along that...
6,76,0,for better or worse the appearance of basic ...
7,69,0,first rule of fight club is don t talk ab...
8,1144,1,it is hard to imagine that a movie which inclu...
9,75,0,late in down to you the lead female characte...


In [7]:
#df_train.iloc[12]['text']

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.neural_network import MLPClassifier

In [12]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
    ])

In [13]:
text_clf.fit(df_train['text'], df_train['category'])

Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

In [28]:
text_clf['vect'].get_feature_names()

['00',
 '000',
 '007',
 '10',
 '100',
 '1000',
 '101',
 '102',
 '105',
 '108',
 '10th',
 '11',
 '110',
 '113',
 '114',
 '12',
 '122',
 '13',
 '132',
 '137',
 '138',
 '13th',
 '14',
 '143',
 '15',
 '150',
 '151',
 '1521',
 '1590',
 '16',
 '160',
 '1600',
 '1600s',
 '161',
 '165',
 '1692',
 '16mm',
 '16x9',
 '17',
 '170',
 '1799',
 '17th',
 '18',
 '180',
 '1800s',
 '1839',
 '1847',
 '1862',
 '1885',
 '1896',
 '1898',
 '18th',
 '19',
 '1900',
 '1912',
 '1914',
 '1916',
 '1928',
 '1930',
 '1930s',
 '1932',
 '1933',
 '1939',
 '1940',
 '1940s',
 '1944',
 '1945',
 '1947',
 '1948',
 '1949',
 '1950',
 '1950s',
 '1955',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',


In [29]:
text_clf['tfidf'].idf_

array([6.52146092, 3.88240359, 6.52146092, ..., 6.52146092, 5.82831374,
       6.52146092])

In [30]:
text_clf['clf']

RandomForestClassifier()

In [36]:
text_clf['clf'].feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [57]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names(), 
                           'importance': text_clf['clf'].feature_importances_})

In [63]:
features_sorted = feature_df.sort_values(by=['importance'], ascending=False)

In [67]:
features_sorted.head(25)

Unnamed: 0,feature,importance
1478,bad,0.015659
21321,worst,0.006184
8315,great,0.004959
2225,boring,0.004883
11132,life,0.00445
2705,called,0.003403
13917,perfectly,0.003287
15348,reason,0.00316
18733,supposed,0.002973
13148,obvious,0.002909


In [32]:
df_test = pd.read_csv('data/testReviews.tsv', sep='\t')

In [33]:
test_predicted = text_clf.predict(df_test['text'])

In [34]:
test_predicted

array([1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,

In [68]:
df_test['predicted'] = test_predicted

In [35]:
import numpy as np

In [24]:
np.mean(test_predicted == df_test['category'])

0.774

In [25]:
test_predicted_proba = text_clf.predict_proba(df_test['text'])

In [26]:
test_predicted_proba

array([[0.43, 0.57],
       [0.49, 0.51],
       [0.45, 0.55],
       [0.52, 0.48],
       [0.43, 0.57],
       [0.58, 0.42],
       [0.71, 0.29],
       [0.48, 0.52],
       [0.34, 0.66],
       [0.46, 0.54],
       [0.37, 0.63],
       [0.51, 0.49],
       [0.68, 0.32],
       [0.35, 0.65],
       [0.5 , 0.5 ],
       [0.44, 0.56],
       [0.48, 0.52],
       [0.46, 0.54],
       [0.54, 0.46],
       [0.54, 0.46],
       [0.41, 0.59],
       [0.34, 0.66],
       [0.57, 0.43],
       [0.52, 0.48],
       [0.6 , 0.4 ],
       [0.55, 0.45],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.74, 0.26],
       [0.57, 0.43],
       [0.31, 0.69],
       [0.49, 0.51],
       [0.74, 0.26],
       [0.46, 0.54],
       [0.52, 0.48],
       [0.53, 0.47],
       [0.4 , 0.6 ],
       [0.35, 0.65],
       [0.3 , 0.7 ],
       [0.48, 0.52],
       [0.6 , 0.4 ],
       [0.38, 0.62],
       [0.47, 0.53],
       [0.53, 0.47],
       [0.47, 0.53],
       [0.59, 0.41],
       [0.4 , 0.6 ],
       [0.41,