In [None]:
import pandas as pd

In [None]:
# Read the reviews dataset from a project-relative CSV path into a DataFrame.
df = pd.read_csv('data/reviews.csv')

In [None]:
# Preview the first 10 rows of the loaded data.
df.head(10)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out half of the rows for evaluation. The fixed random_state makes the
# shuffle identical on every Restart & Run All, so the metrics and the
# individual test rows inspected further down stay comparable between runs.
df_train, df_test = train_test_split(df, test_size=0.5, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score

In [None]:
# Text classification pipeline: token counts -> TF-IDF weighting -> random forest.
# Fitting or predicting with text_clf runs the three steps in this order.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),  # single-word tokens only
    ('tfidf', TfidfTransformer()),
    # Seed the forest so the fitted trees, the feature importances and the
    # reported metrics are reproducible across kernel restarts.
    ('clf', RandomForestClassifier(random_state=42)),
])

In [None]:
# Name of the DataFrame column whose text is used for training and prediction.
text_type = 'alpha_text'

In [None]:
# Fit every pipeline step on the training split: the selected text column is
# the input and the 'review' column is the target label.
text_clf.fit(df_train[text_type], df_train['review'])

In [None]:
# Peek at ten vocabulary terms learned by the vectorizer.
# get_feature_names_out() is the replacement for get_feature_names(), which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
text_clf['vect'].get_feature_names_out()[100:110]

In [None]:
# Number of distinct terms in the fitted vocabulary.
# Uses get_feature_names_out(); get_feature_names() was removed in scikit-learn 1.2.
len(text_clf['vect'].get_feature_names_out())

In [None]:
# Inverse-document-frequency weights held by the fitted TF-IDF step.
text_clf.named_steps['tfidf'].idf_

In [None]:
# Show the classifier step of the pipeline (its repr lists the settings in use).
text_clf.named_steps['clf']

In [None]:
# Importance score the fitted forest assigns to each input feature.
text_clf.named_steps['clf'].feature_importances_

In [None]:
# Pair each vocabulary term with the importance the forest assigned to it.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
feature_df = pd.DataFrame({
    "feature": text_clf['vect'].get_feature_names_out(),
    'importance': text_clf['clf'].feature_importances_,
})

In [None]:
# Rank the terms from most to least important.
features_sorted = feature_df.sort_values('importance', ascending=False)

In [None]:
# The ten terms with the highest importance.
features_sorted.head(10)

In [None]:
# Row count of the ranked feature table (one row per vocabulary term).
features_sorted.shape[0]

In [None]:
# Predict a label for every held-out row, using the same text column as in training.
y_pred = text_clf.predict(df_test[text_type])

In [None]:
# y_pred  # uncomment to inspect the raw predicted labels

In [None]:
# Score the held-out predictions, treating label 1 as the positive class.
precision, recall, fscore, train_support = score(
    y_true=df_test['review'], y_pred=y_pred, average='binary', pos_label=1
)
# Accuracy is the share of test rows whose prediction equals the true label.
# All three figures are rounded to three decimals before printing.
print('Precision: {} / Recall: {} / Accuracy: {}'.format(*[
    round(value, 3)
    for value in (precision, recall, (y_pred == df_test['review']).sum() / len(y_pred))
]))

In [None]:
# Metrics printed by the evaluation cell for each text column tried as text_type:
# body_text - Precision: 0.735 / Recall: 0.742 / Accuracy: 0.745
# alpha_text - Precision: 0.788 / Recall: 0.65 / Accuracy: 0.74
# nostop_text - Precision: 0.799 / Recall: 0.704 / Accuracy: 0.76
# lemmatized_text - Precision: 0.81 / Recall: 0.67 / Accuracy: 0.753

In [None]:
# Store the predicted labels as column 'pred' of df_test (modifies df_test in place).
df_test['pred'] = y_pred

In [None]:
# Probability estimates from the pipeline for the same held-out rows.
y_proba = text_clf.predict_proba(df_test[text_type])

In [None]:
# Inspect the raw predict_proba output.
y_proba

In [None]:
# Take the second entry of every row of y_proba, the same element the
# per-row item[1] lookup selected, as a single NumPy column slice instead of
# a Python-level loop. The result is an array rather than a list.
# NOTE(review): presumably this is the probability of label 1; confirm the
# column order against text_clf.classes_.
pos_proba = y_proba[:, 1]

In [None]:
# Show only the first few probabilities; displaying the whole sequence would
# put one entry per test row into the cell output.
pos_proba[:10]

In [None]:
# Store the extracted probabilities as column 'proba' of df_test (modifies df_test in place).
df_test['proba'] = pos_proba

In [None]:
# Display the test frame, which by this point carries the 'pred' and 'proba' columns.
df_test

In [None]:
from pandasql import sqldf
def pysqldf(q):
    """Run SQL query string `q` through pandasql's sqldf against this notebook's global names."""
    return sqldf(q, globals())

In [None]:
# Display df_test indexed by row_id. The result is not assigned, so df_test
# itself keeps its existing index.
df_test.set_index('row_id')

In [None]:
# Rows whose true label is 0 but whose prediction differs, highest 'proba'
# first. The query is assembled from adjacent string literals rather than a
# backslash continuation inside one literal, which embedded the source
# indentation in the SQL and would break if anything followed the backslash.
pysqldf(
    "SELECT {}, review, pred, proba "
    "FROM df_test "
    "WHERE review = 0 AND review != pred "
    "ORDER BY proba DESC".format(text_type)
)

In [None]:
# Every test row with all columns, ordered by ascending 'proba' (lowest first).
pysqldf("SELECT * FROM df_test ORDER BY proba asc")

In [None]:
# Display df_test indexed by row_id. The result is not assigned, so df_test
# itself keeps its existing index.
df_test.set_index("row_id")

In [None]:
# Look up the review text by its row_id. set_index() returns a new frame and
# the earlier calls never stored it, so df_test is still indexed by its
# original row labels; re-index here so that 741 is matched against row_id.
# A single .loc[row, column] also replaces the chained .loc[...][...] lookup.
df_test.set_index('row_id').loc[741, 'body_text']