### Overview

This notebook uses a scikit-learn `Pipeline` (count vectorizer → TF-IDF → random forest) to expand on the exercises from the sentiment-example notebook.

In [None]:
# Toy training corpus: each entry pairs a review text with its sentiment
# label (1 = positive, 0 = negative).
labeled_reviews = [
    ('excellent film, excellent acting, well written screenplay, coherent plot', 1),
    ('mediocre film, unconvincing acting, stilted dialog, incoherent plot', 0),
]
reviews = [text for text, _ in labeled_reviews]
sentiments = [label for _, label in labeled_reviews]

In [None]:
import pandas as pd

In [None]:
# Training frame: one row per review, with its text and its 0/1 sentiment label.
df = pd.DataFrame(data=dict(review=reviews, sentiment=sentiments))

In [None]:
# Show the training frame to confirm each review is paired with its label.
df

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score

In [None]:
# Text-classification pipeline; each step's output feeds the next:
#   vect  - bag-of-words counts over unigrams and bigrams (ngram_range=(1, 2))
#   tfidf - re-weights the raw counts with TF-IDF
#   clf   - random forest trained on the TF-IDF features
# random_state pins the forest's randomness so feature importances and
# predicted probabilities are identical on every Restart & Run All.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit every pipeline step on the raw review text and its sentiment labels.
text_clf.fit(X=df['review'], y=df['sentiment'])

In [None]:
# Vocabulary (unigrams and bigrams) learned by the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
text_clf['vect'].get_feature_names_out()

In [None]:
# Inverse-document-frequency weights learned by the fitted TF-IDF step.
text_clf.named_steps['tfidf'].idf_

In [None]:
# Pair each vocabulary term with the random forest's importance for it.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
feature_df = pd.DataFrame({
    "feature": text_clf['vect'].get_feature_names_out(),
    'importance': text_clf['clf'].feature_importances_,
})

In [None]:
# Show each vocabulary term alongside its feature importance.
feature_df

### Make predictions

Let's create two new reviews, positive and negative, and see how our model predicts their sentiment scores.

In [None]:
# Unseen review expected to be classified as positive (label 1).
positive_review = (
    "excellent film, acting was so so by "
    "but the plot was well thought out"
)

In [None]:
# Unseen review expected to be classified as negative (label 0).
negative_review = (
    "mediocre acting, everything about this was unconvincing, "
    "save your money"
)

In [None]:
# Hard class predictions for the two new reviews, in the order
# [positive_review, negative_review].
y_pred = text_clf.predict([
    positive_review,
    negative_review,
])

In [None]:
# Predicted labels, ordered as [positive_review, negative_review].
y_pred

In [None]:
# Per-class probabilities for the same two reviews: one row per review,
# one column per class.
y_proba = text_clf.predict_proba([
    positive_review,
    negative_review,
])

In [None]:
# Class probabilities, one row per review in the order [positive_review, negative_review].
y_proba

In [None]:
# Collect the new reviews, their hand-assigned labels, the predicted label,
# and the probability taken from column 1 of y_proba into one frame.
df_predictions = pd.DataFrame({
    'review': [positive_review, negative_review],
    'sentiment': [1, 0],
    'pred': y_pred,
    'proba': y_proba[:, 1],
})

In [None]:
# Compare the expected sentiment with the predicted label and probability.
df_predictions

In [None]:
# Score the predictions against the hand-assigned labels of the two new reviews.
# With average='binary' only the pos_label=1 class is scored and the returned
# support is None; train_support is kept only so the name stays defined.
precision, recall, fscore, train_support = score(
    df_predictions['sentiment'], y_pred, pos_label=1, average='binary')

# Accuracy must be measured against the labels of the reviews that were
# actually predicted (df_predictions), not the training frame df, whose labels
# merely happen to have the same length and values.
accuracy = (y_pred == df_predictions['sentiment']).mean()

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))