In [10]:
import numpy as np
import pandas as pd

# Load the reviews dataset from a tab-separated file; the path is relative to
# the notebook's working directory. Later cells read its 'review' and 'label'
# columns.
df = pd.read_csv('../TextFiles/moviereviews2.tsv', sep='\t')

# Drop missing values / blanks
def missing_vals(df):
    """Remove incomplete rows from ``df`` in place.

    A row is removed when any of its cells is null (as decided by
    ``DataFrame.dropna``), or when any string cell is empty or consists
    solely of whitespace. Non-string cells are never treated as blank.

    The frame may have any number of columns; rows are identified by their
    index label, so every row sharing a dropped label is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
        The cleaned data is available through the ``df`` object passed in.
    """
    df.dropna(inplace=True)

    def _is_blank(value):
        # strip() removes exactly the characters isspace() recognises, so this
        # catches both '' and whitespace-only strings.
        return isinstance(value, str) and not value.strip()

    # name=None yields plain tuples shaped (index_label, *cell_values); the
    # star-unpacking adapts to however many columns the frame has.
    blanks = [
        label
        for label, *cells in df.itertuples(name=None)
        if any(_is_blank(cell) for cell in cells)
    ]
    df.drop(index=blanks, inplace=True)

# Clean `df` in place: missing_vals mutates the frame it is given and returns
# nothing, so there is no result to assign.
missing_vals(df)

In [16]:
from sklearn.model_selection import train_test_split
# Inputs come from the 'review' column, targets from the 'label' column.
X, y = df['review'], df['label']

# Reserve 33% of the rows for testing; the fixed random_state keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=24,
)

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Naïve Bayes:
nb_text_classifier = Pipeline(
    steps=[
        # Step 1: vectorise the raw text with TF-IDF.
        ('tfidf', TfidfVectorizer()),
        # Step 2: multinomial naive Bayes classifier on the vectorised text.
        ('clf', MultinomialNB()),
    ]
)

# Random Forest
rf_text_classifier = Pipeline([
    # Step 1: vectorise the raw text with TF-IDF.
    ('tfidf', TfidfVectorizer()),
    # Step 2: random forest classifier. The forest is built with randomness,
    # so it is seeded to make the reported metrics repeatable across runs;
    # 24 matches the seed used for the train/test split.
    ('rf', RandomForestClassifier(random_state=24)),
])

In [29]:
from sklearn import metrics
def model_results(model):
    """Train ``model`` and print how it scores on the held-out split.

    The function reads the notebook-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` variables rather than taking them as
    arguments, so those must exist before it is called.

    Printed, in order: the confusion matrix, the classification report and
    the accuracy rounded to two decimals.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called on
        the object that is passed in.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion)

    report = metrics.classification_report(y_test, y_pred)
    print(report)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: ", round(accuracy, 2))

In [30]:
# Fit the naive Bayes pipeline on the training split and print its confusion
# matrix, classification report and accuracy for the test split.
model_results(nb_text_classifier)

[[935  34]
 [147 858]]
              precision    recall  f1-score   support

         neg       0.86      0.96      0.91       969
         pos       0.96      0.85      0.90      1005

    accuracy                           0.91      1974
   macro avg       0.91      0.91      0.91      1974
weighted avg       0.91      0.91      0.91      1974

Accuracy:  0.91


In [31]:
# Fit the random forest pipeline on the training split and print its confusion
# matrix, classification report and accuracy for the test split.
model_results(rf_text_classifier)

[[852 117]
 [138 867]]
              precision    recall  f1-score   support

         neg       0.86      0.88      0.87       969
         pos       0.88      0.86      0.87      1005

    accuracy                           0.87      1974
   macro avg       0.87      0.87      0.87      1974
weighted avg       0.87      0.87      0.87      1974

Accuracy:  0.87


In [54]:
# Spot-check the naive Bayes pipeline on a hand-written review.
myreview = 'I thought it was great.'
# predict() is given a one-element list holding the single review.
sample_batch = [myreview]
print(nb_text_classifier.predict(sample_batch))

['pos']


In [55]:
# Spot-check the naive Bayes pipeline on a second hand-written review.
myreview = 'I thought it was terrible.'
# predict() is given a one-element list holding the single review.
sample_batch = [myreview]
print(nb_text_classifier.predict(sample_batch))

['neg']
