In [8]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Keep printed NumPy arrays compact: two decimal places, lines wrapped at 80 columns.
np.set_printoptions(linewidth=80, precision=2)

In [None]:
# Number of leading rows used for training; every remaining row goes to the test set.
TRAIN_SIZE = 35000

dataset = pd.read_csv("movie_reviews.csv")

print(dataset.head())
reviews = np.array(dataset["review"])
sentiments = np.array(dataset["sentiment"])

# Fail early with a clear message instead of silently producing an empty test split.
if len(reviews) <= TRAIN_SIZE:
    raise ValueError(
        f"Dataset has {len(reviews)} rows; need more than {TRAIN_SIZE} "
        "to leave a non-empty test split."
    )

# Positional (unshuffled) split.
# NOTE(review): assumes row order in the CSV is not correlated with sentiment — confirm.
train_reviews = reviews[:TRAIN_SIZE]
train_sentiments = sentiments[:TRAIN_SIZE]
test_reviews = reviews[TRAIN_SIZE:]
test_sentiments = sentiments[TRAIN_SIZE:]

# Each split is passed through text_normalizer.normalize_corpus separately
# (see that module for the exact normalization steps).
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Bag-of-words: raw term counts over unigrams and bigrams. min_df=0.0 / max_df=1.0
# mean no term is dropped on document-frequency grounds. The vocabulary is learned
# from the training reviews only and then applied unchanged to the test reviews.
cv = CountVectorizer(
    binary=False,
    min_df=0.0,
    max_df=1.0,
    ngram_range=(1, 2),
)
cv_train_features = cv.fit_transform(norm_train_reviews)
cv_test_features = cv.transform(norm_test_reviews)

# TF-IDF: same n-gram range and document-frequency bounds, with IDF weighting and
# sublinear term-frequency scaling enabled. Fitted on train, applied to test.
tv = TfidfVectorizer(
    use_idf=True,
    min_df=0.0,
    max_df=1.0,
    ngram_range=(1, 2),
    sublinear_tf=True,
)
tv_train_features = tv.fit_transform(norm_train_reviews)
tv_test_features = tv.transform(norm_test_reviews)

# Report the feature-matrix shapes for both representations.
print(
    "BOW model:> Train features shape", cv_train_features.shape,
    "Test features shape:", cv_test_features.shape,
)
print(
    "TFIDF model:> Train features shape", tv_train_features.shape,
    "Test features shape", tv_test_features.shape,
)

In [None]:
# L2-regularised logistic regression, capped at 100 solver iterations.
lr = LogisticRegression(penalty="l2", max_iter=100, C=1)
# Linear SVM trained with stochastic gradient descent (hinge loss).
# SGDClassifier shuffles the training data between epochs by default, so the seed is
# pinned to make results reproducible across "Restart Kernel -> Run All".
svm = SGDClassifier(loss="hinge", random_state=42)

In [None]:
# Logistic Regression model on bag-of-words features.
# `lr` and the BOW train/test matrices are handed to the project helper, which
# presumably fits on the training split and returns test-set predictions
# (see model_evaluation_utils.train_predict_model).
lr_bow_predictions = meu.train_predict_model(
    classifier=lr,
    train_features=cv_train_features,
    train_labels=train_sentiments,
    test_features=cv_test_features,
    test_labels=test_sentiments,
)

# NOTE(review): the helper name is spelled "perfomance" here — confirm it matches
# the definition in model_evaluation_utils.
meu.display_model_perfomance_metrics(
    true_labels=test_sentiments,
    predicted_labels=lr_bow_predictions,
    classes=["positive", "negative"],
)

In [None]:
# Logistic Regression model on TF-IDF features.
# NOTE(review): this passes the same `lr` instance that was used for the BOW run;
# presumably the helper refits it from scratch — confirm in model_evaluation_utils.
lr_tfidf_predictions = meu.train_predict_model(
    classifier=lr,
    train_features=tv_train_features,
    train_labels=train_sentiments,
    test_features=tv_test_features,
    test_labels=test_sentiments,
)

# NOTE(review): the helper name is spelled "perfomance" here — confirm it matches
# the definition in model_evaluation_utils.
meu.display_model_perfomance_metrics(
    true_labels=test_sentiments,
    predicted_labels=lr_tfidf_predictions,
    classes=["positive", "negative"],
)