In [7]:
import random
import string
class Sentiment:
    negative = "negative"
    positive = "positive"
    neutral = "neutral"
class Review:
    def __init__(self, text, score):
        translator = str.maketrans('', '', string.punctuation)
        self.text = text.lower().translate(translator)
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.negative
        elif self.score >= 4:
            return Sentiment.positive
        else:
            return Sentiment.neutral
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.negative, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.positive, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        

In [8]:
import json

file_name = "Books_small_10000.json"

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [9]:
from sklearn.model_selection import train_test_split
import numpy as np
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)
train_container.evenly_distribute()
test_container.evenly_distribute()
len(train_container.reviews)

872

In [36]:
X_train = train_container.get_text()
y_train = train_container.get_sentiment()

X_test = test_container.get_text()
y_test = test_container.get_sentiment()

y_train.count(Sentiment.positive)
# y_train.count(Sentiment.negative)

436

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
import sklearn
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer() 
# term is important if it doesn't appear as frequently
# 'great' and 'bad' have more weight than 'this' and 'was'
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# SVM

In [39]:
from sklearn.svm import SVC

clf_svm = SVC(kernel='linear')
clf_svm.fit(X_train_vectors, y_train)
accuracy = clf_svm.score(X_test_vectors, y_test)
print(round(accuracy * 100, 2), '%')

81.49 %


# Decision Tree

In [40]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(X_train_vectors, y_train)
accuracy = clf_dec.score(X_test_vectors, y_test)
print(round(accuracy * 100, 2), '%')

65.62 %


# Naive Bayes

In [41]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
X_train_vectors_gnb = X_train_vectors.toarray()
X_test_vectors_gnb = X_test_vectors.toarray()
y_train = np.array(y_train)
clf_gnb.fit(X_train_vectors_gnb, y_train)
accuracy = clf_gnb.score(X_test_vectors_gnb, y_test)
print(round(accuracy * 100, 2), '%')

66.11 %


# Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=400)
clf_log.fit(X_train_vectors, y_train)
accuracy = clf_log.score(X_test_vectors, y_test)
print(round(accuracy * 100, 2), '%')

79.57 %


# F1 Score

In [43]:
from sklearn.metrics import f1_score
f1_score(y_test, clf_svm.predict(X_test_vectors), average=None, labels=[Sentiment.positive, Sentiment.negative])
f1_score(y_test, clf_dec.predict(X_test_vectors), average=None, labels=[Sentiment.positive, Sentiment.negative])
f1_score(y_test, clf_gnb.predict(X_test_vectors_gnb), average=None, labels=[Sentiment.positive, Sentiment.negative])
f1_score(y_test, clf_log.predict(X_test_vectors), average=None, labels=[Sentiment.positive, Sentiment.negative])

array([0.79115479, 0.8       ])

# See which examples the SVM scored incorrectly

In [58]:
for i in range(20):
    if clf_svm.predict(X_test_vectors[i]) != y_test[i]:
        print('Review: ', X_test[i], '\n')
        print('Prediction', clf_svm.predict(X_test_vectors[i]), '\n')
        break

Review:  hard to put this book down i found i really cared about the characters and was drawn into the story 

Prediction ['negative'] 



# Input your own rating!

In [64]:
test_set = [input("Rate our book: ")]
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

Rate our book:  definitely the worst adaptation of the little red story i have ever read. the illustrations are passable, but the story was terrible. 


array(['positive'], dtype='<U8')

# Tuning our model with Grid Search

In [47]:
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':(1, 4, 16, 32)}

svc = SVC(gamma='scale')
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(X_train_vectors, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf'), 'C': (1, 4, 16, 32)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [48]:
accuracy = clf.score(X_test_vectors, y_test)
print(round(accuracy * 100, 2), '%')

81.49 %


# Saving Model with Pickle

In [49]:
import pickle

with open('sentiment_classifier.pkl', 'wb') as f: # wb means writing buffer (write)
    pickle.dump(clf_svm, f)
    

# Load model

In [50]:
with open('sentiment_classifier.pkl', 'rb') as f: # read file
    loaded_clf = pickle.load(f)

In [51]:
loaded_clf.predict(X_test_vectors[0])
print(y_test[0])

positive
