In [4]:
# MACHINE LEARNING #
# Every machine learning task follows roughly the same steps:
# 1. Decide what question we are trying to answer.
# 2. Find data that will help answer the question, so that a model can be built around it.
# 3. Prepare the data (processing, filtering, ...).
# 4. Once the data is prepped, build a model around it.
# 5. Once we have the model, evaluate how well it performs.
# 6. Improve the model.
# scikit-learn helps with the whole process: it packages up classification algorithms,
# regression algorithms, clustering algorithms, etc.

import random

# DATA CLASS #
# An enum-like class of label constants: using these names instead of retyping the strings
# keeps the labels consistent and guards against accidental typos.
class Sentiment:
    """Namespace holding the three sentiment labels used across the notebook.

    The attributes are plain strings. Referring to them by name means a
    misspelt label raises AttributeError instead of silently introducing a
    new label value.
    """

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"
    
# Review wraps one raw record in an object, which keeps things neat when the data gets messy
class Review:
    """A single review: its text, its numeric score and the derived sentiment label.

    Attributes:
        text: The review text as passed in.
        score: The numeric score as passed in.
        sentiment: A Sentiment label derived from score when the object is built.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once here; it is not refreshed if score is reassigned later.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score onto a Sentiment label.

        Returns:
            Sentiment.NEGATIVE for scores up to and including 2,
            Sentiment.NEUTRAL for scores above 2 up to and including 3,
            Sentiment.POSITIVE for anything higher.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # Range check instead of `== 3`: a fractional score such as 2.5 would
        # otherwise skip both branches and be labelled POSITIVE.
        if self.score <= 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Holds a list of review objects and exposes them as parallel text / label lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in the current order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every held review, in the current order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance to equal numbers of NEGATIVE and POSITIVE reviews, then shuffle.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Both classes
        are trimmed to the size of the smaller one, so the result is balanced
        whichever class is in the majority. self.reviews is rebound to a new list;
        the list originally passed to the constructor is not modified.

        Args:
            seed: Optional seed for a private random generator, giving a
                repeatable order. When None (the default) the module-level
                random.shuffle is used, so the order follows the global
                random state.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH sides: trimming only the positives leaves the data skewed
        # whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        balanced = negative[:keep] + positive[:keep]

        # Shuffle so the classes are interleaved rather than grouped.
        if seed is None:
            random.shuffle(balanced)
        else:
            random.Random(seed).shuffle(balanced)
        self.reviews = balanced

# LOAD DATA #
import json

file_name = 'Books_small_10000.json'

# Collect one Review per record. The file is read line by line and each line is
# parsed as its own JSON document.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
        if not line.strip():
            continue
        review = json.loads(line)
        # Keep only the two fields needed: the text and the overall score.
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check the review at index 5: score, text and derived sentiment.
print(reviews[5].score)
print(reviews[5].text)
print(reviews[5].sentiment)

5.0
I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia's trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character's voice on a strong subject and making it so that other peoples story may be heard through Mia's.
POSITIVE


In [6]:
# ML models work on numerical data, so the text must be converted into numeric vectors.
# Bag-of-words model: build a dictionary of all the words in the texts, then represent each
# sentence as a vector over that dictionary. Words that appear in the sentence are marked 1,
# words that do not appear are marked 0.

# When building ML models, we want one subset of the data for training and a separate
# subset for testing.


# PREP DATA #
from sklearn.model_selection import train_test_split

# Fraction of the reviews held out for testing; the rest is used for training.
TEST_SIZE = 0.33
# Arbitrary fixed seed so the split (and the shuffles below) are the same on every run.
RANDOM_STATE = 42

training, test = train_test_split(reviews, test_size=TEST_SIZE, random_state=RANDOM_STATE)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# evenly_distribute() shuffles with the global `random` generator, so seed it here;
# otherwise the sample order changes on every run even though the split is fixed.
random.seed(RANDOM_STATE)

# Balance the training set, then split it into texts (x) and sentiment labels (y).
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# Same for the test set.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# After balancing, the two counts should match.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


In [7]:
# BAG OF WORDS VECTORIZATION #
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Why TF-IDF rather than plain counts:
#   CountVectorizer weights every word equally. In "this book is great", "this"
#   and "great" get the same weight even though "this" carries little meaning.
#   TfidfVectorizer (term frequency - inverse document frequency) raises the
#   weight of a term that occurs often within a review and lowers the weight of
#   a term that occurs in many documents.
# The notebook originally used CountVectorizer; TF-IDF proved more useful here,
# although in this case the gain was mainly for the SVM.

# The vectorizer turns the review texts into a matrix, one row per review.
vectorizer = TfidfVectorizer()

# Training texts: learn the vocabulary/weights and transform in one step.
train_x_vectors = vectorizer.fit_transform(train_x)

# Test texts: transform only. Fitting again would build a new vocabulary that
# no longer matches the training matrix.
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review next to its vector representation.
for sample in (train_x[0], train_x_vectors[0]):
    print(sample)

Actual 4.5 Star ReviewA Perfect Passion from The Passion Series by Piper Kay!True passion and loyalty has been smacking Damien in the face for a very long time. What he finds as passionate such as betrayal and danger could cost him his life. Dax the hot pool boy is apparently straight and this beautiful guy comes with a lot of baggage and I mean a lot! These guys have to soon realize that taking a risk can be so rewarding; spiritually and emotionally!Damien and Dax characters were so well written that I so enjoyed reading them! The author's details about these men is so hot and steamy, I was panting through each page and wanting more and more. Damien's ex-boyfriend, Aaron is not the nicest guy and comes back into his life to create some major havoc. Will Damien find true love? Will Aaron cause too much damage?How this brilliant author writes about the incredibly hot sex scenes was well written and yet there was a great story to accompany the steamy scenes. Here knowledge of the gay man

In [10]:
# CLASSIFICATION #

# Linear SVM #
from sklearn import svm

# Support vector classifier with a linear kernel.
clf_svm = svm.SVC(kernel='linear')

# Fit the classifier on the training vectors (x) and their labels (y).
clf_svm.fit(train_x_vectors, train_y)

# Print the first test review so the prediction below can be checked by eye.
print(test_x[0])

# Predicted sentiment for that review.
print(clf_svm.predict(test_x_vectors[0]))


# Decision tree #
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the fitted tree, and therefore its score, is the same
# on every run; without it the tree can differ between runs.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predicted sentiment for the same review.
print(clf_dec.predict(test_x_vectors[0]))

# Naive Bayes #
# Left disabled. NOTE: GaussianNB expects dense arrays, so the sparse TF-IDF
# matrices would need .toarray() before being passed in:
#from sklearn.naive_bayes import GaussianNB

#clf_niv = GaussianNB()
#clf_niv.fit(train_x_vectors.toarray(), train_y)

#print(clf_niv.predict(test_x_vectors[0].toarray()))

# Logistic Regression #
# etc #

J.J. Knight has nailed it again! A HOT MMA Fighter and a young woman looking to survive! Jo has nothing, been on her own since she was seventeen and now falling for Colt! But there are complications! Another women seems to have a hold on him, yet he is drawn to Jo. He wants Jo to trust him but she's scared and doesn't know how to. Can't wait to see how this all comes together. You rock me J.J. Knight!!!!!
['POSITIVE']
['NEGATIVE']


In [11]:
# EVALUATION #
from sklearn.metrics import f1_score

# score() returns the mean accuracy on the given test data and labels.
for model in (clf_svm, clf_dec):
    print(model.score(test_x_vectors, test_y))

# Accuracy alone is not enough; also look at the F1 score per label.
# F1 combines precision and recall into one number: best value 1, worst value 0.
# average=None reports one score per entry in `labels` (POSITIVE first, then
# NEGATIVE) instead of a single averaged figure.
for model in (clf_svm, clf_dec):
    print(f1_score(test_y, model.predict(test_x_vectors), average=None,
                   labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))

# History: before the data was balanced, every model did badly on the neutral
# and negative labels. When different models fail in the same way it points to
# a data problem, not a model problem, so the label counts were checked. They
# were heavily skewed towards positive (an earlier run recorded 552 positive,
# 71 neutral and 47 negative), which explains the bias.
# The fix was to load a much larger data file and balance it with the
# ReviewContainer class defined above. With balanced data the scores went up.
for label in (Sentiment.POSITIVE, Sentiment.NEGATIVE):
    print(train_y.count(label))

# Next step: drive the scores even higher.

0.8076923076923077
0.6418269230769231
[0.80582524 0.80952381]
[0.63390663 0.64941176]
436
436


In [14]:
#  GRID SEARCH #
# Tune the SVM hyperparameters with GridSearchCV #
from sklearn.model_selection import GridSearchCV

# Every combination of these kernel and C values is tried.
parameters = dict(kernel=('linear', 'rbf'), C=(1, 4, 8, 16, 32))

# Base estimator whose hyperparameters the search fills in.
svc = svm.SVC()

# cv is the number of folds the training data is split into for cross-validation.
clf = GridSearchCV(svc, parameters, cv=5)

# fit() returns the search object itself, so printing clf after fitting shows
# the same representation as printing the return value of fit().
clf.fit(train_x_vectors, train_y)
print(clf)
print(clf.score(test_x_vectors, test_y))

# An earlier run reported an accuracy of about 0.8197 after the grid search.
# Compare the printed score with the plain linear SVM accuracy from the
# evaluation step rather than assuming the search always helps.



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
0.8076923076923077


In [17]:
# SAVING MODEL #
import pickle

# Trained models are saved so they can be reused later (e.g. in production)
# without retraining.
with open('sentiment_classifier.pkl', 'wb') as f:
    # Serialise the fitted classifier (clf) into the file f.
    pickle.dump(clf, f)

# The classifier only understands vectors produced by the fitted vectorizer, so
# the vectorizer is saved too; without it the saved model cannot score raw text.
with open('sentiment_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


# LOAD MODEL #
# WARNING: pickle.load can execute arbitrary code. Only load files you created
# yourself or otherwise trust.
with open('sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

with open('sentiment_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# End-to-end check using only the loaded objects: raw text -> vector -> prediction.
print(loaded_clf.predict(loaded_vectorizer.transform([test_x[0]])))

['POSITIVE']
