In [107]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

In [108]:
import json

file_name = 'Books_small.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[5].text

'Love the book, great story line, keeps you entertained.for a first novel from this author she did a great job,  Would definitely recommend!'

Prep Data

In [109]:
len(reviews)

1000

In [110]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [111]:
len(training)

670

In [112]:
print(training[0].sentiment + ": " + training[0].text)

POSITIVE: Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.


In [113]:
train_x = [x.text for x in training] 
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

train_y[0]
train_x[0]


"Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down."

In [114]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

print(train_x_vectors[0].toarray())


[[0 0 0 ... 0 0 0]]


Classification

In [115]:
# linear SVM

from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)


In [116]:
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [117]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

clf_dectree = DecisionTreeClassifier()
clf_dectree.fit(train_x_vectors, train_y)

In [118]:
clf_dectree.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [119]:
# naive base

from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

In [120]:
clf_gnb.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

In [121]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [122]:
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

evaluate

In [123]:
#mean accurancy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dectree.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8242424242424242
0.7484848484848485
0.8303030303030303


In [124]:
#f1 score
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

array([0.91370558, 0.12244898, 0.1       ])