In [1]:
class Sentiment:
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

In [2]:
import json 

file_name = "data/sentiment/Books_small_10000.json"

reviews = []
try:
    with open(file_name) as f:
        for line in f:
            review_json = json.loads(line)
            # Assuming 'overall' key exists and it is used for the score
            # and 'reviewText' key exists for the review text
            if 'reviewText' in review_json and 'overall' in review_json:
                review = Review(review_json['reviewText'], review_json['overall'])
                reviews.append(review)
except FileNotFoundError:
    print(f"File not found: {file_name}")
except json.JSONDecodeError:
    print("Error decoding JSON")
except Exception as e:
    print(f"An error occurred: {e}")

# Assuming there are at least 6 reviews in the list to safely access index 5
if len(reviews) > 5:
    print(reviews[5].text)
else:
    print("Not enough reviews loaded.")

I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia's trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character's voice on a strong subject and making it so that other peoples story may be heard through Mia's.


In [3]:
from sklearn.model_selection import train_test_split
import collections

sentiments = [review.sentiment for review in reviews]

# Split the data, stratifying by sentiment to ensure equal distribution of sentiments in train and test sets
train, test = train_test_split(reviews, test_size=0.33, random_state=42, stratify=sentiments)

def sentiment_distribution(reviews):
    return collections.Counter([review.sentiment for review in reviews])

original_dist = sentiment_distribution(reviews)
train_dist = sentiment_distribution(train)
test_dist = sentiment_distribution(test)

print("Original Distribution:", original_dist)
print("Training Distribution:", train_dist)
print("Testing Distribution:", test_dist)

Original Distribution: Counter({'POSITIVE': 8378, 'NEUTRAL': 978, 'NEGATIVE': 644})
Training Distribution: Counter({'POSITIVE': 5613, 'NEUTRAL': 655, 'NEGATIVE': 432})
Testing Distribution: Counter({'POSITIVE': 2765, 'NEUTRAL': 323, 'NEGATIVE': 212})


In [4]:
train_x = [x.text for x in train]
train_y = [x.sentiment for x in train]

test_x  = [x.text for x in test]
text_y = [x.sentiment for x in test]

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
#vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)



In [6]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vectors, train_y)

print(test_x[0])
clf_svm.predict(test_x_vectors[0])

I love this series between Avery and Sean but I am getting so frustrated  I need a resolve to this story how much longer is it going as I get the other Ferro stories and don't think I can I last. Great Story just wish was in a full novel edition instead of snippets.This episode it great and once again a cliff hanger what can i say Iam hooked


array(['POSITIVE'], dtype='<U8')

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [8]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)  # Convert sparse matrix to dense array for training
clf_gnb.predict(test_x_vectors[0].reshape(1, -1).toarray())  # Convert single test vector to dense array


array(['POSITIVE'], dtype='<U8')

In [9]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [10]:
print(clf_svm.score(test_x_vectors, text_y))
print(clf_log.score(test_x_vectors, text_y))
print(clf_dec.score(test_x_vectors, text_y))
print(clf_gnb.score(test_x_vectors.toarray(), text_y))

0.8518181818181818
0.8484848484848485
0.7748484848484849
0.6627272727272727


In [11]:
from sklearn.metrics import f1_score

f1_score(text_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.NEUTRAL, Sentiment.NEGATIVE, Sentiment.POSITIVE])
#f1_score(text_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.NEUTRAL, Sentiment.NEGATIVE, Sentiment.POSITIVE])
#f1_score(text_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.NEUTRAL, Sentiment.NEGATIVE, Sentiment.POSITIVE])

array([0.10810811, 0.30258303, 0.92297365])

In [12]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)


In [13]:
print(clf.score(test_x_vectors, text_y))


0.8518181818181818


In [14]:
f1_score(text_y, clf.predict(test_x_vectors), average=None, labels=[Sentiment.NEUTRAL, Sentiment.NEGATIVE, Sentiment.POSITIVE])


array([0.10810811, 0.30258303, 0.92297365])

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Parameter grid to search
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'], 
    'solver': ['liblinear'] 
}

# Create a LogisticRegression object
clf_log = LogisticRegression()
grid_search = GridSearchCV(clf_log, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(train_x_vectors, train_y)

best_clf_log = grid_search.best_estimator_

prediction = best_clf_log.predict(test_x_vectors[0].reshape(1, -1))

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
print("Prediction for first test example:", prediction)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score: 0.8505970149253731
Prediction for first test example: ['POSITIVE']


In [16]:
import pickle

with open("./models/sentimental_headline.pkl", 'wb') as f:
    pickle.dumb(clf,f)

FileNotFoundError: [Errno 2] No such file or directory: './models/sentimental_headline.pkl'

In [None]:

with open("./models/sentimental_headline.pkl", 'wb') as f:
    loaded_model = pickle.load(f)

print(test_x[0])
loaded_model.predict(test_x_vectors[0])