#### Data Class

In [1]:
import random

class Sentiment:
    """Namespace of the string constants used as sentiment labels.

    This is a plain class with class attributes, not an ``enum.Enum``;
    each attribute is an ordinary ``str``.
    """
    Negative = 'NEGATIVE'
    Neutral = 'NEUTRAL'
    Positive = 'POSITIVE'

class Review:
    """One review: its text, its numeric score and the sentiment label derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to one of the ``Sentiment`` labels.

        A score of 2 or less is negative, a score of exactly 3 is neutral,
        and every other score is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.Negative
        if rating == 3:
            return Sentiment.Neutral
        return Sentiment.Positive
        
class ReviewContainer:
    """Wraps a list of reviews and can rebalance it by sentiment.

    A free function could do the same job; keeping the list in a class
    makes the calling code tidier.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of negative and positive reviews.

        Neutral reviews are dropped. Whichever of the two remaining classes
        is larger is truncated to the size of the smaller one, and the
        combined list is shuffled in place with the module-level ``random``
        generator. ``self.reviews`` is replaced; nothing is returned.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.Negative]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.Positive]

        # Truncate BOTH classes to the smaller count. Truncating only the
        # positives leaves the set unbalanced whenever negatives outnumber
        # positives. When negatives are the minority this is identical to
        # the previous behaviour.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

#### Load File

In [2]:
import json

file_name = './ml_data/Sentimental/Books_small_10000.json'

reviews = []

# The file is read as one JSON object per line. Each record becomes a Review
# object (rather than a bare tuple) so later cells can use .text / .sentiment.
with open(file_name, encoding='utf-8-sig') as f:
    for line in f:
        if not line.strip():
            # A blank line (e.g. a trailing newline at end of file) is not
            # valid JSON and would make json.loads raise; skip it.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        

#### Prep Data

In [3]:
from sklearn.model_selection import train_test_split

# random_state seeds the shuffle done by train_test_split, so the same rows
# land in the training and test sets on every run.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# evenly_distribute() shuffles with the global `random` module, which the
# random_state above does not control. Seed it too so the row order (and
# anything downstream that depends on it) is repeatable after a kernel restart.
random.seed(42)

train_cont = ReviewContainer(training)
test_cont = ReviewContainer(test)
train_cont.evenly_distribute()
test_cont.evenly_distribute()
print(len(train_cont.reviews))

872


In [4]:
# Split each balanced container into two parallel lists:
# the raw review text (inputs) and the sentiment label (targets).
train_x = [review.text for review in train_cont.reviews]
train_y = [review.sentiment for review in train_cont.reviews]

test_x = [review.text for review in test_cont.reviews]
test_y = [review.sentiment for review in test_cont.reviews]

#### Bag-of-Words Vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Alternative vectorizer, kept for comparison:
#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

# Two-step equivalent of the fit_transform call below:
#vectorizer.fit(train_x)
#train_x_vec=vectorizer.transform(train_x)

train_x_vec=vectorizer.fit_transform(train_x) # Fits the vectorizer on the training text and transforms it in one call.

test_x_vec=vectorizer.transform(test_x)# transform only: the vectorizer is fitted on the training text alone and is not re-fitted on the test text.

print(train_x[0])
# NOTE(review): toarray() converts the whole training matrix to a dense array
# just to display it; consider showing the shape or a single row instead.
train_x_vec.toarray()



Let me preface this review by saying I have read and enjoyed every book Grisham has written. Generally he is quite entertaining and the reader usually doesn't know what's coming at the end. Sadly, this book isn't up to the same quality of his other legal fiction.He starts with the suicide of a wealthy man who has just  written (as in ink on paper) a new will. The term holographic will should not be strange to any Grisham fan, as his earlier book, The Testament, begins the same way.I will try not to give away anything we learn at the end, although I will say i figured it out very early in this slow reading book. We do learn early on that there is an earlier will, prepared by some hotshot lawyers at a big out of town law firm. It leaves his assets to his kids and grandkids and uses the full extent of legal mumbo jumbo, tax planning and other legerdemain  to minimize the estate taxes due upon one's death. All well and good--the guy was smart, made a lot of money, and didn't want the taxma

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Classification

#### Linear SVM 

In [6]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Train a linear-kernel SVM on the vectorized training text. fit() returns
# the estimator itself, so construction and training are chained.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vec, train_y)

# Predictions are kept in a variable because later cells reuse them.
predicted_svm = clf_svm.predict(test_x_vec)

print(
    'Accuracy of Linear SVM is :',
    accuracy_score(test_y, predicted_svm),
)

Accuracy of Linear SVM is : 0.8076923076923077


#### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier as dt

# Decision-tree fitting is randomised internally, so without a fixed
# random_state the accuracy can differ from one run to the next.
# Pin it so the result below is reproducible.
clf_dec = dt(random_state=42)
clf_dec.fit(train_x_vec, train_y)

# Predictions are kept in a variable because later cells reuse them.
predicted_dec = clf_dec.predict(test_x_vec)

print('Accuracy of Decision Tree is :', accuracy_score(test_y, predicted_dec))

Accuracy of Decision Tree is : 0.6298076923076923


#### Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB is given dense arrays here, hence the toarray() conversions
# for both the training and the test matrix.
clf_nb = GaussianNB().fit(train_x_vec.toarray(), train_y)

# Predictions are kept in a variable because later cells reuse them.
predicted_nb = clf_nb.predict(test_x_vec.toarray())

print(
    'Accuracy of Gaussian Naive Bayes is :',
    accuracy_score(test_y, predicted_nb),
)

Accuracy of Gaussian Naive Bayes is : 0.6610576923076923


#### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression as lg
# Logistic regression with default settings; fit() returns the estimator,
# so construction and training are chained.
clf_log = lg().fit(train_x_vec, train_y)

# Predictions are kept in a variable because later cells reuse them.
predicted_log = clf_log.predict(test_x_vec)

print(
    'Accuracy of Logistic Regression is :',
    accuracy_score(test_y, predicted_log),
)

Accuracy of Logistic Regression is : 0.8028846153846154




## Evaluation

#### Accuracy Score

In [10]:
# score() predicts on the given features and returns the accuracy in one call.
# Each row: (message, fitted classifier, test features in the form it was trained on).
# Gaussian Naive Bayes was trained on dense arrays, so it is scored on one too.
accuracy_table = [
    ('Accuracy of Linear SVM is :', clf_svm, test_x_vec),
    ('Accuracy of Decision Tree is :', clf_dec, test_x_vec),
    ('Accuracy of Gaussian Naive Bayes is :', clf_nb, test_x_vec.toarray()),
    ('Accuracy of Logistic Regression is :', clf_log, test_x_vec),
]
for acc_message, acc_model, acc_features in accuracy_table:
    print(acc_message, acc_model.score(acc_features, test_y))



Accuracy of Linear SVM is : 0.8076923076923077
Accuracy of Decision Tree is : 0.6298076923076923
Accuracy of Gaussian Naive Bayes is : 0.6610576923076923
Accuracy of Logistic Regression is : 0.8028846153846154


#### F1 score

In [11]:
from sklearn.metrics import f1_score

In [12]:
# With average=None, f1_score returns one score per class, in the order
# given by `labels`: [positive, negative].
f1_labels = [Sentiment.Positive, Sentiment.Negative]

# Each row: (message, predictions from the matching classifier).
# The messages fix two typos in the earlier text: a missing "is" for
# Gaussian Naive Bayes and a doubled "is is" for Logistic Regression.
f1_table = [
    ('F1 score of Linear SVM is :', predicted_svm),
    ('F1 score of Decision Tree is :', predicted_dec),
    ('F1 score of Gaussian Naive Bayes is :', predicted_nb),
    ('F1 score of Logistic Regression is :', predicted_log),
]
for f1_message, f1_predictions in f1_table:
    print(f1_message, f1_score(test_y, f1_predictions, average=None, labels=f1_labels))

F1 score of Linear SVM is : [0.80582524 0.80952381]
F1 score of Decision Tree is : [0.62980769 0.62980769]
F1 score of Gaussian Naive Bayes : [0.65693431 0.66508314]
F1 score of Logistic Regression is is : [0.80097087 0.8047619 ]


In [28]:
# Classify one sentence typed by the user with the linear SVM.
# Non-interactive alternative kept for reference:
#   checking_set = ['Great book', 'what an amazing book', 'Don\'t love it']
#   check_transform = vectorizer.transform(checking_set)
# The typed string is wrapped in a one-element list before being vectorized.
checking_input = [input('Enter the string :')]
check_transform = vectorizer.transform(checking_input)
print(clf_svm.predict(check_transform))

Enter the string :Nice concept
['POSITIVE']


### Tuning our Model (with Grid Search)
#### Automatically chooses the best parameters for the model

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
# Grid to search: both kernels crossed with five values of C,
# evaluated with 5-fold cross-validation on the training data.
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [1, 4, 8, 16, 32],
}
svc = svm.SVC(gamma='auto')
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
# Left as the last expression so the notebook displays the fitted search object.
clf.fit(train_x_vec, train_y)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [18]:
# Score of the fitted grid-search model on the held-out test vectors;
# left as the cell's last expression so the notebook displays it.
clf.score(test_x_vec,test_y)

0.8076923076923077

### Saving the model

#### Save model

In [19]:
import pickle

In [20]:
# Persist the fitted grid-search model to disk with pickle.
# NOTE(review): the path is absolute and user-specific, so this cell only
# runs on that machine; consider a path relative to the notebook. The load
# cell reads the same path and must be changed together with this one.
# NOTE(review): only `clf` is written. The fitted `vectorizer`, which is
# needed to turn text into features for this model, is not saved here.
with open('C:/Users/ridhima gandhi/Desktop/ml/practice_models/sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf,f)

#### Load model

In [21]:
# Read the pickled model back from disk into `saved_clf`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickle files you created yourself or otherwise trust.
# NOTE(review): the path is absolute and user-specific; it must match the
# path used by the cell that wrote the file.
with open('C:/Users/ridhima gandhi/Desktop/ml/practice_models/sentiment_classifier.pkl','rb') as f:
    saved_clf = pickle.load(f)

In [22]:
# Sanity check: the reloaded model classifies a new sentence.
# The text is vectorized with the in-memory `vectorizer` fitted earlier in
# the notebook, not with anything read from the pickle file, so this cell
# depends on the vectorization cell having been run in the same session.
line = ['Great book']
line_transform= vectorizer.transform(line)
print(saved_clf.predict(line_transform))

['POSITIVE']
