In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

In [3]:
# Load the preprocessed Kindle review dataset (from Kaggle) out of the local CSV file.
reviews_csv_path = 'pkr.csv'
data = pd.read_csv(reviews_csv_path)

In [5]:
# Preview the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary
0,0,5,This book was the very first bookmobile book I...,50 + years ago...
1,1,1,"When I read the description for this book, I c...",Boring! Boring! Boring!
2,2,5,I just had to edit this review. This book is a...,Wiggleliscious/new toy ready/!!
3,3,5,I don't normally buy 'mystery' novels because ...,Very good read.
4,4,5,"This isn't the kind of book I normally read, a...",Great Story!


In [6]:
# Drop the columns that will not be used: the free-text 'summary' and the
# leftover 'Unnamed: 0' column.
# NOTE(review): the second leftover column, 'Unnamed: 0.1', is not dropped here.
unused_columns = ['summary', 'Unnamed: 0']
data = data.drop(unused_columns, axis=1)

In [7]:
# Collapse the star ratings into two sentiment classes:
#   1-3 stars -> 'neg' (negative), 4-5 stars -> 'pos' (positive).
rating_to_label = {
    1: 'neg',
    2: 'neg',
    3: 'neg',
    4: 'pos',
    5: 'pos',
}
data['rating'] = data['rating'].replace(rating_to_label)

In [8]:
# Split the dataset into two halves: one for training, one for testing.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
training_data, testing_data = train_test_split(
    data,
    test_size=0.5,
    random_state=1234,
)


In [9]:
# Build the TF-IDF vectorizer.
#   min_df=5          -> ignore terms that appear in fewer than 5 documents
#   max_df=0.8        -> ignore terms that appear in more than 80% of documents
#   sublinear_tf=True -> apply sublinear (logarithmic) term-frequency scaling
#   use_idf=True      -> weight terms by inverse document frequency
tfidf_options = {
    'min_df': 5,
    'max_df': .8,
    'sublinear_tf': True,
    'use_idf': True,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse that fitted vectorizer (transform, not fit) on the test reviews.
training_vectors = vectorizer.fit_transform(training_data['reviewText'])
test_vectors = vectorizer.transform(testing_data['reviewText'])

In [16]:
# Classification with SVM: fit a linear-kernel support vector classifier on the
# TF-IDF training vectors, then predict labels for the held-out test vectors.
classifier_linear = svm.SVC(kernel='linear').fit(training_vectors, training_data['rating'])
prediction_linear = classifier_linear.predict(test_vectors)

# Per-class precision / recall / f1 / support, returned as a nested dict so the
# two classes can be printed individually.
report = classification_report(
    testing_data['rating'],
    prediction_linear,
    output_dict=True,
)
for heading, label in (('positive: ', 'pos'), ('negative: ', 'neg')):
    print(heading, report[label])



positive:  {'precision': 0.8208073514932721, 'recall': 0.8418041063614945, 'f1-score': 0.8311731472249917, 'support': 2971}
negative:  {'precision': 0.8408398239078902, 'recall': 0.8197424892703863, 'f1-score': 0.8301571380809094, 'support': 3029}


In [14]:
# Testing on sentences
# Spot-check the trained classifier on a few hand-written sentences. Each
# sentence is passed through the already-fitted TF-IDF vectorizer (transform
# only, no refitting) and the predicted label array is printed, one line per
# sentence, in the order listed below.
sample_sentences = [
    "I love this book! It is wonderful, my new favorite.",
    "I can never find parking on St. Louis! It's so frustrating.",
    "That was the worst movie ever! I won't watch the rest of the franchise.",
    "It's the best!",
]

for sentence in sample_sentences:
    # transform() takes an iterable of documents, hence the one-element list.
    sentence_vector = vectorizer.transform([sentence])
    print(classifier_linear.predict(sentence_vector))

['pos']
['neg']
['neg']
['pos']
