In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Path to the Yelp review dump; it is parsed below one JSON document per line.
review_yelp = "yelp_dataset_2019/review.json"

# Decode every line into a Python object. The whole file is kept in memory.
with open(review_yelp, encoding="utf8") as review_file:
    reviews = [json.loads(raw_line) for raw_line in review_file]

# Report how many reviews were loaded.
print("There are " + str(len(reviews)) + " reviews.")

There are 6685900 reviews.


In [2]:
# Pull the two fields the model needs out of each parsed review:
# the free-text body (features) and the star rating (label).
texts = [record['text'] for record in reviews]
stars = [record['stars'] for record in reviews]

In [3]:
# Count how many reviews carry each star rating (before any balancing).
Counter(stars)

Counter({1.0: 1002159, 5.0: 2933082, 4.0: 1468985, 3.0: 739280, 2.0: 542394})

In [3]:
def Balance_Classes(xs, ys):
    """Downsample so that every class in ``ys`` appears equally often.

    The rarest class sets the per-class quota. The input is scanned once in
    its original order and the first ``quota`` samples of each class are
    kept; nothing is shuffled, so the result is deterministic.

    Parameters
    ----------
    xs : sequence
        Samples, aligned index-for-index with ``ys``.
    ys : sequence
        Hashable class labels, one per sample.

    Returns
    -------
    tuple of (list, list)
        The retained samples and their labels, in original order. Both
        lists are empty when the input is empty.

    Raises
    ------
    ValueError
        If ``xs`` and ``ys`` do not have the same length.
    """
    if len(xs) != len(ys):
        raise ValueError(
            "xs and ys must have the same length, got "
            + str(len(xs)) + " and " + str(len(ys))
        )

    freqs = Counter(ys)
    # Nothing to balance; also avoids taking min() of an empty collection.
    if not freqs:
        return [], []

    # The rarest class determines how many samples each class may keep.
    max_allowable = min(freqs.values())
    num_added = {clss: 0 for clss in freqs}

    new_xs = []
    new_ys = []
    for x, y in zip(xs, ys):
        if num_added[y] < max_allowable:
            new_xs.append(x)
            new_ys.append(y)
            num_added[y] += 1
    return new_xs, new_ys

# Trim every star rating down to the size of the rarest one, then print the
# per-class counts to confirm they are now equal.
balanced_x, balanced_y = Balance_Classes(xs=texts, ys=stars)
print(Counter(balanced_y))

Counter({1.0: 542394, 5.0: 542394, 4.0: 542394, 3.0: 542394, 2.0: 542394})


In [4]:
# TF-IDF features over unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# 'fit' builds the vocabulary from all the balanced reviews; 'transform'
# then encodes each individual review as one row of the resulting matrix.
# NOTE(review): the vectorizer is fitted on every balanced review before the
# train/test split, so held-out reviews contribute to the vocabulary and IDF
# weights -- confirm this is acceptable for the reported scores.
fit_started = datetime.now()
vectors = vectorizer.fit_transform(balanced_x)
print(datetime.now() - fit_started)

2:46:05.313192


In [5]:
# Hold out 33% of the balanced samples for evaluation. The fixed seed keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, balanced_y, test_size=0.33, random_state=42
)

# Initialise the linear SVM classifier. random_state seeds the solver's
# internal data shuffling so that a rerun reproduces the same model (it was
# previously left unseeded); verbose=2 enables the solver's progress output.
clf = LinearSVC(verbose=2, random_state=42)

# Train the classifier and print the wall-clock time it took.
t1 = datetime.now()
clf.fit(X_train, y_train)
print(datetime.now() - t1)

[LibLinear]



2:27:06.144048


In [6]:
# Predict star ratings for the held-out set, then print the first ten
# predictions above the first ten true labels for a quick visual comparison.
preds = clf.predict(X_test)
print(list(preds[:10]), y_test[:10], sep="\n")

[5.0, 4.0, 4.0, 2.0, 3.0, 5.0, 2.0, 5.0, 3.0, 1.0]
[5.0, 3.0, 4.0, 2.0, 2.0, 5.0, 3.0, 4.0, 3.0, 1.0]


In [7]:
# Fraction of held-out reviews whose predicted rating equals the true rating.
print(accuracy_score(y_true=y_test, y_pred=preds))

0.630141761951213


In [8]:
# Per-rating precision, recall, F1 and support on the held-out set.
print(classification_report(y_true=y_test, y_pred=preds))

             precision    recall  f1-score   support

        1.0       0.73      0.79      0.76    178927
        2.0       0.56      0.53      0.54    178837
        3.0       0.54      0.52      0.53    179139
        4.0       0.57      0.54      0.56    179505
        5.0       0.73      0.77      0.75    178543

avg / total       0.63      0.63      0.63    894951



In [9]:
# Confusion matrix on the held-out set: rows are true ratings, columns are
# predicted ratings.
print(confusion_matrix(y_true=y_test, y_pred=preds))

[[140815  31345   4799    960   1008]
 [ 41233  94071  37136   4832   1565]
 [  8355  36893  93973  34259   5659]
 [  1698   4749  33555  97672  41831]
 [  1266   1024   4899  33939 137415]]
