In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Path to the Yelp review dump; the loader below parses one JSON document per line.
review_yelp = "yelp_dataset_2019/review.json"

# Decode every line of the file into a Python object and collect them in order.
# The `with` block closes the file handle once the whole file has been read.
reviews = []
with open(review_yelp, encoding="utf8") as f:
    reviews.extend(json.loads(line) for line in f)

print(f"There are {len(reviews)} reviews.")

There are 6685900 reviews.


In [2]:
# Pull the two fields used for modelling out of each record, as parallel lists:
# texts[k] and stars[k] both come from reviews[k].
texts = [record['text'] for record in reviews]
stars = [record['stars'] for record in reviews]

In [4]:
# Sanity check: show the first 20 ratings, then display one full raw record.
print(stars[:20])
reviews[6]

[1.0, 5.0, 5.0, 5.0, 1.0, 4.0, 3.0, 1.0, 2.0, 3.0, 4.0, 1.0, 4.0, 4.0, 1.0, 5.0, 4.0, 3.0, 5.0, 3.0]


{'review_id': 'G7XHMxG0bx9oBJNECG4IFg',
 'user_id': 'jlu4CztcSxrKx56ba1a5AQ',
 'business_id': '3fw2X5bZYeW9xCz_zGhOHg',
 'stars': 3.0,
 'useful': 5,
 'funny': 4,
 'cool': 5,
 'text': "Tracy dessert had a big name in Hong Kong and the one in First Markham place has been here for many years now! \n\nCame in for some Chinese dessert, and I must say their selection has increased tremendously over the years. I might as well add that the price has also increased tremendously as well. The waitress gave us tea, which I could taste had red date in it. Fancy!\n\nA simple taro with coconut with tapioca pearls was like $5.25 or something. Basically all the desserts were more than $5. That's crazy! I can literally just make this dessert at home and for a bowl, it would probably cost like $0.50. A few years ago, I think I can still get it for like $3-$4, which is more reasonable, but wow, more than $5 is a little over the top for this dessert. Though I must say, it is Tracy Dessert, and they are a l

In [5]:
# Display the raw record at index 7 to inspect its fields.
reviews[7]

{'review_id': '8e9HxxLjjqc9ez5ezzN7iQ',
 'user_id': 'd6xvYpyzcfbF_AZ8vMB7QA',
 'business_id': 'zvO-PJCpNk4fgAVUnExYAA',
 'stars': 1.0,
 'useful': 3,
 'funny': 1,
 'cool': 1,
 'text': "This place has gone down hill.  Clearly they have cut back on staff and food quality\n\nMany of the reviews were written before the menu changed.  I've been going for years and the food quality has gone down hill.\n\nThe service is slow & my salad, which was $15, was as bad as it gets.\n\nIt's just not worth spending the money on this place when there are so many other options.",
 'date': '2010-10-05 19:12:35'}

In [3]:
# Ratings to retain; 3 is deliberately absent, so 3-star reviews are dropped.
# The ratings are floats (e.g. 1.0), which compare and hash equal to these ints.
keep = {1, 2, 4, 5}

# Positions in `stars` (and therefore in `texts`) whose rating is retained.
keep_stars_is = [idx for idx, rating in enumerate(stars) if rating in keep]

In [5]:
# First 20 retained positions; indices of 3-star reviews should be missing.
keep_stars_is[:20]

[0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22, 23]

In [7]:
# First 20 raw ratings, for comparison with the retained positions.
stars[:20]

[1.0,
 5.0,
 5.0,
 5.0,
 1.0,
 4.0,
 3.0,
 1.0,
 2.0,
 3.0,
 4.0,
 1.0,
 4.0,
 4.0,
 1.0,
 5.0,
 4.0,
 3.0,
 5.0,
 3.0]

In [14]:
# Ratings of the retained reviews, in the same order as keep_stars_is.
new_stars = [stars[idx] for idx in keep_stars_is]

In [16]:
# Review bodies of the retained reviews, aligned one-to-one with new_stars.
new_texts = [texts[idx] for idx in keep_stars_is]

In [21]:
len(new_stars)

5946620

In [23]:
# Binarise the retained ratings: below 3 stars -> -1, otherwise -> 1.
#
# The labels are rebuilt from the untouched `stars` list via `keep_stars_is`
# rather than by overwriting `new_stars` in place. The in-place version was not
# idempotent: running the cell a second time saw only -1 and 1 (both < 3) and
# silently turned every label into -1.
new_stars = [-1 if stars[i] < 3 else 1 for i in keep_stars_is]

In [26]:
def Balance_Classes(xs, ys):
    """Downsample so that every class has the same number of samples.

    For each distinct label in ``ys`` the first ``k`` samples (in their
    original order) are kept, where ``k`` is the size of the rarest class.
    Later samples of the over-represented classes are dropped.

    Parameters
    ----------
    xs : sequence
        Samples, indexable by position and aligned with ``ys``.
    ys : sequence
        Hashable class labels, one per sample.

    Returns
    -------
    (new_xs, new_ys) : tuple of lists
        The kept samples and their labels, in original relative order.
        Two empty lists are returned when ``ys`` is empty.
    """
    freqs = Counter(ys)
    if not freqs:
        # No labels at all: nothing to balance (previously an IndexError).
        return [], []

    # Size of the rarest class caps how many samples each class may contribute.
    max_allowable = min(freqs.values())
    num_added = {clss: 0 for clss in freqs}
    new_ys = []
    new_xs = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_ys.append(y)
            new_xs.append(xs[i])
            num_added[y] += 1
    return new_xs, new_ys

# Downsample the larger class so both sentiment labels are equally represented,
# then print the per-label counts to confirm the balance.
balanced_x, balanced_y = Balance_Classes(new_texts,new_stars)
print(Counter(balanced_y))

Counter({-1: 1544553, 1: 1544553})


In [27]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))
t1 = datetime.now()
 
# The 'fit' step builds the vocabulary from all the balanced reviews, while
# the 'transform' step turns each individual text into a row of numbers.
# NOTE(review): the vectorizer is fitted on every balanced review here, and the
# train/test split is only applied to `vectors` afterwards, so test-set text
# contributes to the vocabulary and IDF weights. Consider splitting the raw
# texts first and fitting on the training portion only.
vectors = vectorizer.fit_transform(balanced_x)
# Elapsed wall-clock time for the fit + transform.
print(datetime.now() - t1)

1:17:40.676958


In [28]:
# Hold out 33% of the vectorised reviews for evaluation; the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(vectors, balanced_y, test_size=0.33, random_state=42)

# Initialise the linear SVM classifier. random_state seeds the solver's
# internal shuffling so repeated runs train the same model; previously only
# the split was seeded, so the fitted model was not reproducible.
clf = LinearSVC(verbose=2, random_state=42)

# Train the classifier and print the elapsed wall-clock time.
t1 = datetime.now()
clf.fit(X_train, y_train)
print(datetime.now() - t1)

[LibLinear]7:13:50.734073


In [29]:
# Predict labels for the held-out set.
preds = clf.predict(X_test)
# Show the first ten predictions (converted to a list for printing) ...
print(list(preds[:10]))

# ... next to the first ten true labels for a quick visual comparison.
print(y_test[:10])

[-1, -1, 1, -1, 1, 1, -1, -1, 1, 1]
[-1, -1, 1, -1, 1, 1, -1, -1, -1, 1]


In [30]:
# Fraction of held-out reviews whose predicted label matches the true label.
print(accuracy_score(y_test, preds))

0.9726055885541075


In [31]:
# Per-class precision, recall, F1 and support on the held-out set.
# print() is needed so the report's embedded newlines render as a table.
print(classification_report(y_test, preds))

             precision    recall  f1-score   support

         -1       0.97      0.97      0.97    510057
          1       0.97      0.97      0.97    509348

avg / total       0.97      0.97      0.97   1019405



In [32]:
# Confusion matrix on the held-out set: rows are true labels and columns are
# predicted labels, both in sorted label order (-1, then 1).
print(confusion_matrix(y_test, preds))

[[496674  13383]
 [ 14543 494805]]
