In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Path to the Yelp review dump; the file is read line by line below.
review_yelp = "yelp_dataset_2019/review.json"

# Decode each line as its own JSON document and keep every decoded record in
# memory. `reviews` is created before the file is opened so the name exists
# even if opening or parsing fails part-way.
reviews = []
with open(review_yelp, encoding="utf8") as review_file:
    reviews.extend(map(json.loads, review_file))

review_total = len(reviews)
print("There are " + str(review_total) + " reviews.")

There are 6685900 reviews.


In [2]:
# Tally how many reviews each user_id wrote, then keep the 50 largest tallies.
# most_common() returns (user_id, count) pairs ordered from most to least frequent.
reviews_per_user = Counter(entry['user_id'] for entry in reviews)
prolific_reviewers = reviews_per_user.most_common(50)

In [3]:
# Display the last (user_id, count) pair of the top-50 list. Because
# most_common() orders entries from most to least frequent, this is the
# selected reviewer with the fewest reviews.
prolific_reviewers[-1]

('Wx7cbLDqYEL3_aVZwh82Ww', 778)

In [4]:
# Per-author counter of reviews kept so far, starting at zero for each of the
# selected reviewers (first element of every prolific_reviewers entry).
keep_ids = dict.fromkeys((top[0] for top in prolific_reviewers), 0)

# Walk the reviews in their original order and keep at most 500 per selected
# author; reviews by any other author are skipped.
keep_reviews = []
for review in reviews:
    uid = review['user_id']
    kept_so_far = keep_ids.get(uid)
    if kept_so_far is None or kept_so_far >= 500:
        continue
    keep_reviews.append(review)
    keep_ids[uid] = kept_so_far + 1

In [5]:
# Build two parallel lists from the kept reviews: the review text (model input)
# and the user_id of its author (label). Index i in both lists refers to the
# same review.
texts, authors = [], []
for kept in keep_reviews:
    texts.append(kept['text'])
    authors.append(kept['user_id'])

In [6]:
# TF-IDF features over unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on all kept texts and transform them in one step,
# reporting how long that took.
started = datetime.now()
vectors = vectorizer.fit_transform(texts)
elapsed = datetime.now() - started
print(elapsed)

0:00:15.100800


In [7]:
# Hold out 33% of the vectorized reviews for evaluation; the fixed seed keeps
# the split itself reproducible.
# NOTE(review): `vectors` comes from a TF-IDF vectorizer that was fitted on all
# texts before this split, so the vocabulary and IDF weights also reflect the
# held-out reviews. Consider splitting the raw texts first and fitting the
# vectorizer on the training portion only; the reported scores may be slightly
# optimistic as-is.
X_train, X_test, y_train, y_test = train_test_split(vectors, authors, test_size=0.33, random_state=42)

# Seed the classifier too: the liblinear solver behind LinearSVC uses a random
# number generator while fitting, so without random_state repeated runs can
# produce slightly different models even though the split is fixed.
clf = LinearSVC(verbose=2, random_state=42)

# train the classifier and report the wall-clock training time
t1 = datetime.now()
clf.fit(X_train, y_train)
print(datetime.now() - t1)

[LibLinear]0:00:18.408000


In [8]:
# Predict an author for every held-out review.
preds = clf.predict(X_test)

# Print the first ten predictions followed by the first ten true authors so
# they can be compared by eye.
first_ten = slice(10)
print(list(preds[first_ten]))
print(y_test[first_ten])

['U4INQZOPSUaj8hMjLlZ3KA', 'I-4KVZ9lqHhk8469X9FvhA', 'Wx7cbLDqYEL3_aVZwh82Ww', 'O3pSxv1SyHpY4qi4Q16KzA', 'PKEzKWv_FktMm2mGPjwd0Q', 'Q9mA60HnY87C1TW5kjAZ6Q', 'Wc5L6iuvSNF5WGBlqIO8nw', 'UYcmGbelzRa0Q6JqzLoguw', 'I-4KVZ9lqHhk8469X9FvhA', 'DK57YibC5ShBmqQl97CKog']
['U4INQZOPSUaj8hMjLlZ3KA', 'I-4KVZ9lqHhk8469X9FvhA', 'Wx7cbLDqYEL3_aVZwh82Ww', 'O3pSxv1SyHpY4qi4Q16KzA', 'PKEzKWv_FktMm2mGPjwd0Q', 'Q9mA60HnY87C1TW5kjAZ6Q', 'Wc5L6iuvSNF5WGBlqIO8nw', 'UYcmGbelzRa0Q6JqzLoguw', 'pMefTWo6gMdx8WhYSA2u3w', 'DK57YibC5ShBmqQl97CKog']


In [9]:
# Fraction of held-out reviews attributed to the correct author.
test_accuracy = accuracy_score(y_test, preds)
print(test_accuracy)

0.9155151515151515


In [10]:
# Per-author precision, recall, F1 and support on the held-out reviews.
per_author_report = classification_report(y_test, preds)
print(per_author_report)

                        precision    recall  f1-score   support

0BBUmH7Krcax1RZgbH4fSA       0.89      0.87      0.88       161
3nDUQBjKyVor5wV0reJChg       0.97      0.98      0.98       151
4wp4XI9AxKNqJima-xahlg       0.90      0.91      0.91       154
62GNFh5FySkA3MbrQmnqvg       0.90      0.97      0.93       177
CxDOIDnH8gp9KXzpBHJYXw       0.98      0.96      0.97       168
DK57YibC5ShBmqQl97CKog       0.89      0.78      0.83       156
ELcQDlf69kb-ihJfxZyL0A       0.95      0.92      0.93       170
HJj82f-csBI7jjgenwqhvw       0.90      0.90      0.90       163
I-4KVZ9lqHhk8469X9FvhA       0.90      0.97      0.93       159
L8P5OWO1Jh4B2HLa1Fnbng       0.95      0.99      0.97       158
Lfv4hefW1VbvaC2gatTFWA       0.89      0.95      0.92       149
M9rRM6Eo5YbKLKMG5QiIPA       0.95      0.97      0.96       173
MMf0LhEk5tGa1LvN7zcDnA       0.95      0.95      0.95       165
N3oNEwh0qgPqPP3Em6wJXw       0.92      0.94      0.93       166
NfU0zDaTMEQ4-X9dbQWd9A       0.95      

In [11]:
# Confusion matrix for the held-out reviews: rows are true authors, columns are
# predicted authors.
author_confusion = confusion_matrix(y_test, preds)
print(author_confusion)

[[140   0   0 ...   2   0   0]
 [  0 148   1 ...   0   0   0]
 [  0   0 140 ...   0   0   1]
 ...
 [  0   0   0 ... 153   0   0]
 [  0   0   0 ...   0 151   0]
 [  1   1   0 ...   0   0 166]]
