In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Location of the Yelp review dump; the file is read as one JSON document per line.
review_yelp = "yelp_dataset_2019/review.json"

# Decode each line in file order and collect the resulting review objects.
reviews = []
with open(review_yelp, encoding="utf8") as f:
    reviews.extend(json.loads(raw_line) for raw_line in f)

# Report how many reviews were loaded.
review_count = len(reviews)
print("There are " + str(review_count) + " reviews.")

There are 6685900 reviews.


In [2]:
# Count reviews per user and keep the 50 users with the most reviews.
# Each entry of prolific_reviewers is a (user_id, review_count) pair.
review_counts = Counter(review['user_id'] for review in reviews)
prolific_reviewers = review_counts.most_common(50)

# Running tally of how many reviews have been kept so far for each selected user.
keep_ids = dict.fromkeys((user_id for user_id, _count in prolific_reviewers), 0)

# Walk the reviews in their original order, keeping at most 500 per selected
# user so that no single author dominates the data set.
keep_reviews = []
for review in reviews:
    uid = review['user_id']
    if uid not in keep_ids or keep_ids[uid] >= 500:
        continue
    keep_reviews.append(review)
    keep_ids[uid] += 1

# Parallel lists: texts[i] is the review body written by authors[i].
texts = [kept['text'] for kept in keep_reviews]
authors = [kept['user_id'] for kept in keep_reviews]

In [3]:
# TF-IDF features over word unigrams and bigrams of the kept review texts.
# NOTE(review): the vectorizer is fitted on every kept review before the
# train/test split performed later in the notebook, so its statistics also
# see documents that end up in the test set - consider fitting on the
# training texts only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Time the fit + transform step and print the elapsed wall-clock duration.
t1 = datetime.now()
vectors = vectorizer.fit_transform(texts)
elapsed = datetime.now() - t1
print(elapsed)

0:00:14.001861


# Naive Bayes

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier, constructed without overriding any defaults.
mnb = MultinomialNB()

In [37]:
# Hold out 33% of the vectorized reviews for evaluation; the fixed seed makes
# the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    authors,
    test_size=0.33,
    random_state=42,
)

# Train on the remaining reviews. Left as the cell's last expression so the
# notebook echoes the fitted estimator.
mnb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
# Predict an author label (user_id) for every held-out review vector.
mnb_preds = mnb.predict(X_test)

In [39]:
# Accuracy of the Naive Bayes author predictions on the held-out reviews.
mnb_accuracy = accuracy_score(y_test, mnb_preds)
print(mnb_accuracy)

0.5111515151515151


In [24]:
# Peek at the first 50 predicted author ids.
mnb_preds[:50]

array(['U4INQZOPSUaj8hMjLlZ3KA', 'I-4KVZ9lqHhk8469X9FvhA',
       'WeVkkF5L39888IPPlRhNpg', 'n86B7IkbU20AkxlFX_5aew',
       'PKEzKWv_FktMm2mGPjwd0Q', 'fiGqQ7pIGKyZ9G0RqWLMpg',
       'WeVkkF5L39888IPPlRhNpg', 'V-BbqKqO8anwplGRx9Q5aQ',
       'n86B7IkbU20AkxlFX_5aew', 'n86B7IkbU20AkxlFX_5aew',
       'WeVkkF5L39888IPPlRhNpg', 'n86B7IkbU20AkxlFX_5aew',
       'fiGqQ7pIGKyZ9G0RqWLMpg', 'V-BbqKqO8anwplGRx9Q5aQ',
       'WeVkkF5L39888IPPlRhNpg', 'n86B7IkbU20AkxlFX_5aew',
       'WeVkkF5L39888IPPlRhNpg', 'NfU0zDaTMEQ4-X9dbQWd9A',
       'n86B7IkbU20AkxlFX_5aew', 'WeVkkF5L39888IPPlRhNpg',
       'V-BbqKqO8anwplGRx9Q5aQ', 'V-BbqKqO8anwplGRx9Q5aQ',
       '62GNFh5FySkA3MbrQmnqvg', 'RBZ_kMjowV0t6_nv2UKaDQ',
       'fiGqQ7pIGKyZ9G0RqWLMpg', 'NfU0zDaTMEQ4-X9dbQWd9A',
       'n86B7IkbU20AkxlFX_5aew', 'WeVkkF5L39888IPPlRhNpg',
       'fiGqQ7pIGKyZ9G0RqWLMpg', 'M9rRM6Eo5YbKLKMG5QiIPA',
       'RBZ_kMjowV0t6_nv2UKaDQ', 'WeVkkF5L39888IPPlRhNpg',
       'I-4KVZ9lqHhk8469X9FvhA', 'rCWrxuRC8_pfagpchtHp6A

In [25]:
# Per-author precision, recall, F1 and support for the Naive Bayes predictions.
mnb_report = classification_report(y_test, mnb_preds)
print(mnb_report)

                        precision    recall  f1-score   support

0BBUmH7Krcax1RZgbH4fSA       0.57      0.56      0.56       161
3nDUQBjKyVor5wV0reJChg       0.97      0.48      0.65       151
4wp4XI9AxKNqJima-xahlg       0.87      0.44      0.58       154
62GNFh5FySkA3MbrQmnqvg       0.72      0.68      0.70       177
CxDOIDnH8gp9KXzpBHJYXw       0.90      0.05      0.10       168
DK57YibC5ShBmqQl97CKog       0.86      0.08      0.14       156
ELcQDlf69kb-ihJfxZyL0A       0.00      0.00      0.00       170
HJj82f-csBI7jjgenwqhvw       0.75      0.02      0.04       163
I-4KVZ9lqHhk8469X9FvhA       0.57      0.96      0.71       159
L8P5OWO1Jh4B2HLa1Fnbng       1.00      0.05      0.10       158
Lfv4hefW1VbvaC2gatTFWA       1.00      0.01      0.03       149
M9rRM6Eo5YbKLKMG5QiIPA       0.30      0.99      0.47       173
MMf0LhEk5tGa1LvN7zcDnA       0.91      0.24      0.38       165
N3oNEwh0qgPqPP3Em6wJXw       0.86      0.04      0.07       166
NfU0zDaTMEQ4-X9dbQWd9A       0.60      

  'precision', 'predicted', average, warn_for)


In [None]:
# Confusion matrix of true vs. predicted authors for the Naive Bayes model.
mnb_confusion = confusion_matrix(y_test, mnb_preds)
print(mnb_confusion)

# KNN

In [4]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier configured to use 15 neighbours.
knn = KNeighborsClassifier(n_neighbors=15)  

In [5]:
# Same inputs, test fraction and seed as the split in the Naive Bayes section,
# so this section can be run on its own and should reproduce that partition.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    authors,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Fit on the training split; as the cell's last expression, the fitted
# estimator is echoed by the notebook.
knn.fit(X_train, y_train)  

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='uniform')

In [7]:
# Predict an author label (user_id) for every held-out review vector.
knn_preds = knn.predict(X_test)

In [8]:
# Accuracy of the KNN author predictions on the held-out reviews.
knn_accuracy = accuracy_score(y_test, knn_preds)
print(knn_accuracy)

0.4050909090909091


In [32]:
# Per-author precision, recall, F1 and support for the KNN predictions.
knn_report = classification_report(y_test, knn_preds)
print(knn_report)

                        precision    recall  f1-score   support

0BBUmH7Krcax1RZgbH4fSA       0.20      0.57      0.30       161
3nDUQBjKyVor5wV0reJChg       0.70      0.65      0.67       151
4wp4XI9AxKNqJima-xahlg       0.15      0.13      0.14       154
62GNFh5FySkA3MbrQmnqvg       0.26      0.79      0.39       177
CxDOIDnH8gp9KXzpBHJYXw       0.49      0.36      0.41       168
DK57YibC5ShBmqQl97CKog       0.23      0.32      0.27       156
ELcQDlf69kb-ihJfxZyL0A       0.60      0.18      0.27       170
HJj82f-csBI7jjgenwqhvw       0.69      0.36      0.48       163
I-4KVZ9lqHhk8469X9FvhA       0.15      0.87      0.26       159
L8P5OWO1Jh4B2HLa1Fnbng       0.90      0.68      0.78       158
Lfv4hefW1VbvaC2gatTFWA       0.62      0.46      0.53       149
M9rRM6Eo5YbKLKMG5QiIPA       0.57      0.55      0.56       173
MMf0LhEk5tGa1LvN7zcDnA       0.46      0.56      0.51       165
N3oNEwh0qgPqPP3Em6wJXw       0.53      0.14      0.23       166
NfU0zDaTMEQ4-X9dbQWd9A       0.42      