In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [13]:
df = pd.read_json('yelp_academic_dataset_review_50k.json', lines=True)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,oUVfM9ua2UtJ68sHKgCvNA,-YzMXeOVQfWAVXNAtMSbyw,dnQMntrmickWGYLB30KBEQ,4,0,0,0,Coffee is VERY good. My breakfast was a welcom...,2014-07-16 13:01:33
1,E7QcmW1jmB6T3HkSMdLGDA,jLNR8Tsvi47ENvoNfVYKiQ,4GGhj7Z99E5IYWdEqOsLUQ,5,0,0,0,"I've been coming to this place for 18 years, a...",2019-04-16 20:17:17
2,GgGLzyl408biArY9oLGbRQ,392lRckiPvP-xTZ10E5RPw,c3QxX3toWdqJnKQmmIliRQ,2,0,0,0,This place is a bit overrated. It is very tren...,2021-05-02 23:53:15
3,B-EtTJZH45iCGWDNU36-1Q,OIa6ptM1qUts5arovQUAFQ,-QI8Qi8XWH3D8y8ethnajA,2,6,1,2,"This is an older airport, and it reminded me a...",2018-04-13 15:51:03
4,RJb-x897_abr1CZDYiB1Xw,fwOETgbWmBAhdO9058e4Zg,C5ZOzlslhMxRJDjBDV3KoQ,5,0,0,0,Awesome. One of my favorites. They have less ...,2016-03-27 18:29:16


In [14]:
def build_subset(df, y, classes, distributions):
    if len(classes) != len(distributions):
        raise Exception('classes and distributions must be same length')
    
    dfs = []
    for i in range(len(classes)):
        dfs.append(df.loc[df[y] == classes[i]].sample(n=distributions[i]))

    return pd.concat(dfs)

In [15]:
df_12_345 = build_subset(df, 'stars', [1, 2, 3, 4, 5], [3800, 3800, 3800, 3800, 3800]) # make a dataset with all 5 star ratings, pulling 3800 samples from each rating
df_12_45 = build_subset(df, 'stars', [1, 2, 4, 5], [3800, 3800, 3800, 3800]) # make a dataset only considering 1, 2, 4, 5 star ratings
df_1_5 = build_subset(df, 'stars', [1, 5], [5500, 5500]) # make a dataset only considering 1 and 5 star ratings

In [16]:
def knnClassifier(df):    
    X = df[['text']]
    y = df['stars'].replace([1, 2],0).replace([3, 4, 5], 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)

    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.text)
    X_test_tfidf = tfidf_vectorizer.transform(X_test.text)

    best_k = 0
    best_test_accuracy = -float('inf')
    best_train_accuracy = -float('inf')
    for k in range(1, 11):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train_tfidf, y_train)
        knn_accuracy = knn.score(X_test_tfidf, y_test)
        if (knn_accuracy > best_test_accuracy):
            best_k = k
            best_test_accuracy = knn_accuracy
            best_train_accuracy = knn.score(X_train_tfidf, y_train)
        print("Accuracy given by percentage of correct predictions for k =", k, ":", round(knn_accuracy, 4))
    
    return (best_k, round(best_train_accuracy, 4), round(best_test_accuracy, 4))


In [17]:
best_result_12_345 = knnClassifier(df_12_345)
print("\nTrain accuracy for classifying 1 to 2 stars and 3 to 5 stars with optimal k : k =", best_result_12_345[0], "accuracy =", best_result_12_345[1])
print("Test accuracy for classifying 1 to 2 stars and 3 to 5 stars with optimal k : k =", best_result_12_345[0], "accuracy =", best_result_12_345[2])

Accuracy given by percentage of correct predictions for k = 1 : 0.6411
Accuracy given by percentage of correct predictions for k = 2 : 0.6103
Accuracy given by percentage of correct predictions for k = 3 : 0.68
Accuracy given by percentage of correct predictions for k = 4 : 0.6555
Accuracy given by percentage of correct predictions for k = 5 : 0.6979
Accuracy given by percentage of correct predictions for k = 6 : 0.6826
Accuracy given by percentage of correct predictions for k = 7 : 0.7089
Accuracy given by percentage of correct predictions for k = 8 : 0.6987
Accuracy given by percentage of correct predictions for k = 9 : 0.7211
Accuracy given by percentage of correct predictions for k = 10 : 0.7076

Train accuracy for classifying 1 to 2 stars and 3 to 5 stars with optimal k : k = 9 accuracy = 0.775
Test accuracy for classifying 1 to 2 stars and 3 to 5 stars with optimal k : k = 9 accuracy = 0.7211


In [19]:
best_result_12_45 = knnClassifier(df_12_45)
print("\nTrain accuracy for classifying 1 to 2 stars and 4 to 5 stars with optimal k : k =", best_result_12_45[0], "accuracy =", best_result_12_45[1])
print("Test accuracy for classifying 1 to 2 stars and 4 to 5 stars with optimal k : k =", best_result_12_45[0], "accuracy =", best_result_12_45[2])

Accuracy given by percentage of correct predictions for k = 1 : 0.6924
Accuracy given by percentage of correct predictions for k = 2 : 0.6658
Accuracy given by percentage of correct predictions for k = 3 : 0.7316
Accuracy given by percentage of correct predictions for k = 4 : 0.7049
Accuracy given by percentage of correct predictions for k = 5 : 0.7493
Accuracy given by percentage of correct predictions for k = 6 : 0.7299
Accuracy given by percentage of correct predictions for k = 7 : 0.7602
Accuracy given by percentage of correct predictions for k = 8 : 0.7428
Accuracy given by percentage of correct predictions for k = 9 : 0.7651
Accuracy given by percentage of correct predictions for k = 10 : 0.7503

Train accuracy for classifying 1 to 2 stars and 4 to 5 stars with optimal k : k = 9 accuracy = 0.8102
Test accuracy for classifying 1 to 2 stars and 4 to 5 stars with optimal k : k = 9 accuracy = 0.7651


In [20]:
best_result_1_5 = knnClassifier(df_1_5)
print("\nTrain accuracy for classifying 1 star and 5 stars with optimal k : k =", best_result_1_5[0], "accuracy =", best_result_1_5[1])
print("Test accuracy for classifying 1 star and 5 stars with optimal k : k =", best_result_1_5[0], "accuracy =", best_result_1_5[2])

Accuracy given by percentage of correct predictions for k = 1 : 0.7745
Accuracy given by percentage of correct predictions for k = 2 : 0.7282
Accuracy given by percentage of correct predictions for k = 3 : 0.7918
Accuracy given by percentage of correct predictions for k = 4 : 0.7705
Accuracy given by percentage of correct predictions for k = 5 : 0.8032
Accuracy given by percentage of correct predictions for k = 6 : 0.7868
Accuracy given by percentage of correct predictions for k = 7 : 0.8073
Accuracy given by percentage of correct predictions for k = 8 : 0.7877
Accuracy given by percentage of correct predictions for k = 9 : 0.8036
Accuracy given by percentage of correct predictions for k = 10 : 0.7868

Train accuracy for classifying 1 star and 5 stars with optimal k : k = 7 accuracy = 0.8574
Test accuracy for classifying 1 star and 5 stars with optimal k : k = 7 accuracy = 0.8073
