In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_json('yelp_academic_dataset_review_50k.json', lines=True)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,oUVfM9ua2UtJ68sHKgCvNA,-YzMXeOVQfWAVXNAtMSbyw,dnQMntrmickWGYLB30KBEQ,4,0,0,0,Coffee is VERY good. My breakfast was a welcom...,2014-07-16 13:01:33
1,E7QcmW1jmB6T3HkSMdLGDA,jLNR8Tsvi47ENvoNfVYKiQ,4GGhj7Z99E5IYWdEqOsLUQ,5,0,0,0,"I've been coming to this place for 18 years, a...",2019-04-16 20:17:17
2,GgGLzyl408biArY9oLGbRQ,392lRckiPvP-xTZ10E5RPw,c3QxX3toWdqJnKQmmIliRQ,2,0,0,0,This place is a bit overrated. It is very tren...,2021-05-02 23:53:15
3,B-EtTJZH45iCGWDNU36-1Q,OIa6ptM1qUts5arovQUAFQ,-QI8Qi8XWH3D8y8ethnajA,2,6,1,2,"This is an older airport, and it reminded me a...",2018-04-13 15:51:03
4,RJb-x897_abr1CZDYiB1Xw,fwOETgbWmBAhdO9058e4Zg,C5ZOzlslhMxRJDjBDV3KoQ,5,0,0,0,Awesome. One of my favorites. They have less ...,2016-03-27 18:29:16


In [3]:
def build_subset(df, y, classes, distributions):
    if len(classes) != len(distributions):
        raise Exception('classes and distributions must be same length')
    
    dfs = []
    for i in range(len(classes)):
        dfs.append(df.loc[df[y] == classes[i]].sample(n=distributions[i]))

    return pd.concat(dfs)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [5]:
df_12_345 = build_subset(df, 'stars', [1, 2, 3, 4, 5], [3800, 3800, 3800, 3800, 3800]) # make a dataset with all 5 star ratings, pulling 3800 samples from each rating
df_12_45 = build_subset(df, 'stars', [1, 2, 4, 5], [3800, 3800, 3800, 3800]) # make a dataset only considering 1, 2, 4, 5 star ratings
df_1_5 = build_subset(df, 'stars', [1, 5], [5500, 5500]) # make a dataset only considering 1 and 5 star ratings

In [6]:
def sklearn_dt_gs(df):
    X = df[['text']]
    y = df['stars'].replace([1, 2], 0).replace([3, 4, 5], 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)
    
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.text)
    X_test_tfidf = tfidf_vectorizer.transform(X_test.text)
    
    param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    }
    
    dtc = DecisionTreeClassifier(random_state=1)
    
    gs = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)
    
    gs.fit(X_train_tfidf, y_train)
    
    print("Best parameters: ", gs.best_params_)
    print("Best score: ", gs.best_score_)
    
    best_dtc = DecisionTreeClassifier(**gs.best_params_, random_state=1)
    
    best_dtc.fit(X_train_tfidf, y_train)
    
    train_accuracy = best_dtc.score(X_train_tfidf, y_train)
    test_accuracy = best_dtc.score(X_test_tfidf, y_test)
    
    return round(train_accuracy, 4), round(test_accuracy, 4)
        


In [7]:
result_12_345 = sklearn_dt_gs(df_12_345)
print("Decision tree classifier train accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[0])
print("Decision tree classifier test accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[1])

Best parameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best score:  0.7292105263157895
Decision tree classifier train accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.888
Decision tree classifier test accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.7113


In [8]:
result_12_45 = sklearn_dt_gs(df_12_45)
print("Decision tree classifier train accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[0])
print("Decision tree classifier test accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[1])

Best parameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score:  0.7731907894736841
Decision tree classifier train accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.9348
Decision tree classifier test accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.7816


In [9]:
result_1_5 = sklearn_dt_gs(df_1_5)
print("Decision tree classifier train accuracy for classifying 1 star and 5 stars :", result_1_5[0])
print("Decision tree classifier test accuracy for classifying 1 star and 5 stars :", result_1_5[1])

Best parameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score:  0.8452272727272728
Decision tree classifier train accuracy for classifying 1 star and 5 stars : 0.9532
Decision tree classifier test accuracy for classifying 1 star and 5 stars : 0.8555
