In [64]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import train_test_split

In [9]:
df = pd.read_json('yelp_academic_dataset_review_50k.json', lines=True)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,oUVfM9ua2UtJ68sHKgCvNA,-YzMXeOVQfWAVXNAtMSbyw,dnQMntrmickWGYLB30KBEQ,4,0,0,0,Coffee is VERY good. My breakfast was a welcom...,2014-07-16 13:01:33
1,E7QcmW1jmB6T3HkSMdLGDA,jLNR8Tsvi47ENvoNfVYKiQ,4GGhj7Z99E5IYWdEqOsLUQ,5,0,0,0,"I've been coming to this place for 18 years, a...",2019-04-16 20:17:17
2,GgGLzyl408biArY9oLGbRQ,392lRckiPvP-xTZ10E5RPw,c3QxX3toWdqJnKQmmIliRQ,2,0,0,0,This place is a bit overrated. It is very tren...,2021-05-02 23:53:15
3,B-EtTJZH45iCGWDNU36-1Q,OIa6ptM1qUts5arovQUAFQ,-QI8Qi8XWH3D8y8ethnajA,2,6,1,2,"This is an older airport, and it reminded me a...",2018-04-13 15:51:03
4,RJb-x897_abr1CZDYiB1Xw,fwOETgbWmBAhdO9058e4Zg,C5ZOzlslhMxRJDjBDV3KoQ,5,0,0,0,Awesome. One of my favorites. They have less ...,2016-03-27 18:29:16


In [10]:
def build_subset(df, y, classes, distributions):
    if len(classes) != len(distributions):
        raise Exception('classes and distributions must be same length')
    
    dfs = []
    for i in range(len(classes)):
        dfs.append(df.loc[df[y] == classes[i]].sample(n=distributions[i]))

    return pd.concat(dfs)

In [45]:
df_12_345 = build_subset(df, 'stars', [1, 2, 3, 4, 5], [2500, 2500, 2500, 2500, 2500]) # make a dataset with all 5 star ratings, pulling 3800 samples from each rating
df_12_45 = build_subset(df, 'stars', [1, 2, 4, 5], [2500, 2500, 2500, 2500]) # make a dataset only considering 1, 2, 4, 5 star ratings
df_1_5 = build_subset(df, 'stars', [1, 5], [4000, 4000]) # make a dataset only considering 1 and 5 star ratings

In [43]:
def ldaClassifier(df):    
    X = df[['text']]
    y = df['stars'].replace([1, 2],0).replace([3, 4, 5], 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)

    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.text).toarray()
    X_test_tfidf = tfidf_vectorizer.transform(X_test.text).toarray()

    lda = LinearDiscriminantAnalysis(solver='svd')
    lda.fit(X_train_tfidf, y_train)
    lda_train_accuracy = lda.score(X_train_tfidf, y_train)
    lda_test_accuracy = lda.score(X_test_tfidf, y_test)
    return (round(lda_train_accuracy, 4), round(lda_test_accuracy, 4))

In [46]:
result_12_345 = ldaClassifier(df_12_345)
print("LDA train accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[0])
print("LDA test accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[1])

LDA train accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.9865
LDA test accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.6628


In [17]:
result_12_45 = ldaClassifier(df_12_45)
print("LDA train accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[0])
print("LDA test accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[1])

LDA train accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.9818
LDA test accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.7295


In [13]:
result_1_5 = ldaClassifier(df_1_5)
print("LDA train accuracy for classifying 1 star and 5 stars :", result_1_5[0])
print("LDA test accuracy for classifying 1 star and 5 stars :", result_1_5[1])

LDA train accuracy for classifying 1 star and 5 stars : 0.9931
LDA test accuracy for classifying 1 star and 5 stars : 0.7238


In [48]:
df_12_345 = build_subset(df, 'stars', [1, 2, 3, 4, 5], [3800, 3800, 3800, 3800, 3800]) # make a dataset with all 5 star ratings, pulling 3800 samples from each rating
df_12_45 = build_subset(df, 'stars', [1, 2, 4, 5], [3800, 3800, 3800, 3800]) # make a dataset only considering 1, 2, 4, 5 star ratings
df_1_5 = build_subset(df, 'stars', [1, 5], [5500, 5500]) # make a dataset only considering 1 and 5 star ratings

In [65]:
def qdaClassifier(df):    
    X = df[['text']]
    y = df['stars'].replace([1, 2],0).replace([3, 4, 5], 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)

    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train.text)
    X_test_tfidf = tfidf_vectorizer.transform(X_test.text)

    k_values = [10, 50, 100, 150, 200, 250]
    best_k = 0
    best_test_accuracy = -float('inf')
    best_train_accuracy = -float('inf')
    for k in k_values:
        selector = SelectKBest(score_func=f_classif, k=k)
        X_train_selector = selector.fit_transform(X_train_tfidf, y_train).toarray()
        X_test_selector = selector.transform(X_test_tfidf).toarray()

        qda = QuadraticDiscriminantAnalysis()
        qda.fit(X_train_selector, y_train)
        qda_train_accuracy = qda.score(X_train_selector, y_train)
        qda_test_accuracy = qda.score(X_test_selector, y_test)

        if qda_test_accuracy > best_test_accuracy:
            best_k = k
            best_test_accuracy = qda_test_accuracy
            best_train_accuracy = qda_train_accuracy
    return (best_k, round(best_train_accuracy, 4), round(best_test_accuracy, 4))

In [61]:
result_12_345 = qdaClassifier(df_12_345)
print("Optimal number of best features : k =", result_12_345[0])
print("QDA train accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[1])
print("QDA test accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[2])

Optimal number of best features : k = 250
QDA train accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.818
QDA test accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.7987


In [62]:
result_12_45 = qdaClassifier(df_12_45)
print("Optimal number of best features : k =", result_12_45[0])
print("QDA train accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[1])
print("QDA test accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[2])

Optimal number of best features : k = 200
QDA train accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.8646
QDA test accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.8661


In [63]:
result_1_5 = qdaClassifier(df_1_5)
print("Optimal number of best features : k =", result_1_5[0])
print("QDA train accuracy for classifying 1 star and 5 stars :", result_1_5[1])
print("QDA test accuracy for classifying 1 star and 5 stars :", result_1_5[2])

Optimal number of best features : k = 250
QDA train accuracy for classifying 1 star and 5 stars : 0.9266
QDA test accuracy for classifying 1 star and 5 stars : 0.9023
