In [173]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [98]:
# Read the review file as line-delimited JSON (one record per line, hence
# lines=True) and preview the first rows to check the available columns.
df = pd.read_json('yelp_academic_dataset_review_50k.json', lines=True)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,oUVfM9ua2UtJ68sHKgCvNA,-YzMXeOVQfWAVXNAtMSbyw,dnQMntrmickWGYLB30KBEQ,4,0,0,0,Coffee is VERY good. My breakfast was a welcom...,2014-07-16 13:01:33
1,E7QcmW1jmB6T3HkSMdLGDA,jLNR8Tsvi47ENvoNfVYKiQ,4GGhj7Z99E5IYWdEqOsLUQ,5,0,0,0,"I've been coming to this place for 18 years, a...",2019-04-16 20:17:17
2,GgGLzyl408biArY9oLGbRQ,392lRckiPvP-xTZ10E5RPw,c3QxX3toWdqJnKQmmIliRQ,2,0,0,0,This place is a bit overrated. It is very tren...,2021-05-02 23:53:15
3,B-EtTJZH45iCGWDNU36-1Q,OIa6ptM1qUts5arovQUAFQ,-QI8Qi8XWH3D8y8ethnajA,2,6,1,2,"This is an older airport, and it reminded me a...",2018-04-13 15:51:03
4,RJb-x897_abr1CZDYiB1Xw,fwOETgbWmBAhdO9058e4Zg,C5ZOzlslhMxRJDjBDV3KoQ,5,0,0,0,Awesome. One of my favorites. They have less ...,2016-03-27 18:29:16


In [99]:
def build_subset(df, y, classes, distributions, random_state=None):
    """Draw a fixed number of random rows for each requested class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    y : str
        Name of the column holding the class label.
    classes : sequence
        Label values to keep; rows are selected with ``df[y] == label``.
    distributions : sequence of int
        Number of rows to draw for the label at the same position in
        ``classes``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the draw can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The per-class samples concatenated in the order of ``classes``.
        The original row index is preserved. An empty frame with the
        columns of ``df`` is returned when ``classes`` is empty.

    Raises
    ------
    ValueError
        If ``classes`` and ``distributions`` differ in length, or (raised by
        pandas) if a class has fewer rows than requested.
    """
    if len(classes) != len(distributions):
        raise ValueError('classes and distributions must be same length')

    samples = [
        df.loc[df[y] == label].sample(n=count, random_state=random_state)
        for label, count in zip(classes, distributions)
    ]

    # pd.concat refuses an empty list; hand back an empty slice instead.
    if not samples:
        return df.iloc[0:0]

    return pd.concat(samples)

In [100]:
# All five star ratings, 3800 reviews sampled from each.
df_12_345 = build_subset(df, 'stars', [1, 2, 3, 4, 5], [3800] * 5)
# Only the 1, 2, 4 and 5 star ratings (3-star reviews left out), 3800 each.
df_12_45 = build_subset(df, 'stars', [1, 2, 4, 5], [3800] * 4)
# Only the two extremes, 1 and 5 stars, 5500 each.
df_1_5 = build_subset(df, 'stars', [1, 5], [5500] * 2)

In [101]:
def sklearn_log_regr(df):
    """Train and score a TF-IDF + scikit-learn logistic regression baseline.

    The ``stars`` column is binarised (1-2 -> 0, 3-5 -> 1), the frame is
    split 80/20 with ``random_state=1``, and the ``text`` column is
    vectorised with a ``TfidfVectorizer`` fitted on the training part only.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)``, each rounded to 4 decimals.
    """
    features = df[['text']]
    negatives_mapped = df['stars'].replace([1, 2], 0)
    labels = negatives_mapped.replace([3, 4, 5], 1)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=1)

    # Fit the vocabulary on the training text only, then reuse it for test.
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(features_train['text'])
    test_matrix = vectorizer.transform(features_test['text'])

    classifier = LogisticRegression()
    classifier.fit(train_matrix, labels_train)

    scores = (
        classifier.score(train_matrix, labels_train),
        classifier.score(test_matrix, labels_test),
    )
    return tuple(round(score, 4) for score in scores)

In [103]:
# scikit-learn baseline on the five-rating sample; sklearn_log_regr returns
# (train_accuracy, test_accuracy), so index 0 is train and index 1 is test.
result_12_345 = sklearn_log_regr(df_12_345)
print("Logistic regression train accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[0])
print("Logistic regression test accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[1])

Logistic regression train accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.9008
Logistic regression test accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.8539


In [104]:
# scikit-learn baseline on the sample without 3-star reviews; the result is
# a (train_accuracy, test_accuracy) pair.
result_12_45 = sklearn_log_regr(df_12_45)
print("Logistic regression train accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[0])
print("Logistic regression test accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[1])

Logistic regression train accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.9496
Logistic regression test accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.9227


In [105]:
# scikit-learn baseline on the 1-star vs 5-star sample; the result is a
# (train_accuracy, test_accuracy) pair.
result_1_5 = sklearn_log_regr(df_1_5)
print("Logistic regression train accuracy for classifying 1 star and 5 stars :", result_1_5[0])
print("Logistic regression test accuracy for classifying 1 star and 5 stars :", result_1_5[1])

Logistic regression train accuracy for classifying 1 star and 5 stars : 0.9782
Logistic regression test accuracy for classifying 1 star and 5 stars : 0.9659


In [179]:
# Based on https://towardsdatascience.com/sentiment-analysis-using-logistic-regression-and-naive-bayes-16b806eb4c4b
# Fetch the NLTK stop-word corpus that build_freqs reads via stopwords.words('english').
nltk.download('stopwords')

def build_freqs(reviews, stars):
    """Count how often each non-stop-word occurs under each label.

    Parameters
    ----------
    reviews : object with a ``text`` attribute
        Iterating ``reviews.text`` must yield one review string per label
        (``log_regr`` passes a DataFrame with a ``text`` column).
    stars : array-like
        One label per review. Any shape is accepted; it is flattened.

    Returns
    -------
    dict
        Maps ``(word, label)`` to the number of occurrences of ``word`` in
        reviews carrying ``label``. Words are whitespace-split tokens taken
        verbatim (no lower-casing or punctuation stripping), so only tokens
        that exactly match an NLTK English stop word are dropped.
    """
    # A set gives O(1) membership tests; the raw NLTK list was being scanned
    # linearly once for every token of every review.
    stopwords_english = set(stopwords.words('english'))
    # ravel rather than squeeze: squeeze turns a single label into a scalar,
    # which zip() below cannot iterate.
    stars_list = np.ravel(stars).tolist()

    freqs = {}
    for star, review in zip(stars_list, reviews.text):
        for word in review.split():
            if word not in stopwords_english:
                pair = (word, star)
                freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

def sigmoid(z):
    """Logistic function 1 / (1 + e^-z), applied element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow for large-magnitude values.
    """
    bounded = np.clip(z, -500, 500)
    return 1 / (np.exp(-bounded) + 1)

def gradientDescent(x, y, theta, alpha, num_iters, eps=1e-10):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, one row per example (``log_regr`` passes an
        ``(m, 3)`` array).
    y : numpy.ndarray
        Labels shaped to match ``sigmoid(x @ theta)`` (``log_regr`` passes
        an ``(m, 1)`` column).
    theta : numpy.ndarray
        Initial weights; not modified in place.
    alpha : float
        Learning rate.
    num_iters : int
        Number of gradient steps. ``0`` returns ``theta`` unchanged.
    eps : float, optional
        Small constant added inside the logarithms so ``log(0)`` cannot
        occur. Previously the learning rate was reused for this, which
        distorted the reported cost for any realistic ``alpha``; the default
        matches the value the notebook was effectively using (1e-10).

    Returns
    -------
    tuple
        ``(J, theta)`` where ``J`` is a Python float holding the
        cross-entropy cost of the predictions computed in the last
        iteration (i.e. before that iteration's weight update), and
        ``theta`` is the updated weight array.
    """
    m = len(x)

    h = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        theta = theta - (alpha / m) * np.dot(x.T, h - y)

    if h is None:
        # No step was taken: report the cost of the initial weights instead
        # of failing on an unbound cost variable.
        h = sigmoid(np.dot(x, theta))

    # The cost is only needed once, so it is computed after the loop from
    # the last set of predictions rather than on every iteration.
    cost = (-1 / m) * (np.dot(y.T, np.log(h + eps))
                       + np.dot((1 - y).T, np.log(1 - h + eps)))

    # squeeze first: float() on an array with ndim > 0 is deprecated in NumPy.
    J = float(np.squeeze(cost))
    return J, theta

def extract_features(review, freqs):
    """Turn one review into a ``(1, 3)`` feature row.

    Column 0 is a constant bias term of 1. Column 1 is the sum, over every
    whitespace-split token of ``review``, of ``freqs[(token, 1)]``; column 2
    is the same sum for ``freqs[(token, 0)]``. Tokens missing from ``freqs``
    contribute 0, and repeated tokens are counted each time they occur.
    """
    tokens = review.split()
    label_one_total = sum((freqs.get((token, 1), 0) for token in tokens), 0.0)
    label_zero_total = sum((freqs.get((token, 0), 0) for token in tokens), 0.0)
    return np.array([[1.0, label_one_total, label_zero_total]])

def predict_review(review, freqs, theta):
    """Return the sigmoid of the review's feature row dotted with ``theta``.

    The feature row comes from ``extract_features(review, freqs)``; the
    result keeps the array shape produced by ``np.dot``.
    """
    features = extract_features(review, freqs)
    return sigmoid(np.dot(features, theta))

def prediction_accuracy(x, y, freqs, theta):
    """Fraction of reviews whose thresholded prediction matches the label.

    Parameters
    ----------
    x : sequence of str
        Review texts.
    y : array-like
        True 0/1 labels, one per review. Any shape is accepted; it is
        flattened before comparison.
    freqs : dict
        ``(word, label) -> count`` table passed through to
        ``predict_review``.
    theta : numpy.ndarray
        Model weights passed through to ``predict_review``.

    Returns
    -------
    float
        Accuracy rounded to 4 decimals. A review is predicted as class 1
        when ``predict_review`` returns a value above 0.5, otherwise 0.
    """
    # Flatten the labels: comparing an (m, 1) column with the 1-D prediction
    # vector would broadcast to an (m, m) matrix and inflate the score.
    y_true = np.ravel(y)

    y_hat = np.array([
        1 if predict_review(review, freqs, theta) > 0.5 else 0
        for review in x
    ])

    accuracy = np.sum((y_true == y_hat).astype(int)) / len(x)
    return round(accuracy, 4)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/achenji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [192]:
def log_regr(df, alpha=1e-10, num_iters=100000):
    """Train and score the hand-written logistic regression on ``df``.

    The ``stars`` column is binarised (1-2 -> 0, 3-5 -> 1) and the frame is
    split 80/20 with ``random_state=1``. A ``(word, label)`` frequency table
    is built from the training part only, each training review is turned
    into a 3-column feature row, and the weights are fitted with
    ``gradientDescent`` starting from zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``text`` and a ``stars`` column.
    alpha : float, optional
        Learning rate handed to ``gradientDescent``. Defaults to the value
        that used to be hard-coded here.
    num_iters : int, optional
        Number of gradient steps. Defaults to the value that used to be
        hard-coded here.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as returned by
        ``prediction_accuracy``.
    """
    X = df[['text']]
    y = df['stars'].replace([1, 2], 0).replace([3, 4, 5], 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)

    # Frequencies come from the training split only so the test split stays unseen.
    freqs = build_freqs(X_train, y_train)

    train_reviews = X_train.text.tolist()
    test_reviews = X_test.text.tolist()

    # One (bias, label-1 count, label-0 count) row per training review.
    train_features = np.vstack(
        [extract_features(review, freqs) for review in train_reviews])

    # gradientDescent expects the labels as an (m, 1) column; the returned
    # cost is not used here.
    _, theta = gradientDescent(
        train_features, y_train.values.reshape(-1, 1), np.zeros((3, 1)),
        alpha, num_iters)

    # prediction_accuracy compares against flat label vectors.
    train_accuracy = prediction_accuracy(
        train_reviews, y_train.values.reshape(-1), freqs, theta)
    test_accuracy = prediction_accuracy(
        test_reviews, y_test.values.reshape(-1), freqs, theta)
    return train_accuracy, test_accuracy

In [191]:
# From-scratch logistic regression on the five-rating sample. This rebinds
# result_12_345, replacing the scikit-learn result stored under that name.
result_12_345 = log_regr(df_12_345)
print("Logistic regression train accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[0])
print("Logistic regression test accuracy for classifying 1 to 2 stars and 3 to 5 stars :", result_12_345[1])

Logistic regression train accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.7099
Logistic regression test accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.7045


In [193]:
# From-scratch logistic regression on the sample without 3-star reviews.
# This rebinds result_12_45, replacing the scikit-learn result.
result_12_45 = log_regr(df_12_45)
print("Logistic regression train accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[0])
print("Logistic regression test accuracy for classifying 1 to 2 stars and 4 to 5 stars :", result_12_45[1])

Logistic regression train accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.6882
Logistic regression test accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.6891


In [194]:
# From-scratch logistic regression on the 1-star vs 5-star sample.
# This rebinds result_1_5, replacing the scikit-learn result.
result_1_5 = log_regr(df_1_5)
print("Logistic regression train accuracy for classifying 1 star and 5 stars :", result_1_5[0])
print("Logistic regression test accuracy for classifying 1 star and 5 stars :", result_1_5[1])

Logistic regression train accuracy for classifying 1 star and 5 stars : 0.751
Logistic regression test accuracy for classifying 1 star and 5 stars : 0.7391
