In [53]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [54]:
# Read the Yelp review sample. lines=True makes pandas parse the file as one
# JSON object per line; head() previews the first rows as the cell output.
df = pd.read_json(
    'yelp_academic_dataset_review_50k.json',
    lines=True,
)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,oUVfM9ua2UtJ68sHKgCvNA,-YzMXeOVQfWAVXNAtMSbyw,dnQMntrmickWGYLB30KBEQ,4,0,0,0,Coffee is VERY good. My breakfast was a welcom...,2014-07-16 13:01:33
1,E7QcmW1jmB6T3HkSMdLGDA,jLNR8Tsvi47ENvoNfVYKiQ,4GGhj7Z99E5IYWdEqOsLUQ,5,0,0,0,"I've been coming to this place for 18 years, a...",2019-04-16 20:17:17
2,GgGLzyl408biArY9oLGbRQ,392lRckiPvP-xTZ10E5RPw,c3QxX3toWdqJnKQmmIliRQ,2,0,0,0,This place is a bit overrated. It is very tren...,2021-05-02 23:53:15
3,B-EtTJZH45iCGWDNU36-1Q,OIa6ptM1qUts5arovQUAFQ,-QI8Qi8XWH3D8y8ethnajA,2,6,1,2,"This is an older airport, and it reminded me a...",2018-04-13 15:51:03
4,RJb-x897_abr1CZDYiB1Xw,fwOETgbWmBAhdO9058e4Zg,C5ZOzlslhMxRJDjBDV3KoQ,5,0,0,0,Awesome. One of my favorites. They have less ...,2016-03-27 18:29:16


In [20]:
def build_subset(df, y, classes, distributions, random_state=None):
    '''
    Draw a fixed number of rows for each requested class and stack them.

    Input:
        df: DataFrame to sample from
        y: name of the column in df that holds the class label
        classes: label values to keep, one entry per group
        distributions: number of rows to draw for each entry of classes
            (same order, same length)
        random_state: optional seed forwarded to DataFrame.sample so the draw
            can be repeated; the default None keeps the unseeded draw
    Output:
        DataFrame holding the sampled rows, grouped in the order of classes;
        the rows keep their original index labels
    Raises:
        ValueError: classes and distributions differ in length, or a class
            has fewer rows than requested
    '''
    if len(classes) != len(distributions):
        raise ValueError('classes and distributions must be same length')

    dfs = []
    for label, n_rows in zip(classes, distributions):
        candidates = df.loc[df[y] == label]
        # Sampling without replacement cannot return more rows than exist;
        # fail with a message that names the offending class.
        if n_rows > len(candidates):
            raise ValueError(
                'class {!r} has only {} rows, cannot sample {}'.format(
                    label, len(candidates), n_rows))
        dfs.append(candidates.sample(n=n_rows, random_state=random_state))

    return pd.concat(dfs)

In [59]:
# Each call passes the star ratings to keep and how many reviews to draw per rating.
# All five ratings, 3800 reviews from each.
df_12_345 = build_subset(df, 'stars', [1, 2, 3, 4, 5], [3800] * 5)
# Ratings 1, 2, 4 and 5 only (3-star reviews left out), 3800 reviews from each.
df_12_45 = build_subset(df, 'stars', [1, 2, 4, 5], [3800] * 4)
# Ratings 1 and 5 only, 5500 reviews from each.
df_1_5 = build_subset(df, 'stars', [1, 5], [5500] * 2)

In [45]:
def sklearn_log_regr(df):
    '''
    Fit scikit-learn's LogisticRegression on TF-IDF features of the review text.

    Input:
        df: DataFrame with a 'text' column and a 'stars' column
    Output:
        (train_accuracy, test_accuracy), each rounded to 4 decimal places
    '''
    # Binary target: 1-2 stars become 0, 3-5 stars become 1.
    labels = df['stars'].replace([1, 2], 0).replace([3, 4, 5], 1)

    # Hold out 20% of the rows; the fixed seed keeps the split repeatable.
    text_train, text_test, labels_train, labels_test = train_test_split(
        df['text'], labels, test_size=0.2, random_state=1)

    # The vocabulary and IDF weights are learned from the training text only,
    # then applied unchanged to the held-out text.
    vectorizer = TfidfVectorizer()
    features_train = vectorizer.fit_transform(text_train)
    features_test = vectorizer.transform(text_test)

    model = LogisticRegression().fit(features_train, labels_train)
    return (
        round(model.score(features_train, labels_train), 4),
        round(model.score(features_test, labels_test), 4),
    )

In [50]:
# Five-rating sample: result_12_345 is the (train, test) accuracy pair
# returned by sklearn_log_regr.
result_12_345 = sklearn_log_regr(df_12_345)
print(
    "Logistic regression train accuracy for classifying 1 to 2 stars and 3 to 5 stars :",
    result_12_345[0],
)
print(
    "Logistic regression test accuracy for classifying 1 to 2 stars and 3 to 5 stars :",
    result_12_345[1],
)

Logistic regression train accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.8997
Logistic regression test accuracy for classifying 1 to 2 stars and 3 to 5 stars : 0.8492


In [51]:
# Sample without 3-star reviews: result_12_45 is the (train, test) accuracy
# pair returned by sklearn_log_regr.
result_12_45 = sklearn_log_regr(df_12_45)
print(
    "Logistic regression train accuracy for classifying 1 to 2 stars and 4 to 5 stars :",
    result_12_45[0],
)
print(
    "Logistic regression test accuracy for classifying 1 to 2 stars and 4 to 5 stars :",
    result_12_45[1],
)

Logistic regression train accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.9505
Logistic regression test accuracy for classifying 1 to 2 stars and 4 to 5 stars : 0.9286


In [52]:
# Sample with 1-star and 5-star reviews only: result_1_5 is the (train, test)
# accuracy pair returned by sklearn_log_regr.
result_1_5 = sklearn_log_regr(df_1_5)
print(
    "Logistic regression train accuracy for classifying 1 star and 5 stars :",
    result_1_5[0],
)
print(
    "Logistic regression test accuracy for classifying 1 star and 5 stars :",
    result_1_5[1],
)

Logistic regression train accuracy for classifying 1 star and 5 stars : 0.9786
Logistic regression test accuracy for classifying 1 star and 5 stars : 0.9559


In [80]:
# From https://towardsdatascience.com/sentiment-analysis-using-logistic-regression-and-naive-bayes-16b806eb4c4b
#Frequency generating function
def build_freqs(reviews, stars):
    '''
    Input:
        reviews: DataFrame with a 'text' column holding one review string per row
        stars: labels aligned with the rows of reviews (Series, list or array;
            an (m,1) or (1,m) array is flattened)
    Output:
        freqs: a dictionary mapping each (word, label) tuple to the number of
            times that whitespace-separated word occurs in reviews with that label
    Raises:
        ValueError: the number of labels differs from the number of reviews
    '''
    # ravel always yields a flat list. np.squeeze would turn a single label
    # into a bare scalar, which zip() below cannot iterate.
    stars_list = np.asarray(stars).ravel().tolist()
    texts = reviews.text

    # zip() would silently drop the unmatched tail and misalign the counts.
    if len(stars_list) != len(texts):
        raise ValueError(
            'got {} labels for {} reviews'.format(len(stars_list), len(texts)))

    freqs = {}
    for star, review in zip(stars_list, texts):
        for word in review.split():
            pair = (word, star)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

def sigmoid(z):
    '''
    Input:
        z: a scalar or a numpy array
    Output:
        h: the logistic function 1/(1 + exp(-z)), applied elementwise
    '''
    # logaddexp(0, -z) evaluates log(1 + exp(-z)) without forming exp(-z),
    # which overflows (RuntimeWarning) once z drops below about -709.
    h = np.exp(-np.logaddexp(0, -z))
    return h

def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost evaluated at the returned theta, as a Python float
        theta: your final weight vector
    '''

    m = len(x)

    for _ in range(num_iters):

        # get the sigmoid of z, the dot product of x and theta
        h = sigmoid(np.dot(x, theta))

        # update the weights theta
        theta = theta - (alpha/m)*np.dot(x.T, h-y)

    # The cost is computed once, after the loop, so it describes the theta
    # that is returned and is defined even when num_iters is 0.
    h = sigmoid(np.dot(x, theta))

    # Keep h strictly inside (0, 1): a saturated prediction would otherwise
    # put log(0) into the cost and turn it into inf/nan.
    tiny = np.finfo(float).eps
    h = np.clip(h, tiny, 1 - tiny)

    J = (-1/m)*(np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))

    # With column-vector inputs J is a (1,1) array; squeeze it first because
    # float() on an array with ndim > 0 is deprecated in NumPy.
    J = float(np.squeeze(J))
    return J, theta

def extract_features(review, freqs):
    '''
    Input:
        review: the text of one review, as a single string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3):
           [bias, summed counts under label 1, summed counts under label 0]
    '''
    # 3 elements in the form of a 1 x 3 vector
    feature = np.zeros((1, 3))

    # bias term is set to 1
    feature[0,0] = 1

    # loop through each whitespace-separated word of the review
    # (split must be called; iterating the bound method raises TypeError)
    for word in review.split():

        # increment the word count for the positive label 1
        feature[0,1] += freqs.get((word,1),0)

        # increment the word count for the negative label 0
        feature[0,2] += freqs.get((word,0),0)

    return feature

In [81]:
def log_regr(df):
    '''
    Input:
        df: DataFrame with a 'text' column and a 'stars' column
    Output:
        freqs: the (word, label) frequency dictionary that build_freqs
            produces from the training split; no model is fitted here
    '''
    # Binary target: 1-2 stars become 0, 3-5 stars become 1.
    labels = df['stars'].replace([1, 2], 0).replace([3, 4, 5], 1)
    texts = df[['text']]

    # 80/20 split with a fixed seed; only the training part is used below.
    texts_train, _texts_test, labels_train, _labels_test = train_test_split(
        texts, labels, test_size=0.2, random_state=1)

    return build_freqs(texts_train, labels_train)

# Keep the dictionary in a variable and show only its size and a short
# preview; printing the whole (word, label) table floods the cell output.
freqs_1_5 = log_regr(df_1_5)
print(len(freqs_1_5), "distinct (word, label) pairs; first 10:")
print(list(freqs_1_5.items())[:10])

