# Sentiment Analysis on different platforms

In [234]:
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    average_precision_score
)
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [235]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
dataset = "yelp"
input_path = f"./{dataset}.csv"

## Feature Engineering and Model Selection

In [237]:
vectorizer = TfidfVectorizer(
        strip_accents='unicode', lowercase=True, stop_words='english', max_features=10000)

In [238]:
def tfidf(train, test):
    train = vectorizer.fit_transform(train)
    test = vectorizer.transform(test)

    return pd.DataFrame(train.toarray(), columns=vectorizer.get_feature_names_out()), pd.DataFrame(test.toarray(), columns=vectorizer.get_feature_names_out())

In [239]:
def load_data():
    data = pd.read_csv(input_path)
    data = data.dropna()

    if len(data) > 20000:
        print("Sampling 15000 rows from the dataset for faster processing...")
        data = data.sample(n=20000, random_state=42)

    # Rename 'Comment' column to 'text' for consistency
    data = data.rename(columns={'Comment': 'text'})
    
    return data

In [240]:
data = load_data()

Sampling 15000 rows from the dataset for faster processing...


In [241]:
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,Sentiment
1295256,J5Q1gH4ACCj6CtQG7Yom7g,56gL9KEJNHiSDUoyjk2o3Q,8yR12PNSMo6FBYx1u5KPlw,2.0,2018-04-04 21:09:53,Went for lunch and found that my burger was me...,1,0,0,-1
3297618,HlXP79ecTquSVXmjM10QxQ,bAt9OUFX9ZRgGLCXG22UmA,pBNucviUkNsiqhJv5IFpjg,5.0,2020-05-24 12:22:14,I needed a new tires for my wife's car. They h...,0,0,0,1
1217795,JBBULrjyGx6vHto2osk_CQ,NRHPcLq2vGWqgqwVugSgnQ,8sf9kv6O4GgEb0j1o22N1g,5.0,2019-02-14 03:47:48,Jim Woltman who works at Goleta Honda is 5 sta...,0,0,0,1
3730348,U9-43s8YUl6GWBFCpxUGEw,PAxc0qpqt5c2kA0rjDFFAg,XwepyB7KjJ-XGJf0vKc6Vg,4.0,2013-04-27 01:55:49,Been here a few times to get some shrimp. The...,0,0,0,1
1826590,8T8EGa_4Cj12M6w8vRgUsQ,BqPR1Dp5Rb_QYs9_fz9RiA,prm5wvpp0OHJBlrvTj9uOg,5.0,2019-05-15 18:29:25,This is one fantastic place to eat whether you...,0,0,0,1


In [242]:
def split_data(data):
    # select only 'text' and 'Sentiment' columns
    data = data[['text', 'Sentiment']].copy()

    train, test = train_test_split(data, train_size=0.8)

    sentiment_keys = {'positive', 'neutral', 'negative'}
    if set(train['Sentiment'].unique()).issubset(sentiment_keys) and set(test['Sentiment'].unique()).issubset(sentiment_keys):
        train['Sentiment'] = train['Sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})
        test['Sentiment'] = test['Sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})

    train = train.dropna(subset=['text', 'Sentiment'])
    test = test.dropna(subset=['text', 'Sentiment'])

    # Ensure 'text' column is of type string
    train['text'] = train['text'].astype(str)
    test['text'] = test['text'].astype(str)
    train = train[train['text'].str.strip() != ""]
    test = test[test['text'].str.strip() != ""]

    yTrain, yTest = train['Sentiment'], test['Sentiment']
    xTrain, xTest = tfidf(train['text'], test['text'])

    return xTrain, yTrain, xTest, yTest

In [243]:
def feature_selection(xTrain, xTest, yTrain):
    features = np.array(xTrain.columns)
    selector = SelectKBest(chi2, k=2000)

    xTrain = selector.fit_transform(xTrain, yTrain)
    xTest = selector.transform(xTest)

    selected_features = features[selector.get_support()]

    xTrain = pd.DataFrame(xTrain, columns=selected_features)
    xTest = pd.DataFrame(xTest, columns=selected_features)
    return xTrain, xTest

In [244]:
xTrain, yTrain, xTest, yTest = split_data(data)
print("Original Training Shape:", xTrain.shape)
xTrain, xTest = feature_selection(xTrain, xTest, yTrain)
print("Transformed Training Shape:", xTrain.shape)

Original Training Shape: (16000, 10000)
Transformed Training Shape: (16000, 2000)


In [245]:
def eval_randomsearch(clf, pgrid, xTrain, yTrain, xTest, yTest):
    """
    Given a sklearn classifier and a parameter grid to search,
    choose the optimal parameters from pgrid using Random Search CV
    and train the model using the training dataset and evaluate the
    performance on the test dataset. The random search cv should try
    at most 33% of the possible combinations.

    Parameters
    ----------
    clf : sklearn.ClassifierMixin
        The sklearn classifier model 
    pgrid : dict
        The dictionary of parameters to tune for in the model
    xTrain : nd-array with shape (n, d)
        Training data
    yTrain : 1d array with shape (n, )
        Array of labels associated with training data
    xTest : nd-array with shape (m, d)
        Test data
    yTest : 1d array with shape m
        Array of labels associated with test data.

    Returns
    -------
    resultDict: dict
        A Python dictionary with the following 4 keys,
        "AUC", "AUPRC", "F1", "Time" and the values are the floats
        associated with them for the test set.
    roc : dict
        A Python dictionary with 2 keys, fpr, and tpr, where
        each of the values are lists of the fpr and tpr associated
        with different thresholds. You should be able to use this
        to plot the ROC for the model performance on the test curve.
    bestParams: dict
        A Python dictionary with the best parameters chosen by your
        GridSearch. The values in the parameters should be something
        that was in the original pgrid.
    """
    permutations = np.prod([len(v) for v in pgrid.values()])
    start = time.time()

    cv = RandomizedSearchCV(clf, param_distributions=pgrid, n_iter=int(permutations*0.33), cv=10)
    cv.fit(xTrain, yTrain)

    timeElapsed = time.time() - start

    clf = cv.best_estimator_
    best_params = cv.best_params_

    yHat = clf.predict(xTest)
    yHat_proba = clf.predict_proba(xTest)

    auc = roc_auc_score(yTest, yHat_proba, multi_class='ovr')

    auprc = average_precision_score(yTest, yHat_proba)

    f1 = f1_score(yTest, yHat, average='weighted')

    # # training evaluation
    # yTrainHat = clf.predict(xTrain)
    # yTrainHat_proba = clf.predict_proba(xTrain)

    # train_auc = roc_auc_score(yTrain, yTrainHat_proba, multi_class='ovr')

    # train_auprc = average_precision_score(yTrain, yTrainHat_proba)

    # train_f1 = f1_score(yTrain, yTrainHat, average='weighted')

    # # plot
    # plt.figure()
    # plt.plot()

    return {'AUC': auc, 'AUPRC': auprc, 'F1': f1, 'Time': timeElapsed}, best_params


def eval_searchcv(clfName, clf, clfGrid,
                  xTrain, yTrain, xTest, yTest,
                  perfDict, bestParamDict):
    # evaluate random search and add to perfDict
    clfr_perf, rs_p  = eval_randomsearch(clf, clfGrid, xTrain,
                                            yTrain, xTest, yTest)
    perfDict[clfName + " (Random)"] = clfr_perf
    bestParamDict[clfName] = {"Random": rs_p}
    return perfDict, bestParamDict


def get_parameter_grid(mName):
    """
    Given a model name, return the parameter grid associated with it

    Parameters
    ----------
    mName : string
        name of the model (e.g., DT, KNN, LR (None))

    Returns
    -------
    pGrid: dict
        A Python dictionary with the appropriate parameters for the model.
        The dictionary should have at least 2 keys and each key should have
        at least 2 values to try.
    """
    if mName == 'DT':
        return {'max_depth': [27, 28, 29], 'min_samples_leaf': [13, 14, 15]}
    elif mName == 'LR (None)':
        return {'C': [10, 100], 'tol': [0.0003, 0.0005]}
    elif mName == 'LR (L1)':
        return {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'tol': [0.0002, 0.0003]}
    elif mName == 'LR (L2)':
        return {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'tol': [0.0003, 0.0004]}
    elif mName == 'KNN':
        return {'n_neighbors': [30, 32, 36], 'p': [1, 2]}
    elif mName == 'NN':
        return {'alpha':[0.001, 0.01], 'hidden_layer_sizes': [(50,50),(100,),(200,100)]}


def evaluate_models():
    perfDict = {}
    bestParamDict = {}

    print("Tuning Decision Tree --------")
    # Compare Decision Tree
    dtName = "DT"
    dtGrid = get_parameter_grid(dtName)
    # fill in
    dtClf = DecisionTreeClassifier()
    perfDict, bestParamDict = eval_searchcv(dtName, dtClf, dtGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, bestParamDict)
    print("Tuning Unregularized Logistic Regression --------")
    # logistic regression (unregularized)
    unregLrName = "LR (None)"
    unregLrGrid = get_parameter_grid(unregLrName)
    # fill in
    lrClf = LogisticRegression(max_iter=500)
    perfDict, bestParamDict = eval_searchcv(unregLrName, lrClf, unregLrGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, bestParamDict)
    # logistic regression (L1)
    print("Tuning Logistic Regression (Lasso) --------")
    lassoLrName = "LR (L1)"
    lassoLrGrid = get_parameter_grid(lassoLrName)
    # fill in
    lassoClf = LogisticRegression(penalty='l1', solver='liblinear', max_iter=500)
    perfDict, bestParamDict = eval_searchcv(lassoLrName, lassoClf, lassoLrGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, bestParamDict)
    # Logistic regression (L2)
    print("Tuning Logistic Regression (Ridge) --------")
    ridgeLrName = "LR (L2)"
    ridgeLrGrid = get_parameter_grid(ridgeLrName)
    # fill in
    ridgeClf = LogisticRegression(penalty='l2', max_iter=500)
    perfDict, bestParamDict = eval_searchcv(ridgeLrName, ridgeClf, ridgeLrGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, bestParamDict)
    # k-nearest neighbors
    print("Tuning K-nearest neighbors --------")
    knnName = "KNN"
    knnGrid = get_parameter_grid(knnName)
    # fill in
    knnClf = KNeighborsClassifier()
    perfDict, bestParamDict = eval_searchcv(knnName, knnClf, knnGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, bestParamDict)
    # neural networks
    print("Tuning neural networks --------")
    nnName = "NN"
    nnGrid = get_parameter_grid(nnName)
    # fill in
    nnClf = MLPClassifier(max_iter=200, early_stopping=True)
    perfDict, bestParamDict = eval_searchcv(nnName, nnClf, nnGrid,
                                                   xTrain, yTrain, xTest, yTest,
                                                   perfDict, bestParamDict)
    perfDF = pd.DataFrame.from_dict(perfDict, orient='index')
    print(perfDF)
    # store the best parameters
    with open('parameters', 'w') as f:
        json.dump(bestParamDict, f)

    # return the model with the best AUC
    bestModel = perfDF['AUC'].idxmax()
    print(f"Best Model: {bestModel}")
    print(f"Best Parameters: {bestParamDict[bestModel.split(' ')[0]]}")


In [246]:
evaluate_models()

Tuning Decision Tree --------
Tuning Unregularized Logistic Regression --------
Tuning Logistic Regression (Lasso) --------
Tuning Logistic Regression (Ridge) --------
Tuning K-nearest neighbors --------
Tuning neural networks --------
                         AUC     AUPRC        F1       Time
DT (Random)         0.790388  0.576409  0.731753  27.075988
LR (None) (Random)  0.903309  0.718162  0.834571  13.655216
LR (L1) (Random)    0.912704  0.737486  0.813294   4.934810
LR (L2) (Random)    0.903309  0.718162  0.834571  14.697934
KNN (Random)        0.608475  0.394447  0.544352  38.304248
NN (Random)         0.917002  0.738619  0.821818  24.958419
Best Model: NN (Random)
Best Parameters: {'Random': {'hidden_layer_sizes': (50, 50), 'alpha': 0.001}}


In [255]:
# selected_model = MLPClassifier(max_iter=200, early_stopping=True, hidden_layer_sizes=(50, 50), alpha=0.01) # youtube
selected_model = MLPClassifier(max_iter=200, early_stopping=True, hidden_layer_sizes=(50, 50), alpha=0.001) # yelp

## Identify Common Misclassified Patterns

In [256]:
selected_model.fit(xTrain, yTrain)

In [257]:
def get_misclassified_samples(model, xTest, yTest):
    yPred = model.predict(xTest)
    misclassified_indices = np.where(yPred != yTest)[0]
    
    misclassified_samples = xTest.iloc[misclassified_indices].copy()
    misclassified_samples['True Label'] = yTest.iloc[misclassified_indices].values
    misclassified_samples['Predicted Label'] = yPred[misclassified_indices]
    
    return misclassified_samples

In [258]:
misclassified = get_misclassified_samples(selected_model, xTest, yTest)
misclassified_df = pd.DataFrame(misclassified)

In [259]:
# convert misclassified features from vectorized form to original text
def vector_to_text(vectorized_comments, vectorizer):
    feature_names = vectorizer.get_feature_names_out()
    comments = []
    for index, row in vectorized_comments.iterrows():
        comment = " ".join([feature_names[i] for i in range(len(row)) if row[i] > 0])
        comments.append(comment)
    return comments

In [260]:
def get_most_common_words(df, n=100):
    all_comments = ' '.join(df['text'])
    words = all_comments.split()
    word_counts = pd.Series(words).value_counts()
    return word_counts.head(n)

In [261]:
misclassified_comments = vector_to_text(misclassified_df.drop(columns=['True Label', 'Predicted Label']), vectorizer)
misclassified_df['text'] = misclassified_comments
most_common_words = get_most_common_words(misclassified_df)

In [None]:
# Save most common words to a CSV file
most_common_words_df = pd.DataFrame(most_common_words).reset_index()
most_common_words_df.columns = ['Word', 'Count']
most_common_words_df.to_csv(f'most_common_words_{dataset}.csv', index=False)