In [64]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import joblib

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/guinn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/guinn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/guinn/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
df = pd.read_excel("./cleaned_zappos_men.xlsx")
df.head()

Unnamed: 0,brand,model,price,reviews,Ratings
0,Hey Dude,Wally Linen Natural,$64.99,Bought these for my son and he LOVES them!!! H...,5
1,Hey Dude,Wally Linen Natural,$64.99,I get lots of compliments for these .,5
2,Hey Dude,Wally Linen Natural,$64.99,I love my Dude's!,5
3,Hey Dude,Wally Linen Natural,$64.99,Love these!!,5
4,Hey Dude,Wally Linen Natural,$64.99,Probably my favorite,5


In [6]:
df["Ratings"].value_counts()

Ratings
5    2127
4     254
3     215
1     184
2     181
Name: count, dtype: int64

In [91]:
# Set target size (average between max & min) for hybrid sampling
target_size = df['Ratings'].value_counts().min()
target_size

np.int64(181)

In [92]:
df_balanced = df.groupby("Ratings", group_keys=False)[[col for col in df.columns]].apply(lambda x: x.sample(target_size, replace=len(x) < target_size))

In [93]:
df_balanced.head()

Unnamed: 0,brand,model,price,reviews,Ratings
276,Merrell Work,Moab 3 Response Tactical,$125.46,I'm on my feet all day and thought I would giv...,1
2841,Birkenstock,Bend - Leather (Unisex),$160.00,"I am a long time Birkenstock wearer, still hav...",1
1355,Propet,Parker,$97.71,"I sent them back along with another pair, both...",1
643,etnies,Barge LS,$64.95,Enjoyed these shoes until they fell apart. I k...,1
2500,Hoka,Clifton L Athletics,$149.95,This is my 4th pair of Hoka's. This shoe is ve...,1


In [94]:
df_balanced["Ratings"].value_counts()

Ratings
1    181
2    181
3    181
4    181
5    181
Name: count, dtype: int64

In [95]:
def create_sentiment(rating):
    match rating:
        case 1:
            return "Very Negative"
        case 2:
            return "Negative"
        case 3:
            return "Neutral"
        case 4:
            return "Positive"
        case 5:
            return "Very Positive"
        case _:
            return "Invalid Score"

In [96]:
df_balanced["sentiments"] = df["Ratings"].apply(create_sentiment)
df_balanced.head()

Unnamed: 0,brand,model,price,reviews,Ratings,sentiments
276,Merrell Work,Moab 3 Response Tactical,$125.46,I'm on my feet all day and thought I would giv...,1,Very Negative
2841,Birkenstock,Bend - Leather (Unisex),$160.00,"I am a long time Birkenstock wearer, still hav...",1,Very Negative
1355,Propet,Parker,$97.71,"I sent them back along with another pair, both...",1,Very Negative
643,etnies,Barge LS,$64.95,Enjoyed these shoes until they fell apart. I k...,1,Very Negative
2500,Hoka,Clifton L Athletics,$149.95,This is my 4th pair of Hoka's. This shoe is ve...,1,Very Negative


In [97]:
# Preprocessing pipeline adapted from zappos_cleandata
class TextPreprocessor(BaseEstimator, TransformerMixin):
    
    # Preprocessing function
    @staticmethod
    def preprocess_text(text):
        if not isinstance(text, str):
            return ""


        # Convert to lowercase
        text = text.lower()

        # Remove punctuation and special characters
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

        # Tokenization using the correct 'punkt' resource
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word not in stop_words]

        # Stemming (Porter Stemmer)
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

        return ' '.join(stemmed_tokens)  # Return cleaned text as a string
    
    # Fit function does nothing but must return a value for fit_fransform to work in the pipeline
    def fit(self, X, y=None):
        return self
    
    # Overwrite the transform function like in the sklearn documentation on TransformerMixin
    def transform(self, X):
        return [TextPreprocessor.preprocess_text(text) for text in X]

In [98]:
# Encode the labels
label_encoder = LabelEncoder()
df_balanced['encoded_sentiment'] = label_encoder.fit_transform(df_balanced['sentiments'])
# df_balanced["encoded_sentiment"].value_counts()
df_balanced.head()

Unnamed: 0,brand,model,price,reviews,Ratings,sentiments,encoded_sentiment
276,Merrell Work,Moab 3 Response Tactical,$125.46,I'm on my feet all day and thought I would giv...,1,Very Negative,3
2841,Birkenstock,Bend - Leather (Unisex),$160.00,"I am a long time Birkenstock wearer, still hav...",1,Very Negative,3
1355,Propet,Parker,$97.71,"I sent them back along with another pair, both...",1,Very Negative,3
643,etnies,Barge LS,$64.95,Enjoyed these shoes until they fell apart. I k...,1,Very Negative,3
2500,Hoka,Clifton L Athletics,$149.95,This is my 4th pair of Hoka's. This shoe is ve...,1,Very Negative,3


In [99]:
label_encoder.inverse_transform([3])

array(['Very Negative'], dtype=object)

In [100]:
# Split the data into train and test
X = df_balanced['reviews']
y = df_balanced['encoded_sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(y_train), len(X_test), len(y_test)

(724, 724, 181, 181)

In [101]:
# Create a pipeline, tfidf vectorizer to convert comments to features
pipeline_naives = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

In [102]:
# Train the model
pipeline_naives.fit(X_train, y_train)

In [103]:
# Evaluating the model
y_pred = pipeline_naives.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.425414364640884
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.26      0.35        43
           1       0.24      0.38      0.29        32
           2       0.33      0.50      0.40        34
           3       0.46      0.34      0.39        35
           4       0.71      0.68      0.69        37

    accuracy                           0.43       181
   macro avg       0.47      0.43      0.43       181
weighted avg       0.48      0.43      0.43       181



In [104]:
# Logistic regression
pipeline_regression = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

In [105]:
# Train the model
pipeline_regression.fit(X_train, y_train)

In [106]:
# Evaluating the model
y_pred = pipeline_regression.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.43646408839779005
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.26      0.35        43
           1       0.27      0.31      0.29        32
           2       0.36      0.47      0.41        34
           3       0.39      0.46      0.42        35
           4       0.67      0.70      0.68        37

    accuracy                           0.44       181
   macro avg       0.45      0.44      0.43       181
weighted avg       0.46      0.44      0.43       181



In [107]:
# Linear Support vector machines
pipeline_linear_svc = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('classifier', LinearSVC())
])

In [108]:
# Train the model
pipeline_linear_svc.fit(X_train, y_train)

In [109]:
# Evaluating the model
y_pred = pipeline_linear_svc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.430939226519337
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.33      0.41        43
           1       0.26      0.31      0.29        32
           2       0.36      0.47      0.41        34
           3       0.39      0.34      0.36        35
           4       0.63      0.70      0.67        37

    accuracy                           0.43       181
   macro avg       0.44      0.43      0.43       181
weighted avg       0.45      0.43      0.43       181



In [110]:
# Save the pipeline and label encoder to a file
joblib.dump(pipeline_linear_svc, "linear_svc_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']