In [76]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import joblib

In [15]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/guinn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/guinn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/guinn/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
df = pd.read_excel("./cleaned_reviews.xlsx")
df.head()

Unnamed: 0,brand,model,price,Ratings,processed_reviews,Sentiment
0,Hey Dude,Wally Linen Natural,$64.99,5,bought son love wear almost everi day definit ...,Positive
1,Hey Dude,Wally Linen Natural,$64.99,5,get lot compliment,Positive
2,Hey Dude,Wally Linen Natural,$64.99,5,love dude,Positive
3,Hey Dude,Wally Linen Natural,$64.99,5,love,Positive
4,Hey Dude,Wally Linen Natural,$64.99,5,probabl favorit,Positive


In [23]:
# Preprocessing pipeline adapted from zappos_cleandata
class TextPreprocessor(BaseEstimator, TransformerMixin):
    
    # Preprocessing function
    @staticmethod
    def preprocess_text(text):
        if not isinstance(text, str):
            return ""


        # Convert to lowercase
        text = text.lower()

        # Remove punctuation and special characters
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

        # Tokenization using the correct 'punkt' resource
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word not in stop_words]

        # Stemming (Porter Stemmer)
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

        return ' '.join(stemmed_tokens)  # Return cleaned text as a string
    
    # Fit function does nothing but must return a value for fit_fransform to work in the pipeline
    def fit(self, X, y=None):
        return self
    
    # Overwrite the transform function like in the sklearn documentation on TransformerMixin
    def transform(self, X):
        return [TextPreprocessor.preprocess_text(text) for text in X]

In [65]:
# Encode the labels
label_encoder = LabelEncoder()
df['encoded_sentiment'] = label_encoder.fit_transform(df['Sentiment'])
df.head()

Unnamed: 0,brand,model,price,Ratings,processed_reviews,Sentiment,sentiment,Encoded_sentiment,encoded_sentiment
0,Hey Dude,Wally Linen Natural,$64.99,5,bought son love wear almost everi day definit ...,Positive,2,2,2
1,Hey Dude,Wally Linen Natural,$64.99,5,get lot compliment,Positive,2,2,2
2,Hey Dude,Wally Linen Natural,$64.99,5,love dude,Positive,2,2,2
3,Hey Dude,Wally Linen Natural,$64.99,5,love,Positive,2,2,2
4,Hey Dude,Wally Linen Natural,$64.99,5,probabl favorit,Positive,2,2,2


In [66]:
# Split the data into train and test
X = df['processed_reviews']
y = df['encoded_sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(y_train), len(X_test), len(y_test)

(2367, 2367, 592, 592)

In [67]:
# Create a pipeline, tfidf vectorizer to convert comments to features
pipeline_naives = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

In [68]:
# Train the model
pipeline.fit(X_train, y_train)

In [69]:
# Evaluating the model
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.7989864864864865
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00        39
           2       0.80      1.00      0.89       473

    accuracy                           0.80       592
   macro avg       0.27      0.33      0.30       592
weighted avg       0.64      0.80      0.71       592



In [70]:
# Logistic regression
pipeline_regression = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

In [71]:
# Train the model
pipeline_regression.fit(X_train, y_train)

In [72]:
# Evaluating the model
y_pred = pipeline_regression.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8243243243243243
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.23      0.33        80
           1       0.50      0.03      0.05        39
           2       0.84      0.99      0.91       473

    accuracy                           0.82       592
   macro avg       0.65      0.41      0.43       592
weighted avg       0.78      0.82      0.77       592



In [73]:
# Linear Support vector machines
pipeline_linear_svc = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('tfidf', TfidfVectorizer()),
    ('classifier', LinearSVC())
])

In [74]:
# Train the model
pipeline_linear_svc.fit(X_train, y_train)

In [75]:
# Evaluating the model
y_pred = pipeline_linear_svc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.831081081081081
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.39      0.47        80
           1       0.23      0.13      0.16        39
           2       0.88      0.96      0.92       473

    accuracy                           0.83       592
   macro avg       0.57      0.49      0.52       592
weighted avg       0.80      0.83      0.81       592



In [77]:
# Save the pipeline and label encoder to a file
joblib.dump(pipeline, "linear_svc_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']