In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score

In [None]:
# Fetch the NLTK data used further down: the stop-word corpus read by
# review_cleaner and the 'punkt' tokenizer data used for word tokenization.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#**Logistic Regression with TF-IDF Vectorization**




##**Step 1 - Data Preparation**

I first load the data into a pandas DataFrame and then split it into training, validation and test sets.

In [None]:
# Load the review data from the CSV file into a pandas DataFrame
df = pd.read_csv('Review.csv')

# Keep only the 'Review' and 'Rating' columns. Rows with a missing value are
# dropped *before* sampling so that every rating contributes exactly `val` usable
# rows (dropping them after sampling would leave the classes slightly unbalanced).
df_filtered = df[['Review', 'Rating']].dropna()

# Ratings kept for the binary sentiment task; 3-star reviews are left out
sentiment_ratings = [1, 2, 4, 5]

# Per-rating sample size: the size of the smallest kept rating class, so that
# sample(n=val) can never ask for more rows than a class contains.
# (.loc raises a KeyError if one of the ratings is absent from the data.)
val = int(df_filtered['Rating'].value_counts().loc[sentiment_ratings].min())

# Draw `val` random reviews for each kept rating to balance the dataset
df_rating_1 = df_filtered[df_filtered['Rating'] == 1].sample(n=val, random_state=1)
df_rating_2 = df_filtered[df_filtered['Rating'] == 2].sample(n=val, random_state=1)
df_rating_4 = df_filtered[df_filtered['Rating'] == 4].sample(n=val, random_state=1)
df_rating_5 = df_filtered[df_filtered['Rating'] == 5].sample(n=val, random_state=1)

# Combine the samples into a single DataFrame
balanced_df = pd.concat([df_rating_1, df_rating_2, df_rating_4, df_rating_5])

# Relabel the ratings
# 1 and 2 -> neg, 4 and 5 -> pos
def relabel_rating(rating):
    """Map a star rating to a sentiment label.

    Returns 'neg' for ratings 1 and 2, 'pos' for ratings 4 and 5, and None for
    any other value (no such value is sampled above).
    """
    if rating in [1, 2]:
        return 'neg'
    elif rating in [4, 5]:
        return 'pos'
    return None

# Apply the relabeling function to the 'Rating' column
balanced_df['Rating'] = balanced_df['Rating'].apply(relabel_rating)

# Reset the index last so the final frame has a gap-free 0..n-1 index
balanced_df = balanced_df.reset_index(drop=True)

# Display the first few rows of the final DataFrame
print(balanced_df.head())

                                              Review Rating
0  Our 2008 Town & Country shuts off while drivin...    neg
1  I purchased this new in 2012 and paid cash for...    neg
2  Update:  12/28/2019 - GPS/INFOTAINMENT SCREEN ...    neg
3  I thought I was getting a good deal. A mint fu...    neg
4  I have had a rattle in my new VW atlas after t...    neg


In [None]:
#Creating a randomized vector whose length is the length of our dataset
#(fixed seed so the same shuffle is produced on every run)
total_count = balanced_df.shape[0]
np.random.seed(0)
shuffle = np.random.permutation(total_count)

#Splitting the dataset into 'x' and 'y', so that it can be used in our model
#'x' represents the reviews (column 0) and 'y' represents the sentiments (column 1)
x = balanced_df.iloc[shuffle, 0]
y = balanced_df.iloc[shuffle, 1]

#splitting the shuffled dataset into training, validation and test sets
#in a 60:20:20 ratio
split = int(total_count * 0.6) + 1  #end of the training part (first ~60%)
split_2 = int(total_count *0.8) + 1  #end of the validation part (next ~20%)
x_train = x[:split]
y_train = y[:split]

x_val = x[split:split_2]
y_val = y[split:split_2]

#everything after split_2 (the last ~20%) is the test set
x_test = x[split_2:]
y_test = y[split_2:]



---


The **review_cleaner** function lowercases a review, tokenizes it, removes English stop words and stems the remaining tokens. It returns the cleaned tokens joined back into a single string, and is used as the TF-IDF preprocessor inside GridSearchCV.

In [None]:
# Filled on the first call to review_cleaner and reused afterwards. The function
# is called once per document on every fit, so re-reading the stop-word corpus and
# building a new stemmer on each call is wasted work.
_REVIEW_CLEANER_CACHE = {}


def review_cleaner(review):
  """Lowercase, tokenize, remove English stop words and stem a review.

  Args:
    review: the raw review text (a string).

  Returns:
    A single string with the stemmed, non-stop-word tokens joined by spaces.
  """
  if not _REVIEW_CLEANER_CACHE:
    # frozenset: constant-time membership test instead of scanning a list per token
    _REVIEW_CLEANER_CACHE['stopwords'] = frozenset(nltk.corpus.stopwords.words("english"))
    _REVIEW_CLEANER_CACHE['stemmer'] = PorterStemmer()
  stopwords = _REVIEW_CLEANER_CACHE['stopwords']
  porter = _REVIEW_CLEANER_CACHE['stemmer']

  # Lowercase first so the stop-word comparison is case-insensitive
  words = nltk.word_tokenize(review.lower())
  # Stop words are filtered before stemming, then the tokens are re-joined
  return ' '.join(porter.stem(word) for word in words if word not in stopwords)



---
##**Step 2 - Hyperparameter Tuning using GridSearchCV**


I used the **GridSearchCV** function from sklearn library to perform hyperparameter tuning and choose the optimal values for n-gram size, C and l1 ratio.





In [None]:
# Hyperparameter grid: TF-IDF n-gram range, inverse regularisation strength C and
# elastic-net mixing ratio (3 x 4 x 3 = 36 candidates)
param_distributions = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'model__C': [0.01, 0.1, 1, 10],
    'model__l1_ratio': [0.1, 0.5, 0.9]
}

# Pipeline definition. random_state is fixed because the 'saga' solver shuffles
# the data, which otherwise makes the scores differ from run to run.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=review_cleaner, max_features = 7000)),
    ('model', LogisticRegression(max_iter=200, solver='saga', penalty = 'elasticnet', random_state=0))
])

# Exhaustive grid search (the variable keeps the name `random_search` because
# later cells refer to it)
random_search = GridSearchCV(
    pipeline,
    param_grid=param_distributions,
    scoring='f1_weighted',
    refit=True,
    cv=3,  # 3-fold cross-validation
    verbose=2,
)

# Fit on the training split only (the validation split is not used here)
random_search.fit(x_train, y_train)

# Output best parameters and score
print("Best parameters found:", random_search.best_params_)
print("Best F1 score:", random_search.best_score_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 1); total time= 1.4min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 1); total time= 1.4min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 1); total time= 1.4min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 2); total time= 1.5min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 2); total time= 1.5min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 2); total time= 1.5min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 3); total time= 1.8min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 3); total time= 1.6min
[CV] END model__C=0.01, model__l1_ratio=0.1, tfidf__ngram_range=(1, 3); total time= 1.6min
[CV] END model__C=0.01, model__l1_ratio=0.5, tfidf__ngram_range=(1, 1); total time= 1.4min
[CV] END model__C=0.01, mode



[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 1); total time= 2.8min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 1); total time= 2.8min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 1); total time= 2.8min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 2); total time= 3.0min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 2); total time= 3.0min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 2); total time= 3.0min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 3); total time= 3.2min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 3); total time= 3.2min




[CV] END model__C=10, model__l1_ratio=0.9, tfidf__ngram_range=(1, 3); total time= 3.1min
Best parameters found: {'model__C': 1, 'model__l1_ratio': 0.1, 'tfidf__ngram_range': (1, 3)}
Best F1 score: 0.9205718749216212




---


##**Step 3 - Model fitting, prediction and performance scores**

To save time, the candidate models were tuned on the training set only. Once the optimal hyperparameters were found, the model with those parameters was refit on the training and validation sets combined.

In [None]:
# Refit on x_train + x_val with best parameters
best_model = random_search.best_estimator_
best_model.fit(np.concatenate([x_train, x_val]), np.concatenate([y_train, y_val]))

In [None]:
# Score the refitted pipeline on the held-out test split
y_test_pred = best_model.predict(x_test)
new_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
new_test_f1_score = f1_score(y_true=y_test, y_pred=y_test_pred, average='weighted')

print("Test F1 Score:", new_test_f1_score)
print("Test Accuracy:", new_test_accuracy)

Test F1 Score: 0.9289015252487914
Test Accuracy: 0.9289045614833352




---


Since the fits for C = 10, l1_ratio = 0.9 hit the iteration limit before converging, I re-ran the search for that setting with a higher iteration cap (max_iter = 500).

In [None]:
# Only the n-gram range is searched here; C and l1_ratio are fixed in the pipeline
param_distributions = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
}

# Pipeline definition with C = 10 and l1_ratio = 0.9 fixed and a higher iteration
# cap. random_state is fixed because the 'saga' solver shuffles the data, which
# otherwise makes the scores differ from run to run.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=review_cleaner, max_features = 7000)),
    ('model', LogisticRegression(max_iter=500, solver='saga', penalty = 'elasticnet', C = 10, l1_ratio = 0.9, random_state=0))
])

# Exhaustive grid search (the variable keeps the name `random_search` because
# the next cell refers to it)
random_search = GridSearchCV(
    pipeline,
    param_grid=param_distributions,
    scoring='f1_weighted',
    refit=True,
    cv=3,  # 3-fold cross-validation
    verbose=2,
)

# Fit on the training and validation splits combined
random_search.fit(np.concatenate([x_train, x_val]), np.concatenate([y_train, y_val]))

# Output best parameters and score
print("Best parameters found:", random_search.best_params_)
print("Best F1 score:", random_search.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ..........................tfidf__ngram_range=(1, 1); total time= 4.7min
[CV] END ..........................tfidf__ngram_range=(1, 1); total time= 4.5min
[CV] END ..........................tfidf__ngram_range=(1, 1); total time= 4.5min
[CV] END ..........................tfidf__ngram_range=(1, 2); total time= 4.9min
[CV] END ..........................tfidf__ngram_range=(1, 2); total time= 4.7min
[CV] END ..........................tfidf__ngram_range=(1, 2); total time= 4.7min
[CV] END ..........................tfidf__ngram_range=(1, 3); total time= 5.1min
[CV] END ..........................tfidf__ngram_range=(1, 3); total time= 4.7min
[CV] END ..........................tfidf__ngram_range=(1, 3); total time= 5.1min
Best parameters found: {'tfidf__ngram_range': (1, 2)}
Best F1 score: 0.9173417245573581


In [None]:
# Score the refit winner of the second search on the held-out test split
model_2 = random_search.best_estimator_
y_test_pred_2 = model_2.predict(x_test)
poss_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_2)
poss_test_f1_score = f1_score(y_true=y_test, y_pred=y_test_pred_2, average='weighted')

print("Test F1 Score:", poss_test_f1_score)
print("Test Accuracy:", poss_test_accuracy)

Test F1 Score: 0.919119421843731
Test Accuracy: 0.9191218291434422




---


Used ROC-AUC on best_model because it had better results than model_2

In [None]:
# Predicted probability of the 'pos' class for each test review. The column is
# looked up by label instead of a hard-coded index, so it cannot silently pick
# the wrong class.
pos_column = list(best_model.classes_).index('pos')
y_test_proba = best_model.predict_proba(x_test)[:, pos_column]
# ROC-AUC on the test set. The target is binary ('neg'/'pos'), so the
# multiclass-only options (multi_class, average) are not passed.
test_roc_auc = roc_auc_score(np.array(y_test), y_test_proba)
print("Test ROC-AUC Score:", test_roc_auc)

Test ROC-AUC Score: 0.9784484922795849




---




#**Linear SVM Model**

##**Step 1 - Re-split the data**

I re-split the data into just training and test sets. Tuning will take more time, but it makes use of all the data points at our disposal.

In [None]:
#Creating a randomized vector whose length is the length of our dataset
#(same seed as the earlier three-way split, so for the same dataset length the
#same permutation is produced)
total_count = balanced_df.shape[0]
np.random.seed(0)
shuffle = np.random.permutation(total_count)

#Splitting the dataset into 'x' and 'y', so that it can be used in our model
#'x' represents the reviews (column 0) and 'y' represents the sentiments (column 1)
x = balanced_df.iloc[shuffle, 0]
y = balanced_df.iloc[shuffle, 1]

#splitting the dataset in training and testing sets in a 80:20 ratio
#NOTE: this overwrites x_train / y_train / x_test / y_test from the earlier split
split = int(total_count *0.8) + 1
x_train = x[:split]
y_train = y[:split]

x_test = x[split:]
y_test = y[split:]

##**Step 2 - Hyperparameter tuning using GridSearchCV**

In [None]:
# Parameter grid with n-gram range, penalty, and C values (3 x 2 x 4 = 24 candidates)
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],    # Unigram, bigram, and trigram
    'model__penalty': ['l1', 'l2'],                    # 'l1' requires dual=False in LinearSVC
    'model__C': [0.01, 0.1, 1, 10],                    # Regularization parameter
}

# Pipeline setup. dual=False is set explicitly: the 'l1' penalty is only supported
# by the primal formulation, and on scikit-learn versions where dual defaults to
# True the 'l1' candidates would fail to fit and be scored as NaN. The primal
# form is also valid for the 'l2' candidates.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=review_cleaner, max_features=7000)),  # review_cleaner does the text cleaning
    ('model', LinearSVC(max_iter=2000, dual=False))
])

# Grid search with 3-fold cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',
    refit=True,
    cv=3,          # 3-fold cross-validation
    verbose=2,
)

# Fit the grid search on training data
grid_search.fit(x_train, y_train)

# Output best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 1); total time= 2.2min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 1); total time= 2.0min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 1); total time= 2.1min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 2); total time= 2.1min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 2); total time= 2.2min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 2); total time= 2.1min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 3); total time= 2.3min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 3); total time= 2.3min
[CV] END model__C=0.01, model__penalty=l1, tfidf__ngram_range=(1, 3); total time= 2.3min
[CV] END model__C=0.01, model__penalty=l2, tfidf__ngram_range=(1, 1); total time= 2.0min
[CV] END model__C=0.01, model__penalty=l2, tfidf_

##**Step 3 - Model fitting, prediction and performance scores**

In [None]:
# Score the tuned SVM pipeline on the held-out test split
svm_best_model = grid_search.best_estimator_
y_svmtest_pred = svm_best_model.predict(x_test)
svm_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_svmtest_pred)
svm_test_f1_score = f1_score(y_true=y_test, y_pred=y_svmtest_pred, average='weighted')

print("Test F1 Score:", svm_test_f1_score)
print("Test Accuracy:", svm_test_accuracy)

# decision_function returns signed decision scores (not probabilities); they are
# used as the ranking scores for ROC-AUC on the test set
y_svmtest_proba = svm_best_model.decision_function(x_test)
svmtest_roc_auc = roc_auc_score(
    np.array(y_test),
    y_svmtest_proba,
)
print("Test ROC-AUC Score:", svmtest_roc_auc)

Test F1 Score: 0.9308350385574501
Test Accuracy: 0.9308383574109885
Test ROC-AUC Score: 0.9790935985355163




---



#**Logistic Regression using custom Word2Vec Embeddings**

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import re



# Preprocess and tokenize the documents
def preprocess_text(text):
    """Lowercase a document, strip non-letters and split it into word tokens.

    Args:
        text: the raw document (a string).

    Returns:
        A list of lowercase, letters-only tokens.
    """
    text = text.lower()
    # Drop apostrophes so contractions stay a single token ("don't" -> "dont")
    text = re.sub(r"['’]", '', text)
    # Replace every other non-letter with a space rather than deleting it;
    # deleting glues neighbouring words together ("car.it" -> "carit")
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    tokens = word_tokenize(text)
    return tokens

# Tokenize each document of the training split (x_train) with preprocess_text;
# the test split is not used to build the embeddings
tokenized_documents = [preprocess_text(reviews) for reviews in x_train]

# Train a Word2Vec model on the tokenized data
# NOTE(review): no seed is passed, so the vectors may differ from run to run — confirm
w2v_model = Word2Vec(
    sentences=tokenized_documents,
    vector_size=300,      # Dimensionality of the word vectors
    window=5,             # Context window size
    min_count=2,          # Ignores words with total frequency lower than this
    workers=4,            # Number of worker threads
    sg=1                  # Use skip-gram (1) instead of CBOW (0)
)

# Save the model for future use (it is loaded again from this file in the next cell)
w2v_model.save("custom_word2vec_model.model")

# Sanity check: access the embedding for a specific word
word_vector = w2v_model.wv['car']  # Get the vector for the word 'car'

# Sanity check: list the words closest to a given word
similar_words = w2v_model.wv.most_similar('car', topn=5)
print(similar_words)

[('vehicle', 0.7477720379829407), ('vechile', 0.7187308669090271), ('carit', 0.7181493043899536), ('cari', 0.6730164885520935), ('itit', 0.6641161441802979)]


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec


# Custom transformer to average Word2Vec embeddings
class Word2VecAveraging(BaseEstimator, TransformerMixin):
    """Turn each document into the mean of its Word2Vec word vectors.

    Parameters:
        w2v_model: a trained Word2Vec model; its `wv` attribute is used for lookups.
        vector_size: length of the zero vector returned for documents with no
            in-vocabulary word (should match the model's vector size).
    """

    def __init__(self, w2v_model, vector_size=300):
        self.w2v_model = w2v_model
        self.vector_size = vector_size

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the transformer works in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return an array with one averaged vector (one row) per document in X."""
        rows = [self._average_word_vectors(text) for text in X]
        if not rows:
            # Keep a 2-D (0, vector_size) shape for empty input
            return np.empty((0, self.vector_size))
        return np.array(rows)

    def _average_word_vectors(self, text):
        """Average the vectors of the in-vocabulary tokens of one document."""
        # Tokenize with preprocess_text, the same function that built the Word2Vec
        # training corpus. A plain text.split() keeps capitals and punctuation
        # ("Car," / "car."), so those tokens would never match the vocabulary.
        words = preprocess_text(text)
        keyed_vectors = self.w2v_model.wv
        word_vecs = [keyed_vectors[word] for word in words if word in keyed_vectors]

        # If no words in the text are in the Word2Vec vocab, return a zero vector
        if not word_vecs:
            return np.zeros(self.vector_size)

        # Average the word vectors
        return np.mean(word_vecs, axis=0)

# Load the Word2Vec model that was trained and saved in the previous cell
w2v_model = Word2Vec.load("custom_word2vec_model.model")

# Define the pipeline: averaged word vectors -> logistic regression.
# random_state is fixed because the 'saga' solver shuffles the data, which
# otherwise makes the scores differ from run to run.
pipeline = Pipeline([
    ('word2vec_avg', Word2VecAveraging(w2v_model=w2v_model, vector_size=300)),
    ('model', LogisticRegression(max_iter=500, solver='saga', random_state=0))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'model__C': [0.01, 0.1, 1, 10, 100]  # Inverse regularization strength for logistic regression
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='roc_auc',  # Or other metrics such as 'accuracy', 'f1_weighted'
    cv=3,               # 3-fold cross-validation
    verbose=2,
)

# Fit GridSearchCV on training data
grid_search.fit(x_train, y_train)

# Output the best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best ROC AUC Score:", grid_search.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ......................................model__C=0.01; total time=  11.8s
[CV] END ......................................model__C=0.01; total time=  11.7s
[CV] END ......................................model__C=0.01; total time=  12.6s
[CV] END .......................................model__C=0.1; total time=  12.7s
[CV] END .......................................model__C=0.1; total time=  12.6s
[CV] END .......................................model__C=0.1; total time=  12.8s
[CV] END .........................................model__C=1; total time=  16.6s
[CV] END .........................................model__C=1; total time=  16.8s
[CV] END .........................................model__C=1; total time=  18.9s
[CV] END ........................................model__C=10; total time=  43.5s
[CV] END ........................................model__C=10; total time=  43.1s
[CV] END ........................................



[CV] END .......................................model__C=100; total time= 1.4min




[CV] END .......................................model__C=100; total time= 1.4min




[CV] END .......................................model__C=100; total time= 1.4min
Best parameters found: {'model__C': 10}
Best ROC AUC Score: 0.9546711668563103


In [None]:
# Score the tuned Word2Vec + logistic regression pipeline on the held-out test split
w2v_best_model = grid_search.best_estimator_
y_w2vtest_pred = w2v_best_model.predict(x_test)
w2v_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_w2vtest_pred)
w2v_test_f1_score = f1_score(y_true=y_test, y_pred=y_w2vtest_pred, average='weighted')

print("Test F1 Score:", w2v_test_f1_score)
print("Test Accuracy:", w2v_test_accuracy)

# decision_function returns signed decision scores (not probabilities); they are
# used as the ranking scores for ROC-AUC on the test set
y_w2vtest_proba = w2v_best_model.decision_function(x_test)
w2vtest_roc_auc = roc_auc_score(
    np.array(y_test),
    y_w2vtest_proba,
)
print("Test ROC-AUC Score:", w2vtest_roc_auc)

Test F1 Score: 0.9030757745154149
Test Accuracy: 0.9030826982140826
Test ROC-AUC Score: 0.955628382563383
