# Vader Lexicon + Random Forest

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.sentiment import SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
import nltk
import string
import re
import torch
from sklearn.metrics import f1_score, classification_report
from scipy.sparse import hstack
import optuna
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aminm\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aminm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aminm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#Load Datasets
train_balanced = pd.read_csv("C:/Users/aminm/OneDrive/Desktop/Uni MA Master/2. Semester/Web Mining/Web Mining Project/data/balanced/train_balanced.csv")
val_balanced = pd.read_csv("C:/Users/aminm/OneDrive/Desktop/Uni MA Master/2. Semester/Web Mining/Web Mining Project/data/balanced/val_balanced.csv")

train_stratified = pd.read_csv("C:/Users/aminm/OneDrive/Desktop/Uni MA Master/2. Semester/Web Mining/Web Mining Project/data/stratified/train_stratified.csv")
val_stratified = pd.read_csv("C:/Users/aminm/OneDrive/Desktop/Uni MA Master/2. Semester/Web Mining/Web Mining Project/data/stratified/val_stratified.csv")

test = pd.read_csv("C:/Users/aminm/OneDrive/Desktop/Uni MA Master/2. Semester/Web Mining/Web Mining Project/data/test.csv")

## Text Preprocessing

In [3]:
def remove_punctuation(text):
    text = str(text)
    punctiations = string.punctuation
    return text.translate(str.maketrans('', '', punctiations))

def remove_spec_char(text):
    text = str(text)
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [4]:
# lowercase words
train_balanced["full_review"] = train_balanced["full_review"].str.lower()
val_balanced["full_review"] = val_balanced["full_review"].str.lower()
train_stratified["full_review"] = train_stratified["full_review"].str.lower()
val_stratified["full_review"] = val_stratified["full_review"].str.lower()
test["full_review"] = test["full_review"].str.lower()
# remove punctuation,...
train_balanced["full_review"] = train_balanced["full_review"].apply(remove_punctuation)
val_balanced["full_review"] = val_balanced["full_review"].apply(remove_punctuation)
train_stratified["full_review"] = train_stratified["full_review"].apply(remove_punctuation)
val_stratified["full_review"] = val_stratified["full_review"].apply(remove_punctuation)
test["full_review"] = test["full_review"].apply(remove_punctuation)
# ...special characters,...
train_balanced["full_review"] = train_balanced["full_review"].apply(remove_spec_char)
val_balanced["full_review"] = val_balanced["full_review"].apply(remove_spec_char)
train_stratified["full_review"] = train_stratified["full_review"].apply(remove_spec_char)
val_stratified["full_review"] = val_stratified["full_review"].apply(remove_spec_char)
test["full_review"] = test["full_review"].apply(remove_spec_char)

In [5]:
# Extract features from review text using word frequency
vectorizer = CountVectorizer(stop_words=stopwords.words('english'))

X_train_b = vectorizer.fit_transform(train_balanced['full_review'])
X_valid_b = vectorizer.transform(val_balanced['full_review'])

X_train_s = vectorizer.fit_transform(train_stratified['full_review'])
X_valid_s = vectorizer.transform(val_stratified['full_review'])

X_test = vectorizer.transform(test['full_review'])

## Apply VADER
Valence Aware Dictionary for Sentiment Reasoning (VADER) is sensitive to both polarity (positive/negative) and intensity (strength) of emotion in texts. It takes a string and returns a dictionary of scores for each of these categories:
   - negative
   - neutral
   - positive
   - compund (computed by normalizing the scores above)

In [6]:
# Extract features from review text using sentiment lexicons
sia = VaderSentimentIntensityAnalyzer()

sentiment_scores_train_b = train_balanced['full_review'].apply(lambda x: sia.polarity_scores(x))
X_train_lexicon_b = pd.DataFrame(list(sentiment_scores_train_b))
sentiment_scores_valid_b = val_balanced['full_review'].apply(lambda x: sia.polarity_scores(x))
X_valid_lexicon_b = pd.DataFrame(list(sentiment_scores_valid_b))

sentiment_scores_train_s = train_stratified['full_review'].apply(lambda x: sia.polarity_scores(x))
X_train_lexicon_s = pd.DataFrame(list(sentiment_scores_train_s))
sentiment_scores_valid_s = val_stratified['full_review'].apply(lambda x: sia.polarity_scores(x))
X_valid_lexicon_s = pd.DataFrame(list(sentiment_scores_valid_s))

sentiment_scores_test = test['full_review'].apply(lambda x: sia.polarity_scores(x))
X_test_lexicon = pd.DataFrame(list(sentiment_scores_test))

In [7]:
X_train_lexicon_b.head()

Unnamed: 0,neg,neu,pos,compound
0,0.155,0.724,0.121,-0.2565
1,0.151,0.849,0.0,-0.6908
2,0.0,1.0,0.0,0.0
3,0.0,0.663,0.337,0.96
4,0.0,0.674,0.326,0.9468


## Add negations
The mark_negation class from the nltk-package highlights text parts which are negated in order to improve performance.

In [11]:
# Function to handle negation in a given sentence
def handle_negation(sentence):
    tokenized_sentence = word_tokenize(sentence)
    negated_words = mark_negation(tokenized_sentence)
    return ' '.join(negated_words)

In [12]:
# Apply negation handling to training, validation, and test sets
train_balanced['full_review'] = train_balanced['full_review'].apply(handle_negation)
val_balanced['ReviewText'] = val_balanced['full_review'].apply(handle_negation)

train_stratified['full_review'] = train_stratified['full_review'].apply(handle_negation)
val_stratified['ReviewText'] = val_stratified['full_review'].apply(handle_negation)

test['full_review'] = test['full_review'].apply(handle_negation)

In [25]:
train_balanced.head(10)

Unnamed: 0,full_review,star_rating
0,only for terminator enthusiasts i gave this ga...,2
1,worked for a few months and the kids tore it w...,2
2,didnt work_NEG but_NEG received_NEG a_NEG full...,2
3,really good graphics for a game from 2005 real...,4
4,was great until the band on top snapped in hal...,2
5,did not work_NEG im_NEG pissed_NEG,1
6,its cool i love thebr game,5
7,my first expirience with tb these headphones a...,2
8,this is nhl 14 with a different coverea youve ...,1
9,putting these on caused the analog stick to lo...,2


In [13]:
# Extract features from negation-handled review text using word frequency
X_train_negation_b = vectorizer.transform(train_balanced['full_review'])
X_valid_negation_b = vectorizer.transform(val_balanced['full_review'])

X_train_negation_s = vectorizer.transform(train_stratified['full_review'])
X_valid_negation_s = vectorizer.transform(val_stratified['full_review'])

X_test_negation = vectorizer.transform(test['full_review'])

## Combining features and creating target variables

In [14]:
# Combine features using sparse matrices
X_train_combined_b = hstack([X_train_lexicon_b, X_train_negation_b])
X_valid_combined_b = hstack([X_valid_lexicon_b, X_valid_negation_b])

X_train_combined_s = hstack([X_train_lexicon_s, X_train_negation_s])
X_valid_combined_s = hstack([X_valid_lexicon_s, X_valid_negation_s])

X_test_combined = hstack([X_test_lexicon, X_test_negation])


In [15]:
# Target variables
y_train_b = train_balanced['star_rating']
y_valid_b = val_balanced['star_rating']

y_train_s = train_stratified['star_rating']
y_valid_s = val_stratified['star_rating']

y_test = test['star_rating']

## Hyperparameter Tuning

In [17]:
def objective(trial, X_train, y_train, X_valid, y_valid):
    # Define the hyperparameters to tune
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'max_features': trial.suggest_int('max_features', 1, 5000)
    }

    # Train a Random Forest model with the current hyperparameters
    model = RandomForestClassifier(**params, n_jobs=-1)
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_pred_valid = model.predict(X_valid)

    # Calculate the F1 score
    f1 = f1_score(y_valid, y_pred_valid, average='micro')

    return f1


study_s = optuna.create_study(direction='maximize')  # Maximize the F1 score for dataset X_train_combined_s
study_b = optuna.create_study(direction='maximize')  # Maximize the F1 score for dataset X_train_combined_b

# Optimize the objective function for dataset X_train_combined_s
study_s.optimize(lambda trial: objective(trial, X_train_combined_s, y_train_s, X_valid_combined_s, y_valid_s),
                 n_trials=20)

# Optimize the objective function for dataset X_train_combined_b
study_b.optimize(lambda trial: objective(trial, X_train_combined_b, y_train_b, X_valid_combined_b, y_valid_b),
                 n_trials=20)

# Print the best F1 score and the best set of hyperparameters for each dataset
print('Best F1 Score for X_train_combined_s:', study_s.best_value)
print('Best Hyperparameters for X_train_combined_s:', study_s.best_params)
print('Best F1 Score for X_train_combined_b:', study_b.best_value)
print('Best Hyperparameters for X_train_combined_b:', study_b.best_params)

[32m[I 2023-05-26 10:47:01,860][0m A new study created in memory with name: no-name-4a0355c3-8c05-40df-b2ec-2e1dcfb56a7d[0m
[32m[I 2023-05-26 10:47:01,862][0m A new study created in memory with name: no-name-a507b9a5-7455-46b7-b722-a9eab854b635[0m
[32m[I 2023-05-26 10:47:06,203][0m Trial 0 finished with value: 0.6965555555555556 and parameters: {'n_estimators': 331, 'max_depth': 6, 'max_features': 2448}. Best is trial 0 with value: 0.6965555555555556.[0m
[32m[I 2023-05-26 10:47:20,696][0m Trial 1 finished with value: 0.7055555555555556 and parameters: {'n_estimators': 296, 'max_depth': 17, 'max_features': 4523}. Best is trial 1 with value: 0.7055555555555556.[0m
[32m[I 2023-05-26 10:47:34,587][0m Trial 2 finished with value: 0.7082222222222222 and parameters: {'n_estimators': 202, 'max_depth': 26, 'max_features': 4360}. Best is trial 2 with value: 0.7082222222222222.[0m
[32m[I 2023-05-26 10:47:41,367][0m Trial 3 finished with value: 0.7031111111111111 and parameters: {

[32m[I 2023-05-26 10:54:50,750][0m Trial 18 finished with value: 0.46444444444444444 and parameters: {'n_estimators': 354, 'max_depth': 28, 'max_features': 1905}. Best is trial 14 with value: 0.4662222222222222.[0m
[32m[I 2023-05-26 10:54:55,250][0m Trial 19 finished with value: 0.45866666666666667 and parameters: {'n_estimators': 278, 'max_depth': 19, 'max_features': 692}. Best is trial 14 with value: 0.4662222222222222.[0m


Best F1 Score for X_train_combined_s: 0.7100000000000001
Best Hyperparameters for X_train_combined_s: {'n_estimators': 361, 'max_depth': 30, 'max_features': 2460}
Best F1 Score for X_train_combined_b: 0.4662222222222222
Best Hyperparameters for X_train_combined_b: {'n_estimators': 388, 'max_depth': 30, 'max_features': 942}


## Model evaluation
### Stratified Dataset

In [21]:
model_s = RandomForestClassifier(n_estimators = 361, max_depth = 30, max_features = 2460)
model_s.fit(X_train_combined_s, y_train_s)

y_pred_s = model_s.predict(X_test_combined)

print("F1-Score:", f1_score(y_test, y_pred_s, average="micro"))

print(classification_report(y_test, y_pred_s))

F1-Score: 0.7154873914541072
              precision    recall  f1-score   support

           1       0.52      0.65      0.58     15453
           2       0.17      0.00      0.01      6680
           3       0.42      0.07      0.12     11173
           4       0.37      0.03      0.06     22197
           5       0.75      0.97      0.85    109980

    accuracy                           0.72    165483
   macro avg       0.45      0.35      0.32    165483
weighted avg       0.63      0.72      0.63    165483



### Balanced Dataset

In [22]:
model_b = RandomForestClassifier(n_estimators = 388, max_depth = 30, max_features = 942)
model_b.fit(X_train_combined_b, y_train_b)

y_pred_b = model_b.predict(X_test_combined)

print("F1-Score:", f1_score(y_test, y_pred_b, average="micro"))

print(classification_report(y_test, y_pred_b))

F1-Score: 0.5799085102397226
              precision    recall  f1-score   support

           1       0.43      0.72      0.54     15453
           2       0.16      0.27      0.20      6680
           3       0.22      0.27      0.24     11173
           4       0.25      0.39      0.30     22197
           5       0.89      0.65      0.75    109980

    accuracy                           0.58    165483
   macro avg       0.39      0.46      0.41    165483
weighted avg       0.69      0.58      0.62    165483

