In [1]:
import nltk
import pandas as pd
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import re
from sklearn.metrics import f1_score
import optuna

In [2]:
# Load datasets
# Every CSV lives under the same project data folder. Keeping that root in one
# constant means the notebook can be pointed at another location by editing a
# single line instead of five. The resolved file paths are unchanged.
DATA_DIR = "C:/Users/aminm/OneDrive/Desktop/Uni MA Master/2. Semester/Web Mining/Web Mining Project/data"

# Train/validation pair read from the "balanced" sub-folder
train_balanced = pd.read_csv(f"{DATA_DIR}/balanced/train_balanced.csv")
val_balanced = pd.read_csv(f"{DATA_DIR}/balanced/val_balanced.csv")

# Train/validation pair read from the "stratified" sub-folder
train_stratified = pd.read_csv(f"{DATA_DIR}/stratified/train_stratified.csv")
val_stratified = pd.read_csv(f"{DATA_DIR}/stratified/val_stratified.csv")

# Single test file used to evaluate both final models
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [3]:
# Fetch the NLTK resources used in this notebook (a no-op when already present).
#   punkt                      -> word_tokenize
#   stopwords                  -> the stop-word set built below
#   wordnet + sentiwordnet     -> nltk.corpus.sentiwordnet (swn), used for the
#                                 sentiment scores; without these two a fresh
#                                 machine raises LookupError on the first
#                                 senti_synsets() call
#   averaged_perceptron_tagger -> kept from the original cell; not used by the
#                                 cells visible here
for _resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords',
                  'wordnet', 'sentiwordnet'):
    nltk.download(_resource)

# English stop words as a set for O(1) membership tests in preprocess_text
stop_words = set(nltk.corpus.stopwords.words('english'))

def remove_spec_char(text):
    """Replace every non-alphanumeric character with a space and collapse whitespace.

    Parameters
    ----------
    text : any
        Raw review value. Non-string values are converted with ``str()``.
        Missing values (``None`` or float NaN) yield an empty string instead
        of the literal words "None"/"nan", which would otherwise be treated
        as review text downstream.

    Returns
    -------
    str
        The text containing only ASCII letters, digits and single spaces.
        Leading/trailing spaces are NOT stripped (e.g. "hello!" -> "hello ").
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    text = re.sub(r'\s+', ' ', text)
    return text

def preprocess_text(text):
    """Lower-case and tokenize `text`, then keep only alphabetic, non-stop-word tokens.

    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        # Drop anything that is not purely alphabetic, then drop stop words
        if word.isalpha() and word not in stop_words:
            kept.append(word)
    return ' '.join(kept)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aminm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aminm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aminm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Apply remove_spec_char to the review text of every split.
# The cleaned text overwrites the 'full_review' column of each frame.
for _frame in (train_balanced, val_balanced,
               train_stratified, val_stratified,
               test):
    _frame['full_review'] = _frame['full_review'].apply(remove_spec_char)

In [5]:
# Apply preprocess_text to the review text of every split.
# As in the previous step, the 'full_review' column is overwritten in each frame.
for _frame in (train_balanced, val_balanced,
               train_stratified, val_stratified,
               test):
    _frame['full_review'] = _frame['full_review'].apply(preprocess_text)

In [6]:
def get_sentiment_score(word):
    """Return the polarity of `word` according to SentiWordNet.

    Only the first synset returned for the word is considered; its score is
    the positive score minus the negative score. Words with no synset get 0.
    """
    first_synset = next(iter(swn.senti_synsets(word)), None)
    if first_synset is None:
        return 0
    return first_synset.pos_score() - first_synset.neg_score()

def calculate_sentiment_score(text):
    """Sum the per-token sentiment scores of `text`.

    The text is tokenized with word_tokenize and each token is scored with
    get_sentiment_score; an empty token list yields 0.
    """
    total = 0
    for token in word_tokenize(text):
        total += get_sentiment_score(token)
    return total

In [7]:
# Add a 'sentiment_score' column to every split, computed from the
# already-preprocessed 'full_review' text.
for _frame in (train_balanced, val_balanced,
               train_stratified, val_stratified,
               test):
    _frame['sentiment_score'] = _frame['full_review'].apply(calculate_sentiment_score)

In [8]:
# Preview the first rows of the balanced training split to check the cleaned
# 'full_review' text and the newly added 'sentiment_score' column.
train_balanced.head()

Unnamed: 0,full_review,star_rating,sentiment_score
0,terminator enthusiasts gave game stars like te...,2,0.0
1,worked months kids tore worked months kids tor...,2,0.25
2,didnt work received full refund,2,0.0
3,really good graphics game really good graphics...,4,3.875
4,great band top snapped half great band top sna...,2,0.75


In [16]:
# Prepare the feature and target variables
# The only feature is the scalar sentiment score. scikit-learn expects a 2-D
# (n_samples, n_features) matrix, hence the reshape to a single column.
# .to_numpy() is used instead of .array so that the reshape happens on a plain
# NumPy ndarray rather than on a pandas extension array.
X_train_bal = train_balanced["sentiment_score"].to_numpy().reshape(-1, 1)
y_train_bal = train_balanced["star_rating"]
X_val_bal = val_balanced["sentiment_score"].to_numpy().reshape(-1, 1)
y_val_bal = val_balanced["star_rating"]

X_train_strat = train_stratified["sentiment_score"].to_numpy().reshape(-1, 1)
y_train_strat = train_stratified["star_rating"]
X_val_strat = val_stratified["sentiment_score"].to_numpy().reshape(-1, 1)
y_val_strat = val_stratified["star_rating"]

X_test = test["sentiment_score"].to_numpy().reshape(-1, 1)
y_test = test["star_rating"]

In [12]:
#stratified

# Fixed seed: both the random forest and Optuna's sampler are stochastic, so
# without it every run explores different trials and reports different scores.
RANDOM_STATE = 42

# Define the objective function for Optuna
# NOTE: the balanced cell further down defines another function that is also
# named `objective` and rebinds this name. When cells run top to bottom this
# study has already finished by then.
def objective(trial):
    """Train a random forest on the stratified training split and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Micro-averaged F1 score on the stratified validation split
        (the value Optuna maximizes).
    """
    # Define the search space of hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)

    # Initialize the random forest classifier with the sampled hyperparameters
    clf_strat = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=RANDOM_STATE,
    )

    # Train the classifier on the training set
    clf_strat.fit(X_train_strat, y_train_strat)

    # Evaluate the classifier on the validation set.
    # NOTE: on a single-label target, micro-averaged F1 is numerically the
    # same as accuracy.
    y_pred_strat = clf_strat.predict(X_val_strat)
    f1 = f1_score(y_val_strat, y_pred_strat, average='micro')

    return f1


# Create the Optuna study and optimize the objective function.
# TPESampler is Optuna's default sampler; passing it explicitly only adds the seed.
study_strat = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_strat.optimize(objective, n_trials=50)

[32m[I 2023-05-23 21:52:47,101][0m A new study created in memory with name: no-name-d1e8925f-35dc-4fb1-aea7-2ea87bad1841[0m
[32m[I 2023-05-23 21:52:48,625][0m Trial 0 finished with value: 0.6624444444444444 and parameters: {'n_estimators': 271, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 0 with value: 0.6624444444444444.[0m
[32m[I 2023-05-23 21:52:50,194][0m Trial 1 finished with value: 0.6625555555555556 and parameters: {'n_estimators': 293, 'max_depth': 7, 'min_samples_split': 5}. Best is trial 1 with value: 0.6625555555555556.[0m
[32m[I 2023-05-23 21:52:51,790][0m Trial 2 finished with value: 0.6634444444444444 and parameters: {'n_estimators': 340, 'max_depth': 5, 'min_samples_split': 10}. Best is trial 2 with value: 0.6634444444444444.[0m
[32m[I 2023-05-23 21:52:53,720][0m Trial 3 finished with value: 0.664 and parameters: {'n_estimators': 481, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 3 with value: 0.664.[0m
[32m[I 2023-05-23 21:52:54,385]

[32m[I 2023-05-23 21:54:38,679][0m Trial 40 finished with value: 0.6625555555555556 and parameters: {'n_estimators': 794, 'max_depth': 6, 'min_samples_split': 7}. Best is trial 19 with value: 0.6641111111111111.[0m
[32m[I 2023-05-23 21:54:41,516][0m Trial 41 finished with value: 0.664 and parameters: {'n_estimators': 655, 'max_depth': 4, 'min_samples_split': 5}. Best is trial 19 with value: 0.6641111111111111.[0m
[32m[I 2023-05-23 21:54:45,362][0m Trial 42 finished with value: 0.6636666666666666 and parameters: {'n_estimators': 841, 'max_depth': 5, 'min_samples_split': 5}. Best is trial 19 with value: 0.6641111111111111.[0m
[32m[I 2023-05-23 21:54:48,337][0m Trial 43 finished with value: 0.6641111111111111 and parameters: {'n_estimators': 672, 'max_depth': 4, 'min_samples_split': 6}. Best is trial 19 with value: 0.6641111111111111.[0m
[32m[I 2023-05-23 21:54:49,780][0m Trial 44 finished with value: 0.6637777777777778 and parameters: {'n_estimators': 346, 'max_depth': 4, '

In [17]:
# Get the best hyperparameters and retrain the model on the training set.
# A fixed random_state is passed because an unseeded forest draws fresh
# randomness on every fit, which makes the reported test score change from
# run to run.
best_params_strat = study_strat.best_params
best_clf_strat = RandomForestClassifier(**best_params_strat, random_state=42)
best_clf_strat.fit(X_train_strat, y_train_strat)

# Evaluate the final model on the test set
y_pred_test_strat = best_clf_strat.predict(X_test)
f1_test = f1_score(y_test, y_pred_test_strat, average='micro')
print("Final F1 score on test set:", f1_test)

Final F1 score on test set: 0.6674341170996416


In [18]:
#balanced

# Fixed seed: both the random forest and Optuna's sampler are stochastic, so
# without it every run explores different trials and reports different scores.
RANDOM_STATE = 42

# Define the objective function for Optuna
# NOTE: this rebinds the name `objective` that the stratified cell above also
# defines. From here on `objective` refers to the balanced variant.
def objective(trial):
    """Train a random forest on the balanced training split and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Micro-averaged F1 score on the balanced validation split
        (the value Optuna maximizes).
    """
    # Define the search space of hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)

    # Initialize the random forest classifier with the sampled hyperparameters
    clf_bal = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=RANDOM_STATE,
    )

    # Train the classifier on the training set
    clf_bal.fit(X_train_bal, y_train_bal)

    # Evaluate the classifier on the validation set.
    # NOTE: on a single-label target, micro-averaged F1 is numerically the
    # same as accuracy.
    y_pred_bal = clf_bal.predict(X_val_bal)
    f1 = f1_score(y_val_bal, y_pred_bal, average='micro')

    return f1


# Create the Optuna study and optimize the objective function.
# TPESampler is Optuna's default sampler; passing it explicitly only adds the seed.
study_bal = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_bal.optimize(objective, n_trials=50)

[32m[I 2023-05-23 21:58:53,283][0m A new study created in memory with name: no-name-1ff20d24-6c7a-4b6d-9bcb-0408c3f0d476[0m
[32m[I 2023-05-23 21:58:56,543][0m Trial 0 finished with value: 0.27055555555555555 and parameters: {'n_estimators': 628, 'max_depth': 7, 'min_samples_split': 5}. Best is trial 0 with value: 0.27055555555555555.[0m
[32m[I 2023-05-23 21:59:01,253][0m Trial 1 finished with value: 0.26911111111111113 and parameters: {'n_estimators': 885, 'max_depth': 8, 'min_samples_split': 7}. Best is trial 0 with value: 0.27055555555555555.[0m
[32m[I 2023-05-23 21:59:05,872][0m Trial 2 finished with value: 0.2703333333333333 and parameters: {'n_estimators': 869, 'max_depth': 7, 'min_samples_split': 2}. Best is trial 0 with value: 0.27055555555555555.[0m
[32m[I 2023-05-23 21:59:08,223][0m Trial 3 finished with value: 0.27055555555555555 and parameters: {'n_estimators': 505, 'max_depth': 4, 'min_samples_split': 10}. Best is trial 0 with value: 0.27055555555555555.[0m


[32m[I 2023-05-23 22:00:18,886][0m Trial 39 finished with value: 0.273 and parameters: {'n_estimators': 155, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 10 with value: 0.273.[0m
[32m[I 2023-05-23 22:00:19,817][0m Trial 40 finished with value: 0.2687777777777778 and parameters: {'n_estimators': 164, 'max_depth': 10, 'min_samples_split': 6}. Best is trial 10 with value: 0.273.[0m
[32m[I 2023-05-23 22:00:21,139][0m Trial 41 finished with value: 0.27244444444444443 and parameters: {'n_estimators': 239, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 10 with value: 0.273.[0m
[32m[I 2023-05-23 22:00:22,383][0m Trial 42 finished with value: 0.272 and parameters: {'n_estimators': 230, 'max_depth': 5, 'min_samples_split': 10}. Best is trial 10 with value: 0.273.[0m
[32m[I 2023-05-23 22:00:23,185][0m Trial 43 finished with value: 0.27044444444444443 and parameters: {'n_estimators': 156, 'max_depth': 6, 'min_samples_split': 8}. Best is trial 10 with value: 0.273.[

In [21]:
# Get the best hyperparameters and retrain the model on the training set.
# A fixed random_state is passed because an unseeded forest draws fresh
# randomness on every fit, which makes the reported test score change from
# run to run.
best_params_bal = study_bal.best_params
best_clf_bal = RandomForestClassifier(**best_params_bal, random_state=42)
best_clf_bal.fit(X_train_bal, y_train_bal)

# Evaluate the final model on the test set.
# NOTE: `f1_test` is the same name the stratified evaluation cell assigns, so
# this overwrites that earlier value.
y_pred_test = best_clf_bal.predict(X_test)
f1_test = f1_score(y_test, y_pred_test, average='micro')
print("Final F1 score on test set:", f1_test)

Final F1 score on test set: 0.4514965283443012
