<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/K-Means_and_supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# K-Means clustering with supervised learning algorithms

The output of the K-Means clustering algorithm (together with the original dataset) is used as input for a Random Forest and an Extreme Gradient Boosting algorithm.

# **Random Forest**

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [None]:
# Load in the data
drive.mount('/content/drive')
df_final = pd.read_csv('/content/drive/My Drive/Thesis/Data/df_final_K-Means_clustering4.csv')

df_final = df_final.astype(np.int32)

Mounted at /content/drive


In [None]:
# Define feature and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': [100, 200, 250, 300],
    'max_depth': [20, 30, 40, 50],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric
f1_weighted_scorer = make_scorer(f1_score, average='weighted')


In [None]:
# Define evaluation functions
def evaluate_model(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    return acc, prec, rec, f1

def evaluate_model_print(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    print(f"{dataset_name} Set Performance:")
    print(f"Accuracy: {acc:.2f} ± {np.std([acc]):.2f}")
    print(f"Precision: {prec:.2f} ± {np.std([prec]):.2f}")
    print(f"Recall: {rec:.2f} ± {np.std([rec]):.2f}")
    print(f"F1 Score: {f1:.2f} ± {np.std([f1]):.2f}")

    return acc, prec, rec, f1

def print_average_performance(scores, dataset_name=""):
    scores = np.array(scores)
    print(f"{dataset_name} Set Performance (Averaged):")
    print(f"Accuracy: {scores[:, 0].mean():.2f} ± {scores[:, 0].std():.2f}")
    print(f"Precision: {scores[:, 1].mean():.2f} ± {scores[:, 1].std():.2f}")
    print(f"Recall: {scores[:, 2].mean():.2f} ± {scores[:, 2].std():.2f}")
    print(f"F1 Score: {scores[:, 3].mean():.2f} ± {scores[:, 3].std():.2f}")

def print_hyperparameter_performance(cv_results):
    print("Average F1 scores for all hyperparameters tried:")
    for params, mean_score, std_score in zip(cv_results['params'], cv_results['mean_test_score'], cv_results['std_test_score']):
        print(f"Parameters: {params}, Average F1 Score: {mean_score:.2f} ± {std_score:.2f}")

# Initialize the Randomized Search
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_distributions=param_dist,
    n_iter=15,
    cv=inner_cv,
    scoring=f1_weighted_scorer
)

# Define output lists
outer_scores = []
val_scores = []
train_scores = []
best_params_list = []

# Perform nested CV
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the model on the training fold
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Save the best parameters for this fold
    best_params_list.append(random_search.best_params_)

    # Evaluate on the training set
    train_scores.append(evaluate_model(best_model, X_train_fold, y_train_fold, "Training"))

    # Evaluate on the validation set
    val_scores.append(evaluate_model(best_model, X_val_fold, y_val_fold, "Validation"))

    # Store the outer fold score (f1_weighted)
    outer_scores.append(random_search.best_score_)

# Print the performance of the nested CV
print(f"Nested Cross-Validation F1 Score: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")
print("")
print("Average metrics for all folds:")
# Print averaged training set performance for each fold
print_average_performance(train_scores, "Training")
# Print averaged validation set performance for each fold
print_average_performance(val_scores, "Validation")

# Print the best parameters found during hyperparameter tuning
print("Best parameters found during hyperparameter tuning:")
for i, params in enumerate(best_params_list):
    print(f"Fold {i+1}: {params}")

# Print average F1 scores for all hyperparameters tried
print_hyperparameter_performance(random_search.cv_results_)

# Define the best parameters found in nested CV
best_params = best_params_list[np.argmax(outer_scores)]

# Train the best model with class_weight='balanced'
best_model = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
best_model.fit(X_train, y_train)

print("Evaluation of best performing model:")
# Evaluate on the training set
evaluate_model_print(best_model, X_train, y_train, "Training")
# Evaluate on the test set
evaluate_model_print(best_model, X_test, y_test, "Test")

# Print the parameters for the best-performing model
print("Parameters for the best performing model:")
print(best_params)

# Save the best performing model
joblib_file = "/content/drive/My Drive/Thesis/Models/K-Means_RF.pkl"
joblib.dump(best_model, joblib_file)

  _data = np.array(data, dtype=dtype, copy=copy,


Nested Cross-Validation F1 Score: 0.40 ± 0.00

Average metrics for all folds:
Training Set Performance (Averaged):
Accuracy: 0.98 ± 0.01
Precision: 0.98 ± 0.01
Recall: 0.98 ± 0.01
F1 Score: 0.98 ± 0.01
Validation Set Performance (Averaged):
Accuracy: 0.41 ± 0.00
Precision: 0.40 ± 0.00
Recall: 0.41 ± 0.00
F1 Score: 0.40 ± 0.00
Best parameters found during hyperparameter tuning:
Fold 1: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}
Fold 2: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}
Fold 3: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}
Fold 4: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 30}
Fold 5: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 40}
Average F1 scores for all hyperparameters tried:
Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}, Av

['/content/drive/My Drive/Thesis/Models/K-Means_RF.pkl']

# **Extreme Gradient Boosting**

In [None]:
pip install xgboost



In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [None]:
# Load in the data
drive.mount('/content/drive')
df_final = pd.read_csv('/content/drive/My Drive/Thesis/Data/df_final_K-Means_clustering4.csv')

df_final = df_final.astype(np.int32)

Mounted at /content/drive


In [None]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the "Rating" column
df_final['Rating'] = label_encoder.fit_transform(df_final['Rating'])

In [None]:
# Define feature and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

# Define the parameter grid for tuning
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

In [None]:
# Define evaluation functions
def evaluate_model(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    return acc, prec, rec, f1

def evaluate_model_print(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    print(f"{dataset_name} Set Performance:")
    print(f"Accuracy: {acc:.2f} ± {np.std([acc]):.2f}")
    print(f"Precision: {prec:.2f} ± {np.std([prec]):.2f}")
    print(f"Recall: {rec:.2f} ± {np.std([rec]):.2f}")
    print(f"F1 Score: {f1:.2f} ± {np.std([f1]):.2f}")

    return acc, prec, rec, f1

def print_average_performance(scores, dataset_name=""):
    scores = np.array(scores)
    print(f"{dataset_name} Set Performance (Averaged):")
    print(f"Accuracy: {scores[:, 0].mean():.2f} ± {scores[:, 0].std():.2f}")
    print(f"Precision: {scores[:, 1].mean():.2f} ± {scores[:, 1].std():.2f}")
    print(f"Recall: {scores[:, 2].mean():.2f} ± {scores[:, 2].std():.2f}")
    print(f"F1 Score: {scores[:, 3].mean():.2f} ± {scores[:, 3].std():.2f}")

def print_hyperparameter_performance(cv_results):
    print("Average F1 scores for all hyperparameters tried:")
    for params, mean_score, std_score in zip(cv_results['params'], cv_results['mean_test_score'], cv_results['std_test_score']):
        print(f"Parameters: {params}, Average F1 Score: {mean_score:.2f} ± {std_score:.2f}")

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(tree_method='hist', device='cuda', eval_metric='merror'),
    param_distributions=param_dist,
    n_iter=15,
    cv=inner_cv,
    scoring=f1_weighted_scorer,
    n_jobs=-1
)

# Define output lists
outer_scores = []
val_scores = []
train_scores = []
best_params_list = []

# Perform nested CV on the training data
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the model on the training fold
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Save the best parameters for this fold
    best_params_list.append(random_search.best_params_)

    # Evaluate on the training set
    train_scores.append(evaluate_model(best_model, X_train_fold, y_train_fold, "Training"))

    # Evaluate on the validation set
    val_scores.append(evaluate_model(best_model, X_val_fold, y_val_fold, "Validation"))

    # Store the outer fold score (f1_weighted)
    outer_scores.append(random_search.best_score_)

# Print the performance of the nested CV
print(f"Nested Cross-Validation F1 Score: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")
print("")
print("Average metrics for all folds:")
# Print averaged training set performance for each fold
print_average_performance(train_scores, "Training")
# Print averaged validation set performance for each fold
print_average_performance(val_scores, "Validation")

# Print the best parameters found during hyperparameter tuning
print("Best parameters found during hyperparameter tuning:")
for i, params in enumerate(best_params_list):
    print(f"Fold {i+1}: {params}")

# Print average F1 scores for all hyperparameters tried
print_hyperparameter_performance(random_search.cv_results_)

# Define the best parameters found in nested CV
best_params = best_params_list[np.argmax(outer_scores)]

# Train best model on the entire training data
best_model = XGBClassifier(**best_params, tree_method='hist', device='cuda', eval_metric='merror', random_state=42)
best_model.fit(X_train, y_train)

print("Evaluation of best performing model:")
# Evaluate on the training set
evaluate_model_print(best_model, X_train, y_train, "Training")
# Evaluate on the test set
evaluate_model_print(best_model, X_test, y_test, "Test")

# Print the parameters for the best-performing model
print("Parameters for the best performing model:")
print(best_params)

joblib.dump(best_model, "/content/drive/My Drive/Thesis/Models/K-Means_XGB.pkl")

Nested Cross-Validation F1 Score: 0.41 ± 0.00

Average metrics for all folds:
Training Set Performance (Averaged):
Accuracy: 0.60 ± 0.03
Precision: 0.63 ± 0.03
Recall: 0.60 ± 0.03
F1 Score: 0.59 ± 0.03
Validation Set Performance (Averaged):
Accuracy: 0.43 ± 0.00
Precision: 0.43 ± 0.00
Recall: 0.43 ± 0.00
F1 Score: 0.42 ± 0.00
Best parameters found during hyperparameter tuning:
Fold 1: {'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.1}
Fold 2: {'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.2}
Fold 3: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Fold 4: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Fold 5: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Average F1 scores for all hyperparameters tried:
Parameters: {'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.05}, Average F1 Score: 0.39 ± 0.00
Parameters: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}, Average F1 Score: 0.36 ± 0.00
Parameters: {'n_estimato

['/content/drive/My Drive/Thesis/Models/K-Means_XGB.pkl']

# Submission 2

## Optuna

In [2]:
pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23

In [3]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from google.colab import drive
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [4]:
# Load in the data
drive.mount('/content/drive')
df_final = pd.read_csv('/content/drive/My Drive/Thesis/Data/df_final_K-Means_clustering4.csv')

df_final = df_final.astype(np.int32)

Mounted at /content/drive


In [5]:
# Define feature and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': [100, 200, 250, 300],
    'max_depth': [20, 30, 40, 50],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric
f1_weighted_scorer = make_scorer(f1_score, average='weighted')


In [7]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import joblib
import numpy as np

# Define feature and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define evaluation functions
def evaluate_model(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    return acc, prec, rec, f1

def evaluate_model_print(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    print(f"{dataset_name} Set Performance:")
    print(f"Accuracy: {acc:.2f} ± {np.std([acc]):.2f}")
    print(f"Precision: {prec:.2f} ± {np.std([prec]):.2f}")
    print(f"Recall: {rec:.2f} ± {np.std([rec]):.2f}")
    print(f"F1 Score: {f1:.2f} ± {np.std([f1]):.2f}")

    return acc, prec, rec, f1

def print_average_performance(scores, dataset_name=""):
    scores = np.array(scores)
    print(f"{dataset_name} Set Performance (Averaged):")
    print(f"Accuracy: {scores[:, 0].mean():.2f} ± {scores[:, 0].std():.2f}")
    print(f"Precision: {scores[:, 1].mean():.2f} ± {scores[:, 1].std():.2f}")
    print(f"Recall: {scores[:, 2].mean():.2f} ± {scores[:, 2].std():.2f}")
    print(f"F1 Score: {scores[:, 3].mean():.2f} ± {scores[:, 3].std():.2f}")

# Define the objective function for Optuna
def objective(trial):
    param_dist = {
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 250, 300]),
        'max_depth': trial.suggest_int('max_depth', 20, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4)
    }

    model = RandomForestClassifier(class_weight='balanced', random_state=42, **param_dist)
    inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    f1_scores = cross_val_score(model, X_train_fold, y_train_fold, cv=inner_cv, scoring='f1_weighted')
    return np.mean(f1_scores)

# Define the outer cross-validation strategy
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define output lists
outer_scores = []
val_scores = []
train_scores = []
best_params_list = []
best_params_scores_list = []

# Perform nested CV
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Create a study and optimize the objective function for the current fold
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)

    # Get the best hyperparameters for the current fold
    best_params = study.best_params

    # Save the best parameters and their scores for this fold
    best_params_list.append(best_params)
    best_params_scores_list.append((best_params, study.best_value))

    # Train the best model with the best hyperparameters for the current fold
    best_model = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
    best_model.fit(X_train_fold, y_train_fold)

    # Evaluate on the training set
    train_scores.append(evaluate_model(best_model, X_train_fold, y_train_fold))

    # Evaluate on the validation set
    val_scores.append(evaluate_model(best_model, X_val_fold, y_val_fold))

    # Store the outer fold score (f1_weighted)
    outer_scores.append(study.best_value)

# Print the performance of the nested CV
print(f"Nested Cross-Validation F1 Score: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")
print("")
print("Average metrics for all folds:")
# Print averaged training set performance for each fold
print_average_performance(train_scores, "Training")
# Print averaged validation set performance for each fold
print_average_performance(val_scores, "Validation")

# Print the best parameters found during hyperparameter tuning along with their scores
print("Best parameters found during hyperparameter tuning along with their scores:")
for i in range(len(best_params_scores_list)):
    params, score = best_params_scores_list[i]
    print(f"Fold {i+1}: Parameters: {params}, Score: {score:.2f}")

# Define the best parameters found in nested CV
best_params = best_params_list[np.argmax(outer_scores)]

# Train the best model with class_weight='balanced'
best_model = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
best_model.fit(X_train, y_train)

print("Evaluation of best performing model:")
# Evaluate on the training set
evaluate_model_print(best_model, X_train, y_train, "Training")
# Evaluate on the test set
evaluate_model_print(best_model, X_test, y_test, "Test")

# Save the best performing model
joblib_file = "/content/drive/My Drive/Thesis/Models/K-Means_RF_sub2.pkl"
joblib.dump(best_model, joblib_file)

[I 2024-12-23 08:14:10,002] A new study created in memory with name: no-name-3d5e44a2-4411-4447-8d76-edb20202d98d
[W 2024-12-23 08:18:55,994] Trial 0 failed with parameters: {'n_estimators': 250, 'max_depth': 40, 'min_samples_split': 5, 'min_samples_leaf': 2} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-7-57884196f266>", line 59, in objective
    f1_scores = cross_val_score(model, X_train_fold, y_train_fold, cv=inner_cv, scoring='f1_weighted')
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 684, in cross_val_score
    cv_results = cross_validate(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/

KeyboardInterrupt: 

## Successive Halving Search

Random forest

In [8]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [9]:
# Load in the data
drive.mount('/content/drive')
df_final = pd.read_csv('/content/drive/My Drive/Thesis/Data/df_final_K-Means_clustering4.csv')

df_final = df_final.astype(np.int32)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Define feature and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': [100, 200, 250, 300],
    'max_depth': [20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric
f1_weighted_scorer = make_scorer(f1_score, average='weighted')


In [17]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split, KFold
import joblib
import numpy as np

# Define evaluation functions
def evaluate_model(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    return acc, prec, rec, f1

def evaluate_model_print(model, X, y, dataset_name=""):
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    print(f"{dataset_name} Set Performance:")
    print(f"Accuracy: {acc:.2f} ± {np.std([acc]):.2f}")
    print(f"Precision: {prec:.2f} ± {np.std([prec]):.2f}")
    print(f"Recall: {rec:.2f} ± {np.std([rec]):.2f}")
    print(f"F1 Score: {f1:.2f} ± {np.std([f1]):.2f}")

    return acc, prec, rec, f1

def print_average_performance(scores, dataset_name=""):
    scores = np.array(scores)
    print(f"{dataset_name} Set Performance (Averaged):")
    print(f"Accuracy: {scores[:, 0].mean():.2f} ± {scores[:, 0].std():.2f}")
    print(f"Precision: {scores[:, 1].mean():.2f} ± {scores[:, 1].std():.2f}")
    print(f"Recall: {scores[:, 2].mean():.2f} ± {scores[:, 2].std():.2f}")
    print(f"F1 Score: {scores[:, 3].mean():.2f} ± {scores[:, 3].std():.2f}")

def print_hyperparameter_performance(cv_results):
    print("Average F1 scores for all hyperparameters tried:")
    for params, mean_score, std_score in zip(cv_results['params'], cv_results['mean_test_score'], cv_results['std_test_score']):
        print(f"Parameters: {params}, Average F1 Score: {mean_score:.2f} ± {std_score:.2f}")

# Initialize the Successive Halving Search
halving_search = HalvingRandomSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_distributions=param_dist,
    factor=3,
    resource='max_samples',
    max_resources=len(X_train),
    min_resources=100,
    cv=inner_cv,
    scoring=f1_weighted_scorer,
    random_state=42
)

# Define output lists
outer_scores = []
val_scores = []
train_scores = []
best_params_list = []

# Perform nested CV
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the model on the training fold
    halving_search.fit(X_train_fold, y_train_fold)
    best_model = halving_search.best_estimator_

    # Save the best parameters for this fold
    best_params_list.append(halving_search.best_params_)

    # Evaluate on the training set
    train_scores.append(evaluate_model(best_model, X_train_fold, y_train_fold))

    # Evaluate on the validation set
    val_scores.append(evaluate_model(best_model, X_val_fold, y_val_fold))

    # Store the outer fold score (f1_weighted)
    outer_scores.append(halving_search.best_score_)

# Print the performance of the nested CV
print(f"Nested Cross-Validation F1 Score: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")
print("")
print("Average metrics for all folds:")
# Print averaged training set performance for each fold
print_average_performance(train_scores, "Training")
# Print averaged validation set performance for each fold
print_average_performance(val_scores, "Validation")

# Print the best parameters found during hyperparameter tuning
print("Best parameters found during hyperparameter tuning:")
for i in range(len(best_params_list)):
    params = best_params_list[i]
    print(f"Fold {i+1}: Parameters: {params}")

# Print average F1 scores for all hyperparameters tried
print_hyperparameter_performance(halving_search.cv_results_)

# Define the best parameters found in nested CV
best_params = best_params_list[np.argmax(outer_scores)]

# Train the best model with class_weight='balanced'
best_model = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
best_model.fit(X_train, y_train)

print("Evaluation of best performing model:")
# Evaluate on the training set
evaluate_model_print(best_model, X_train, y_train, "Training")
# Evaluate on the test set
evaluate_model_print(best_model, X_test, y_test, "Test")

# Save the best performing model
joblib_file = "/content/drive/My Drive/Thesis/Models/K-Means_RF.pkl"
joblib.dump(best_model, joblib_file)



Nested Cross-Validation F1 Score: 0.37 ± 0.00

Average metrics for all folds:
Training Set Performance (Averaged):
Accuracy: 0.43 ± 0.01
Precision: 0.43 ± 0.01
Recall: 0.43 ± 0.01
F1 Score: 0.43 ± 0.01
Validation Set Performance (Averaged):
Accuracy: 0.38 ± 0.00
Precision: 0.37 ± 0.00
Recall: 0.38 ± 0.00
F1 Score: 0.37 ± 0.00
Best parameters found during hyperparameter tuning:
Fold 1: Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 40, 'max_samples': 8100}
Fold 2: Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 40, 'max_samples': 8100}
Fold 3: Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 40, 'max_samples': 8100}
Fold 4: Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 30, 'max_samples': 8100}
Fold 5: Parameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 30, 'max

['/content/drive/My Drive/Thesis/Models/K-Means_RF.pkl']