# **Random Forest baseline complete**

In [None]:
import numpy as np
import pandas as pd
import joblib
from google.colab import drive
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Mount Google Drive, read the dataset CSV and cast every column to 32-bit integers
drive.mount('/content/drive')
df_final = pd.read_csv(
    "/content/drive/My Drive/Thesis/Data/df_final_2.csv"
).astype(np.int32)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Separate the 'Rating' target from the remaining feature columns
y = df_final['Rating']
X = df_final.drop(columns=['Rating'])

# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate values the randomized search samples from
param_dist = {
    'n_estimators': [100, 200, 250, 300],
    'max_depth': [20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Scorer used for model selection: weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Shuffled, seeded splitters: 5 outer folds for evaluation, 3 inner folds for tuning
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)


In [None]:
# Define evaluation functions
def evaluate_model(model, X, y, dataset_name=""):
    """Score ``model`` on ``(X, y)`` without printing anything.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature rows passed to ``model.predict``.
        y: True labels compared against the predictions.
        dataset_name: Accepted for signature parity with
            ``evaluate_model_print``; not used here.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
        F1 use ``average='weighted'``.
    """
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    weighted = [
        metric(y, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    ]
    return (accuracy, *weighted)

def evaluate_model_print(model, X, y, dataset_name=""):
    """Score ``model`` on ``(X, y)``, print the metrics and return them.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature rows passed to ``model.predict``.
        y: True labels compared against the predictions.
        dataset_name: Label used in the printed header line.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
        F1 use ``average='weighted'``.
    """
    predictions = model.predict(X)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average='weighted')
    rec = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')

    # Each metric here is a single value from one evaluation, so there is no
    # spread to report: np.std of a one-element list is always 0 and printing
    # "± 0.00" would wrongly suggest a measured variance.
    print(f"{dataset_name} Set Performance:")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print(f"F1 Score: {f1:.2f}")

    return acc, prec, rec, f1

def print_average_performance(scores, dataset_name=""):
    """Print mean ± standard deviation of each metric across folds.

    Args:
        scores: Sequence of per-fold ``(accuracy, precision, recall, f1)``
            rows; converted to a 2-D array and indexed by column.
        dataset_name: Label used in the printed header line.
    """
    score_table = np.array(scores)
    print(f"{dataset_name} Set Performance (Averaged):")
    # Column order mirrors the tuple layout of the per-fold rows.
    for column, label in enumerate(("Accuracy", "Precision", "Recall", "F1 Score")):
        fold_values = score_table[:, column]
        print(f"{label}: {fold_values.mean():.2f} ± {fold_values.std():.2f}")

def print_hyperparameter_performance(cv_results):
    """Print the mean ± std test score of every candidate in ``cv_results``.

    Args:
        cv_results: Mapping with parallel ``'params'``, ``'mean_test_score'``
            and ``'std_test_score'`` sequences.
    """
    print("Average F1 scores for all hyperparameters tried:")
    candidates = zip(
        cv_results['params'],
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
    )
    for candidate, f1_mean, f1_std in candidates:
        print(f"Parameters: {candidate}, Average F1 Score: {f1_mean:.2f} ± {f1_std:.2f}")

# Initialize the Randomized Search with class_weight='balanced'.
# random_state fixes which 15 candidates are drawn, so every outer fold (and
# every rerun) compares the same hyperparameter settings.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_distributions=param_dist,
    n_iter=15,
    cv=inner_cv,
    scoring=f1_weighted_scorer,
    random_state=42
)

# Define output lists
outer_scores = []       # weighted F1 of each tuned model on its held-out outer fold
inner_best_scores = []  # best mean inner-CV F1 per outer fold (used to pick final params)
val_scores = []
train_scores = []
best_params_list = []

# Perform nested CV
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Tune on the training fold only (inner CV happens inside the search)
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Save the best parameters for this fold and the inner-CV score that chose them
    best_params_list.append(random_search.best_params_)
    inner_best_scores.append(random_search.best_score_)

    # Evaluate on the training set
    train_scores.append(evaluate_model(best_model, X_train_fold, y_train_fold, "Training"))

    # Evaluate on the validation set
    fold_val_scores = evaluate_model(best_model, X_val_fold, y_val_fold, "Validation")
    val_scores.append(fold_val_scores)

    # The nested-CV estimate must come from data the search never saw.
    # best_score_ is the inner-CV mean used for selection, so it is not an
    # outer-fold score; use the held-out F1 (index 3 of the metrics tuple).
    outer_scores.append(fold_val_scores[3])

# Print the performance of the nested CV
print(f"Nested Cross-Validation F1 Score: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")
print("")
print("Average metrics for all folds:")
# Print averaged training set performance for each fold
print_average_performance(train_scores, "Training")
# Print averaged validation set performance for each fold
print_average_performance(val_scores, "Validation")

# Print the best parameters found during hyperparameter tuning
print("Best parameters found during hyperparameter tuning:")
for i, params in enumerate(best_params_list):
    print(f"Fold {i+1}: {params}")

# Print average F1 scores for all hyperparameters tried.
# random_search is refit in every outer fold, so cv_results_ here holds the
# inner-CV results of the LAST outer fold only.
print_hyperparameter_performance(random_search.cv_results_)

# Pick the final parameters by the inner-CV score (same criterion as before),
# keeping the outer-fold scores purely for performance estimation.
best_params = best_params_list[np.argmax(inner_best_scores)]

# Train the best model with class_weight='balanced'
best_model = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
best_model.fit(X_train, y_train)

print("Evaluation of best performing model:")
# Evaluate on the training set
evaluate_model_print(best_model, X_train, y_train, "Training")
# Evaluate on the test set
evaluate_model_print(best_model, X_test, y_test, "Test")

# Print the parameters for the best-performing model
print("Parameters for the best performing model:")
print(best_params)

# Save the best performing model
joblib_file = "/content/drive/My Drive/Thesis/Models/Baseline_RF.pkl"
joblib.dump(best_model, joblib_file)

  _data = np.array(data, dtype=dtype, copy=copy,


Nested Cross-Validation F1 Score: 0.40 ± 0.00

Average metrics for all folds:
Training Set Performance (Averaged):
Accuracy: 0.98 ± 0.01
Precision: 0.98 ± 0.01
Recall: 0.98 ± 0.01
F1 Score: 0.98 ± 0.01
Validation Set Performance (Averaged):
Accuracy: 0.41 ± 0.00
Precision: 0.40 ± 0.00
Recall: 0.41 ± 0.00
F1 Score: 0.40 ± 0.00
Best parameters found during hyperparameter tuning:
Fold 1: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}
Fold 2: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
Fold 3: {'n_estimators': 250, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 40}
Fold 4: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}
Fold 5: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 50}
Average F1 scores for all hyperparameters tried:
Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 20}, A

['/content/drive/My Drive/Thesis/Models/Baseline_RF.pkl']

# **XGBoost baseline, with training output**

In [None]:
pip install xgboost



In [None]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [None]:
# Mount Google Drive, read the dataset CSV and cast every column to 32-bit integers
drive.mount('/content/drive')
df_final = pd.read_csv(
    '/content/drive/My Drive/Thesis/Data/df_final_2.csv'
).astype(np.int32)

Mounted at /content/drive


In [None]:
# Fit a LabelEncoder on the "Rating" column ...
label_encoder = LabelEncoder().fit(df_final['Rating'])

# ... and overwrite the column with the encoded labels
df_final['Rating'] = label_encoder.transform(df_final['Rating'])

In [None]:
# Separate the 'Rating' target from the remaining feature columns
y = df_final['Rating']
X = df_final.drop(columns=['Rating'])

# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate values the randomized search samples from
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Scorer used for model selection: weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Shuffled, seeded splitters: 5 outer folds for evaluation, 3 inner folds for tuning
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

In [None]:
# Define evaluation functions
def evaluate_model(model, X, y, dataset_name=""):
    """Score ``model`` on ``(X, y)`` without printing anything.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature rows passed to ``model.predict``.
        y: True labels compared against the predictions.
        dataset_name: Accepted for signature parity with
            ``evaluate_model_print``; not used here.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
        F1 use ``average='weighted'``.
    """
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    weighted = [
        metric(y, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    ]
    return (accuracy, *weighted)

def evaluate_model_print(model, X, y, dataset_name=""):
    """Score ``model`` on ``(X, y)``, print the metrics and return them.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature rows passed to ``model.predict``.
        y: True labels compared against the predictions.
        dataset_name: Label used in the printed header line.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
        F1 use ``average='weighted'``.
    """
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision, recall, f1_weighted = (
        metric(y, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"{dataset_name} Set Performance:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1_weighted:.2f}")

    return accuracy, precision, recall, f1_weighted

def print_average_performance(scores, dataset_name=""):
    """Print mean ± standard deviation of each metric across folds.

    Args:
        scores: Sequence of per-fold ``(accuracy, precision, recall, f1)``
            rows; converted to a 2-D array and indexed by column.
        dataset_name: Label used in the printed header line.
    """
    score_table = np.array(scores)
    print(f"{dataset_name} Set Performance (Averaged):")
    # Column order mirrors the tuple layout of the per-fold rows.
    for column, label in enumerate(("Accuracy", "Precision", "Recall", "F1 Score")):
        fold_values = score_table[:, column]
        print(f"{label}: {fold_values.mean():.2f} ± {fold_values.std():.2f}")

def print_hyperparameter_performance(cv_results):
    """Print the mean test score of every candidate in ``cv_results``.

    Args:
        cv_results: Mapping with parallel ``'params'`` and
            ``'mean_test_score'`` sequences.
    """
    print("Average F1 scores for all hyperparameters tried:")
    candidates = zip(cv_results['params'], cv_results['mean_test_score'])
    for candidate, f1_mean in candidates:
        print(f"Parameters: {candidate}, Average F1 Score: {f1_mean:.2f}")

# Initialize the RandomizedSearchCV.
# random_state fixes which 15 candidates are drawn, so every outer fold (and
# every rerun) compares the same hyperparameter settings.
# NOTE(review): n_jobs=-1 runs candidate fits in parallel while each fit uses
# device='cuda' — confirm the GPU copes with concurrent fits.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(tree_method='hist', device='cuda', eval_metric='merror'),
    param_distributions=param_dist,
    n_iter=15,
    cv=inner_cv,
    scoring=f1_weighted_scorer,
    n_jobs=-1,
    random_state=42
)

# Define output lists
outer_scores = []       # weighted F1 of each tuned model on its held-out outer fold
inner_best_scores = []  # best mean inner-CV F1 per outer fold (used to pick final params)
val_scores = []
train_scores = []
best_params_list = []

# Perform nested CV on the training data
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Tune on the training fold only (inner CV happens inside the search)
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Save the best parameters for this fold and the inner-CV score that chose them
    best_params_list.append(random_search.best_params_)
    inner_best_scores.append(random_search.best_score_)

    # Evaluate on the training set
    train_scores.append(evaluate_model(best_model, X_train_fold, y_train_fold, "Training"))

    # Evaluate on the validation set
    fold_val_scores = evaluate_model(best_model, X_val_fold, y_val_fold, "Validation")
    val_scores.append(fold_val_scores)

    # The nested-CV estimate must come from data the search never saw.
    # best_score_ is the inner-CV mean used for selection, so it is not an
    # outer-fold score; use the held-out F1 (index 3 of the metrics tuple).
    outer_scores.append(fold_val_scores[3])

# Print the performance of the nested CV
print(f"Nested Cross-Validation F1 Score: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")
print("")
print("Average metrics for all folds:")
# Print averaged training set performance for each fold
print_average_performance(train_scores, "Training")
# Print averaged validation set performance for each fold
print_average_performance(val_scores, "Validation")

# Print the best parameters found during hyperparameter tuning
print("Best parameters found during hyperparameter tuning:")
for i, params in enumerate(best_params_list):
    print(f"Fold {i+1}: {params}")

# Print average F1 scores for all hyperparameters tried.
# random_search is refit in every outer fold, so cv_results_ here holds the
# inner-CV results of the LAST outer fold only.
print_hyperparameter_performance(random_search.cv_results_)

# Pick the final parameters by the inner-CV score (same criterion as before),
# keeping the outer-fold scores purely for performance estimation.
best_params = best_params_list[np.argmax(inner_best_scores)]

# Train best model on the entire training data
best_model = XGBClassifier(**best_params, tree_method='hist', device='cuda', eval_metric='merror', random_state=42)
best_model.fit(X_train, y_train)

print("Evaluation of best performing model:")
# Evaluate on the training set
evaluate_model_print(best_model, X_train, y_train, "Training")
# Evaluate on the test set
evaluate_model_print(best_model, X_test, y_test, "Test")

# Print the parameters for the best-performing model
print("Parameters for the best performing model:")
print(best_params)

# Save the best performing model
joblib.dump(best_model, "/content/drive/My Drive/Thesis/Models/Baseline_XGB_training_info.pkl")

Nested Cross-Validation F1 Score: 0.41 ± 0.01

Average metrics for all folds:
Training Set Performance (Averaged):
Accuracy: 0.56 ± 0.05
Precision: 0.58 ± 0.05
Recall: 0.56 ± 0.05
F1 Score: 0.55 ± 0.06
Validation Set Performance (Averaged):
Accuracy: 0.43 ± 0.01
Precision: 0.43 ± 0.00
Recall: 0.43 ± 0.01
F1 Score: 0.41 ± 0.01
Best parameters found during hyperparameter tuning:
Fold 1: {'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.2}
Fold 2: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2}
Fold 3: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2}
Fold 4: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Fold 5: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.2}
Average F1 scores for all hyperparameters tried:
Parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}, Average F1 Score: 0.38
Parameters: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1}, Average F1 Score: 0.36
Parameters: {'n_estimators': 200, 'max_

['/content/drive/My Drive/Thesis/Models/Baseline_XGB_training_info.pkl']

## XGBoost with compute_sample_weight




In [None]:
pip install xgboost



In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [None]:
# Mount Google Drive, read the dataset CSV and cast every column to 32-bit integers
drive.mount('/content/drive')
df_final = pd.read_csv(
    '/content/drive/My Drive/Thesis/Data/df_final_2.csv'
).astype(np.int32)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Fit a LabelEncoder on the "Rating" column ...
label_encoder = LabelEncoder().fit(df_final['Rating'])

# ... and overwrite the column with the encoded labels
df_final['Rating'] = label_encoder.transform(df_final['Rating'])

In [None]:
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
import numpy as np
import joblib

# Define feature and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Compute 'balanced' per-row sample weights for the training split; the array
# is positionally aligned with X_train / y_train
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Define the parameter grid for tuning
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Initialize the RandomizedSearchCV.
# random_state fixes which 15 candidates are drawn, so every outer fold (and
# every rerun) compares the same hyperparameter settings.
# NOTE(review): n_jobs=-1 runs candidate fits in parallel while each fit uses
# device='cuda' — confirm the GPU copes with concurrent fits.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(tree_method='hist', device='cuda', eval_metric='merror'),
    param_distributions=param_dist,
    n_iter=15,
    cv=inner_cv,
    scoring=f1_weighted_scorer,
    n_jobs=-1,
    random_state=42
)

# Define output lists
outer_scores = []       # weighted F1 of each tuned model on its held-out outer fold
inner_best_scores = []  # best mean inner-CV F1 per outer fold (used to pick final params)
val_scores = []
best_params_list = []

# Perform nested CV on the training data
for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]
    # train_idx is positional, matching the positional layout of sample_weights
    sample_weights_fold = sample_weights[train_idx]

    # Tune on the training fold only (inner CV happens inside the search)
    random_search.fit(X_train_fold, y_train_fold, sample_weight=sample_weights_fold)
    best_model = random_search.best_estimator_

    # Save the best parameters for this fold and the inner-CV score that chose them
    best_params_list.append(random_search.best_params_)
    inner_best_scores.append(random_search.best_score_)

    # Evaluate on the validation set
    val_predictions = best_model.predict(X_val_fold)
    val_acc = accuracy_score(y_val_fold, val_predictions)
    val_prec = precision_score(y_val_fold, val_predictions, average='weighted')
    val_rec = recall_score(y_val_fold, val_predictions, average='weighted')
    val_f1 = f1_score(y_val_fold, val_predictions, average='weighted')

    # Store validation metrics
    val_scores.append((val_acc, val_prec, val_rec, val_f1))

    # The nested-CV estimate must come from data the search never saw.
    # best_score_ is the inner-CV mean used for selection, so it is not an
    # outer-fold score; use the held-out weighted F1 instead.
    outer_scores.append(val_f1)

# Print the performance of the nested CV
print(f"Nested Cross-Validation F1 Score: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")

# Print validation set performance for each fold
val_scores = np.array(val_scores)
print("Validation Set Performance:")
print(f"Accuracy: {val_scores[:, 0].mean():.2f} ± {val_scores[:, 0].std():.2f}")
print(f"Precision: {val_scores[:, 1].mean():.2f} ± {val_scores[:, 1].std():.2f}")
print(f"Recall: {val_scores[:, 2].mean():.2f} ± {val_scores[:, 2].std():.2f}")
print(f"F1 Score: {val_scores[:, 3].mean():.2f} ± {val_scores[:, 3].std():.2f}")

# Print the best parameters found during hyperparameter tuning
print("Best parameters found during hyperparameter tuning:")
for i, params in enumerate(best_params_list):
    print(f"Fold {i+1}: {params}")

# Pick the final parameters by the inner-CV score (same criterion as before),
# keeping the outer-fold scores purely for performance estimation.
best_params = best_params_list[np.argmax(inner_best_scores)]

# Train best model on the entire training data with sample weights.
# sample_weights was computed from y_train above, so it is reused rather than
# recomputed.
best_model = XGBClassifier(**best_params, tree_method='hist', device='cuda', eval_metric='merror', random_state=42)
best_model.fit(X_train, y_train, sample_weight=sample_weights)

# Evaluate on the test set
test_predictions = best_model.predict(X_test)
print("Test set performance for best performing model:")
print(f"Accuracy: {accuracy_score(y_test, test_predictions):.2f}")
print(f"Precision: {precision_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"Recall: {recall_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"F1 Score: {f1_score(y_test, test_predictions, average='weighted'):.2f}")

# Print the parameters for the best-performing model
print("Parameters for the best performing model:")
print(best_params)

# Save the best performing model
joblib_file = "/content/drive/My Drive/Thesis/Models/Baseline_XGB_class_imbalance.pkl"
joblib.dump(best_model, joblib_file)

Nested Cross-Validation F1 Score: 0.38 ± 0.00
Validation Set Performance:
Accuracy: 0.38 ± 0.00
Precision: 0.41 ± 0.00
Recall: 0.38 ± 0.00
F1 Score: 0.38 ± 0.01
Best parameters found during hyperparameter tuning:
Fold 1: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Fold 2: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2}
Fold 3: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2}
Fold 4: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Fold 5: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2}
Test set performance for best performing model:
Accuracy: 0.39
Precision: 0.42
Recall: 0.39
F1 Score: 0.39
Parameters for the best performing model:
{'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}


['/content/drive/My Drive/Thesis/Models/Baseline_XGB_class_imbalance.pkl']