In [1]:
# Google colab version
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load df Google Colab (feature engineered, encoded, scaled/unscaled)
import pandas as pd
df_sampled_unscaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_unscaled.csv")
df_sampled_scaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_scaled.csv")

In [None]:
import pandas as pd
df_final = pd.read_csv("/content/drive/My Drive/Thesis/Data/sample_df.csv")

# RF, HPT, (-2 features) -> Accuracy 0.46 (wrong data split)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [None]:
# Separate features and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Perform stratified sampling for 40% of the data for increased efficiency
X_sampled, _, y_sampled, _ = train_test_split(X, y, test_size=0.6, stratify=y)


In [None]:
def evaluate_performance(y_true, y_pred):
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='macro'):.2f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='macro'):.2f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='macro'):.2f}")


In [None]:
# Define the parameter grid for tuning
param_dist = {
    'n_estimators': [10, 50, 100],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_dist, n_iter=10,
                                   cv=inner_cv, scoring='accuracy',
                                   random_state=42)

# Perform nested cross-validation
nested_scores = cross_val_score(random_search, X_sampled, y_sampled,
                                cv=outer_cv, scoring='accuracy')

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {nested_scores.mean():.2f} ± {nested_scores.std():.2f}")

Nested Cross-Validation Accuracy: 0.46 ± 0.00


In [None]:
# Split the data into training, validation, and test sets (60%, 20%, 20%)
X_train, X_temp, y_train, y_temp = train_test_split(X_sampled, y_sampled, test_size=0.4, stratify=y_sampled)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)

# Fit the best model found by the random search on the full training set
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Evaluate the best model on the validation set
val_predictions = best_model.predict(X_val)
print("Validation Set Performance:")
evaluate_performance(y_val, val_predictions)

# Evaluate the best model on the test set
test_predictions = best_model.predict(X_test)
print("Test Set Performance:")
evaluate_performance(y_test, test_predictions)


Validation Set Performance:
Accuracy: 0.46
Precision: 0.47
Recall: 0.38
F1 Score: 0.40
Test Set Performance:
Accuracy: 0.45
Precision: 0.47
Recall: 0.38
F1 Score: 0.40


# RF, HPT, (-2 features) -> Accuracy 0.46 (no val scores)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [None]:
# Load in the dataframe
df_final = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_unscaled.csv")
df_final = df_final.drop(columns=['Dev_movie_avg',	'Total_ratings_per_user'])

In [None]:
# Separate features and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

In [None]:
# Split sampled data into 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)


In [None]:
# Evaluation function
def evaluate_performance(y_true, y_pred):
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='macro'):.2f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='macro'):.2f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='macro'):.2f}")


In [None]:
# Define the parameter grid for tuning

param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_dist, n_iter=15,
                                   cv=inner_cv, scoring='accuracy',
                                   random_state=42)

# Perform nested cross-validation and store best parameters for each fold
nested_scores = []
best_params = []
best_models = []

for train_idx, test_idx in outer_cv.split(X_train, y_train):
    random_search.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    nested_scores.append(random_search.best_score_)
    best_params.append(random_search.best_params_)
    best_model = random_search.best_estimator_
    best_models.append(best_model)

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {np.mean(nested_scores):.2f} ± {np.std(nested_scores):.2f}")

# Print the best parameters found for each fold
print("Best parameters for each fold:")
for params in best_params:
    print(params)

# Use the best model found in the outer loop
final_best_model = best_models[-1]

# Evaluate the best model on the test set
test_predictions = final_best_model.predict(X_test)
print("Test Set Performance:")
evaluate_performance(y_test, test_predictions)

In [None]:
# Evaluate the best model on the test set
test_predictions = final_best_model.predict(X_test)
print("Test Set Performance:")
evaluate_performance(y_test, test_predictions)

Test Set Performance:
Accuracy: 0.46
Precision: 0.48
Recall: 0.37
F1 Score: 0.39


#  29/10 Random Forest with hyperparameter tuning (new sample_df)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Define feature and target
X = df_final.drop('Rating', axis=1)
y = df_final['Rating']

In [None]:
# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Define the parameter grid for hyperparameter optimization
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_dist, n_iter=15,
                                   cv=inner_cv, scoring='accuracy',
                                   random_state=42)

# Perform nested cross-validation
outer_scores = []
val_scores = []

for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the model on the training fold
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Evaluate on the validation set
    val_predictions = best_model.predict(X_val_fold)
    val_acc = accuracy_score(y_val_fold, val_predictions)
    val_prec = precision_score(y_val_fold, val_predictions, average='weighted')
    val_rec = recall_score(y_val_fold, val_predictions, average='weighted')
    val_f1 = f1_score(y_val_fold, val_predictions, average='weighted')

    # Append validation metrics
    val_scores.append((val_acc, val_prec, val_rec, val_f1))

    # Store the outer fold score (accuracy)
    outer_scores.append(random_search.best_score_)

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")

# Print validation set performance for each fold
val_scores = np.array(val_scores)
print("Validation Set Performance:")
print(f"Accuracy: {val_scores[:, 0].mean():.2f} ± {val_scores[:, 0].std():.2f}")
print(f"Precision: {val_scores[:, 1].mean():.2f} ± {val_scores[:, 1].std():.2f}")
print(f"Recall: {val_scores[:, 2].mean():.2f} ± {val_scores[:, 2].std():.2f}")
print(f"F1 Score: {val_scores[:, 3].mean():.2f} ± {val_scores[:, 3].std():.2f}")

# Fit the best model found on the entire training set
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Evaluate on the test set
test_predictions = best_model.predict(X_test)
print("Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, test_predictions):.2f}")
print(f"Precision: {precision_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"Recall: {recall_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"F1 Score: {f1_score(y_test, test_predictions, average='weighted'):.2f}")


  _data = np.array(data, dtype=dtype, copy=copy,


Nested Cross-Validation Accuracy: 0.41 ± 0.00
Validation Set Performance:
Accuracy: 0.41 ± 0.00
Precision: 0.42 ± 0.00
Recall: 0.41 ± 0.00
F1 Score: 0.38 ± 0.00
Test Set Performance:
Accuracy: 0.42
Precision: 0.43
Recall: 0.42
F1 Score: 0.39


# **30/10 Random Forest with hyperparameter tuning, sample_df_3010**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
import pandas as pd
df_final = pd.read_csv("/content/drive/My Drive/Thesis/Data/sample_df_3010.csv")

In [None]:
# Define feature and target
X = df_final.drop('Rating', axis=1)
y = df_final['Rating']

In [None]:
# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

In [None]:
# Define the parameter grid for hyperparameter optimization
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_dist, n_iter=15,
                                   cv=inner_cv, scoring='accuracy',
                                   random_state=42)



# Perform nested cross-validation
outer_scores = []
val_scores = []

for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the model on the training fold
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Evaluate on the validation set
    val_predictions = best_model.predict(X_val_fold)
    val_acc = accuracy_score(y_val_fold, val_predictions)
    val_prec = precision_score(y_val_fold, val_predictions, average='weighted')
    val_rec = recall_score(y_val_fold, val_predictions, average='weighted')
    val_f1 = f1_score(y_val_fold, val_predictions, average='weighted')

    # Append validation metrics
    val_scores.append((val_acc, val_prec, val_rec, val_f1))

    # Store the outer fold score (accuracy)
    outer_scores.append(random_search.best_score_)

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")

# Print validation set performance for each fold
val_scores = np.array(val_scores)
print("Validation Set Performance:")
print(f"Accuracy: {val_scores[:, 0].mean():.2f} ± {val_scores[:, 0].std():.2f}")
print(f"Precision: {val_scores[:, 1].mean():.2f} ± {val_scores[:, 1].std():.2f}")
print(f"Recall: {val_scores[:, 2].mean():.2f} ± {val_scores[:, 2].std():.2f}")
print(f"F1 Score: {val_scores[:, 3].mean():.2f} ± {val_scores[:, 3].std():.2f}")

# Fit the best model found on the entire training set
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Evaluate on the test set
test_predictions = best_model.predict(X_test)
print("Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, test_predictions):.2f}")
print(f"Precision: {precision_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"Recall: {recall_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"F1 Score: {f1_score(y_test, test_predictions, average='weighted'):.2f}")


  _data = np.array(data, dtype=dtype, copy=copy,


Nested Cross-Validation Accuracy: 0.40 ± 0.00
Validation Set Performance:
Accuracy: 0.41 ± 0.00
Precision: 0.42 ± 0.00
Recall: 0.41 ± 0.00
F1 Score: 0.38 ± 0.00
Test Set Performance:
Accuracy: 0.41
Precision: 0.42
Recall: 0.41
F1 Score: 0.38


# **RF with all instances (df_final_unscaled) 31/10**

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
import pandas as pd
df_final = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_final_unscaled.csv")

In [3]:
df_final.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Age,Release_year,Avg_rating_user,Avg_rating_movie,Dev_movie_avg,Avg_dev_movie_avg,Total_ratings_per_user,Female,Male,Academic/educator,Artist,Clerical/admin,College/grad student,Customer service,Doctor/health care,Executive/managerial,Farmer,Homemaker,K-12 student,Lawyer,Other or not specified,Programmer,Retired,Sales/marketing,Scientist,Self-employed,Technician/engineer,Tradesman/craftsman,Unemployed,Writer,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Favourite_Action,Favourite_Adventure,Favourite_Animation,Favourite_Children's,Favourite_Comedy,Favourite_Crime,Favourite_Documentary,Favourite_Drama,Favourite_Fantasy,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western
0,1,1193,5,2000,12,31,23,0,1975,4.19,4.39,1.0,0.3,53,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,2,1193,5,2000,12,31,22,6,1975,3.71,4.39,1.0,-0.06,129,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,12,1193,4,2000,12,31,0,2,1975,3.83,4.39,0.0,-0.04,23,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,15,1193,4,2000,12,30,19,2,1975,3.32,4.39,0.0,-0.24,201,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,17,1193,5,2000,12,30,7,5,1975,4.08,4.39,1.0,0.4,211,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False


In [5]:
df_final = df_final.drop(columns=['Avg_rating_user',	'Avg_rating_movie', 'Dev_movie_avg', 'Avg_dev_movie_avg'])

In [6]:
df_final.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Age,Release_year,Total_ratings_per_user,Female,Male,Academic/educator,Artist,Clerical/admin,College/grad student,Customer service,Doctor/health care,Executive/managerial,Farmer,Homemaker,K-12 student,Lawyer,Other or not specified,Programmer,Retired,Sales/marketing,Scientist,Self-employed,Technician/engineer,Tradesman/craftsman,Unemployed,Writer,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Favourite_Action,Favourite_Adventure,Favourite_Animation,Favourite_Children's,Favourite_Comedy,Favourite_Crime,Favourite_Documentary,Favourite_Drama,Favourite_Fantasy,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western
0,1,1193,5,2000,12,31,23,0,1975,53,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,2,1193,5,2000,12,31,22,6,1975,129,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,12,1193,4,2000,12,31,0,2,1975,23,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,15,1193,4,2000,12,30,19,2,1975,201,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,17,1193,5,2000,12,30,7,5,1975,211,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False


In [7]:
# Define feature and target
X = df_final.drop('Rating', axis=1)
y = df_final['Rating']

In [11]:
# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)

In [12]:
# Define the parameter grid for hyperparameter optimization
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_dist, n_iter=15,
                                   cv=inner_cv, scoring='accuracy',
                                   random_state=42)



# Perform nested cross-validation
outer_scores = []
val_scores = []

for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the model on the training fold
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Evaluate on the validation set
    val_predictions = best_model.predict(X_val_fold)
    val_acc = accuracy_score(y_val_fold, val_predictions)
    val_prec = precision_score(y_val_fold, val_predictions, average='weighted')
    val_rec = recall_score(y_val_fold, val_predictions, average='weighted')
    val_f1 = f1_score(y_val_fold, val_predictions, average='weighted')

    # Append validation metrics
    val_scores.append((val_acc, val_prec, val_rec, val_f1))

    # Store the outer fold score (accuracy)
    outer_scores.append(random_search.best_score_)

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")

# Print validation set performance for each fold
val_scores = np.array(val_scores)
print("Validation Set Performance:")
print(f"Accuracy: {val_scores[:, 0].mean():.2f} ± {val_scores[:, 0].std():.2f}")
print(f"Precision: {val_scores[:, 1].mean():.2f} ± {val_scores[:, 1].std():.2f}")
print(f"Recall: {val_scores[:, 2].mean():.2f} ± {val_scores[:, 2].std():.2f}")
print(f"F1 Score: {val_scores[:, 3].mean():.2f} ± {val_scores[:, 3].std():.2f}")

# Fit the best model found on the entire training set
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Evaluate on the test set
test_predictions = best_model.predict(X_test)
print("Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, test_predictions):.2f}")
print(f"Precision: {precision_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"Recall: {recall_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"F1 Score: {f1_score(y_test, test_predictions, average='weighted'):.2f}")


#  27/10 Random Forest with hyperparameter tuning (-2 features) -> Accuracy ?

Nog runnen zodat het dezelfde code is!!!

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load df Google Colab (feature engineered, encoded, scaled/unscaled)
df_final = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_unscaled.csv")
df_final = df_final.drop(columns=['Dev_movie_avg',	'Total_ratings_per_user'])

In [None]:
# Define feature and target
X = df_final.drop('Rating', axis=1)
y = df_final['Rating']

In [None]:
# Split data 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Define the parameter grid for hyperparameter optimization
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                   param_distributions=param_dist, n_iter=15,
                                   cv=inner_cv, scoring='accuracy',
                                   random_state=42)

# Perform nested cross-validation
outer_scores = []
val_scores = []

for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Split the data into training and validation sets
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Fit the model on the training fold
    random_search.fit(X_train_fold, y_train_fold)
    best_model = random_search.best_estimator_

    # Evaluate on the validation set
    val_predictions = best_model.predict(X_val_fold)
    val_acc = accuracy_score(y_val_fold, val_predictions)
    val_prec = precision_score(y_val_fold, val_predictions, average='weighted')
    val_rec = recall_score(y_val_fold, val_predictions, average='weighted')
    val_f1 = f1_score(y_val_fold, val_predictions, average='weighted')

    # Append validation metrics
    val_scores.append((val_acc, val_prec, val_rec, val_f1))

    # Store the outer fold score (accuracy)
    outer_scores.append(random_search.best_score_)

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")

# Print validation set performance for each fold
val_scores = np.array(val_scores)
print("Validation Set Performance:")
print(f"Accuracy: {val_scores[:, 0].mean():.2f} ± {val_scores[:, 0].std():.2f}")
print(f"Precision: {val_scores[:, 1].mean():.2f} ± {val_scores[:, 1].std():.2f}")
print(f"Recall: {val_scores[:, 2].mean():.2f} ± {val_scores[:, 2].std():.2f}")
print(f"F1 Score: {val_scores[:, 3].mean():.2f} ± {val_scores[:, 3].std():.2f}")

# Fit the best model found on the entire training set
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

# Evaluate on the test set
test_predictions = best_model.predict(X_test)
print("Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, test_predictions):.2f}")
print(f"Precision: {precision_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"Recall: {recall_score(y_test, test_predictions, average='weighted'):.2f}")
print(f"F1 Score: {f1_score(y_test, test_predictions, average='weighted'):.2f}")

Nested Cross-Validation Accuracy: 0.46 ± 0.00
Validation Set Performance:
Accuracy: 0.46 ± 0.00
Precision: 0.47 ± 0.00
Recall: 0.46 ± 0.00
F1 Score: 0.44 ± 0.00
Test Set Performance:
Accuracy: 0.46
Precision: 0.47
Recall: 0.46
F1 Score: 0.45


# Gradient boosting, no hyperparameter tuning (old) -> Accuracy 0.37

In [None]:
# Load df Google Colab (feature engineered, encoded, scaled/unscaled)
import pandas as pd
df_sampled_unscaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_unscaled.csv")
df_sampled_scaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_scaled.csv")

In [None]:
df_final = df_sampled_unscaled.copy()
df_final = df_final.drop(columns=['Dev_movie_avg',	'Total_ratings_per_user'])

In [None]:
# Gradient boosting without hyperparameter tuning
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming df_final is your dataframe and 'Rating' is the target variable
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

# Splitting the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the Gradient Boosting model
gb_baseline = GradientBoostingClassifier(random_state=42)
gb_baseline.fit(X_train, y_train)

# Making predictions
y_pred = gb_baseline.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.3740
Precision: 0.3846
Recall: 0.3740
F1 Score: 0.2983


# Gradient boosting with hyperparameter tuning (-2 features) -> Accuracy 0.46


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [None]:
# Load df Google Colab (feature engineered, encoded, scaled/unscaled)
df_sampled_unscaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_unscaled.csv")
df_sampled_scaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_scaled.csv")

In [None]:
df_final = df_sampled_unscaled.copy()
df_final = df_final.drop(columns=['Dev_movie_avg',	'Total_ratings_per_user'])

In [None]:
# Separate features and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

In [None]:
# Split data into 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [None]:
# Define evaluation function
def evaluate_performance(y_true, y_pred):
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='macro'):.2f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='macro'):.2f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='macro'):.2f}")

In [None]:
from joblib import Parallel, delayed

# Define the parameter grid for tuning
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 7, 10, 20]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(),
    param_distributions=param_dist,
    n_iter=15,
    cv=inner_cv,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1  # Utilize all available cores for parallel processing
)

# Perform nested cross-validation and store best parameters for each fold
nested_scores = []
best_params = []
best_models = []

def fit_and_evaluate(train_idx, test_idx):
    random_search.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    best_model = random_search.best_estimator_
    return random_search.best_score_, random_search.best_params_, best_model

results = Parallel(n_jobs=-1)(delayed(fit_and_evaluate)(train_idx, test_idx) for train_idx, test_idx in outer_cv.split(X_train, y_train))

for score, params, model in results:
    nested_scores.append(score)
    best_params.append(params)
    best_models.append(model)

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {np.mean(nested_scores):.2f} ± {np.std(nested_scores):.2f}")

# Print the best parameters found for each fold
print("Best parameters for each fold:")
for params in best_params:
    print(params)

# Use the best model found in the outer loop
final_best_model = best_models[-1]

# Evaluate the best model on the test set
test_predictions = final_best_model.predict(X_test)
print("Test Set Performance:")
evaluate_performance(y_test, test_predictions)


Nested Cross-Validation Accuracy: 0.46 ± 0.00
Best parameters for each fold:
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.1}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.1}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.1}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.1}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.1}
Test Set Performance:
Accuracy: 0.46
Precision: 0.46
Recall: 0.38
F1 Score: 0.40


# Gradient Boosting with hyperparameter tuning (all features) -> Accuracy 1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [None]:
# Load df Google Colab (feature engineered, encoded, scaled/unscaled)
df_sampled_unscaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_unscaled.csv")
df_sampled_scaled = pd.read_csv("/content/drive/My Drive/Thesis/Data/df_sampled_scaled.csv")

In [None]:
df_final = df_sampled_unscaled.copy()

In [None]:
# Separate features and target
X = df_final.drop(columns=['Rating'])
y = df_final['Rating']

In [None]:
# Split data into 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [None]:
# Define evaluation function
def evaluate_performance(y_true, y_pred):
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='macro'):.2f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='macro'):.2f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='macro'):.2f}")

In [None]:
from joblib import Parallel, delayed

# Define the parameter grid for tuning
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 7, 10, 20]
}

# Define the inner and outer cross-validation strategies
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(),
    param_distributions=param_dist,
    n_iter=15,
    cv=inner_cv,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1  # Utilize all available cores for parallel processing
)

# Perform nested cross-validation and store best parameters for each fold
nested_scores = []
best_params = []
best_models = []

def fit_and_evaluate(train_idx, test_idx):
    random_search.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    best_model = random_search.best_estimator_
    return random_search.best_score_, random_search.best_params_, best_model

results = Parallel(n_jobs=-1)(delayed(fit_and_evaluate)(train_idx, test_idx) for train_idx, test_idx in outer_cv.split(X_train, y_train))

for score, params, model in results:
    nested_scores.append(score)
    best_params.append(params)
    best_models.append(model)

# Print the performance of the nested cross-validation
print(f"Nested Cross-Validation Accuracy: {np.mean(nested_scores):.2f} ± {np.std(nested_scores):.2f}")

# Print the best parameters found for each fold
print("Best parameters for each fold:")
for params in best_params:
    print(params)

# Use the best model found in the outer loop
final_best_model = best_models[-1]

# Evaluate the best model on the test set
test_predictions = final_best_model.predict(X_test)
print("Test Set Performance:")
evaluate_performance(y_test, test_predictions)

Nested Cross-Validation Accuracy: 1.00 ± 0.00
Best parameters for each fold:
{'n_estimators': 150, 'max_depth': 10, 'learning_rate': 0.2}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.2}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.2}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.2}
{'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.2}
Test Set Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
