# Hyperparameter tuning

In [51]:
import itertools
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [11]:
# Read the dataset; 'engagement.rate' is the regression target and every
# other column is used as a predictor.
data = pd.read_csv('../../datasets/data-stemmed-R.csv')
Y = data['engagement.rate']

# Standardise the predictors with a scaler fit on the full dataset, then wrap
# the resulting array back into a DataFrame (the array carries no labels, so
# the columns get default integer names).
# NOTE(review): fitting the scaler on all rows before cross-validation lets
# validation-fold statistics influence the training folds.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(data.drop(columns=['engagement.rate'])))


**DISCLAIMER:** we want a robust estimate of the optimal hyperparameters, so we tune them with cross-validation (CV).
Since our future models will be trained on training sets of size $\frac{9}{10}|D|$, we can run the hyperparameter tuning with 10-fold CV on the entire dataset: each CV training split then has the same proportion, $\frac{9}{10}|D|$.

# Decision tree

In [65]:
# Tune the minimum number of samples required to split an internal node.
param_grid = {'min_samples_split': [2, 11, 16, 26, 31, 51, 101, 151, 201, 501]}

# Fix the seed: scikit-learn permutes the candidate features at every split,
# so ties between equally good splits would otherwise be broken differently
# on each run and the selected hyperparameter would not be reproducible.
dt = DecisionTreeRegressor(random_state=42)

# 10-fold cross-validated grid search, ranking candidates by negated MAE.
grid_search = GridSearchCV(dt, param_grid, cv=10, scoring="neg_mean_absolute_error")
grid_search.fit(X, Y)

# Keep the winning configuration so it can be reported afterwards.
dt_best_params = grid_search.best_params_

In [69]:
# Report the decision-tree setting selected by the grid search.
print(
    f"Decision trees Best hyperparameters: {dt_best_params}"
)

Decision trees Best hyperparameters: {'min_samples_split': 101}


# Knn

In [71]:
# Candidate neighbourhood sizes; the `knn__` prefix routes each value to the
# "knn" step of the pipeline built below.
param_grid = {'knn__n_neighbors': [5, 10, 15, 20, 50, 100]}

# KNN is distance-based, so it is sensitive to feature scaling. Scaling inside
# a Pipeline makes GridSearchCV re-fit the scaler on the training part of
# every fold, so no statistics from the validation fold leak into the model.
knn = KNeighborsRegressor()
knn_pipeline = Pipeline([('scaler', StandardScaler()), ('knn', knn)])

# 10-fold cross-validated grid search, ranking candidates by negated MAE.
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=10, scoring="neg_mean_absolute_error")
grid_search.fit(X, Y)

# Strip the pipeline step prefix so the result keeps the plain
# {'n_neighbors': ...} shape.
knn_best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}

In [72]:
# Report the number of neighbours selected by the grid search.
print(
    f"KNN Best hyperparameters: {knn_best_params}"
)

KNN Best hyperparameters: {'n_neighbors': 50}


# Random Forest

In [None]:
# Search space for the random forest.
n_estimators = [100, 250, 500]
min_samples_split = [2, 11, 26, 51]  # corresponds to 1 + n_min in Medvet's notation
# Two candidates for the number of features tried at each split:
# floor(sqrt(p)) and p // 3, where p is the number of predictors.
max_features = [math.floor(math.sqrt(X.shape[1])), X.shape[1] // 3]

param_grid = {
    'n_estimators': n_estimators,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Create the RandomForestRegressor. The seed is fixed because bootstrap
# sampling and per-split feature sub-sampling are random; without it the
# selected hyperparameters can change from run to run.
rf = RandomForestRegressor(random_state=42)

# 10-fold cross-validated grid search, ranking candidates by negated MAE.
# The grid has 3 * 4 * 2 = 24 candidates, i.e. 240 forest fits, so the fits
# are spread over all available cores (this does not affect the scores).
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=10,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)
grid_search.fit(X, Y)

# Keep the winning configuration so it can be reported afterwards.
rf_best_params = grid_search.best_params_

In [62]:
# Report the random-forest configuration selected by the grid search.
print(
    f"Random forest Best hyperparameters: {rf_best_params}"
)

Random forest Best hyperparameters: {'max_features': 94, 'min_samples_split': 2, 'n_estimators': 500}


# SVR

In [75]:
# Candidate kernels; the `svr__` prefix routes each value to the "svr" step of
# the pipeline built below.
param_grid = {'svr__kernel': ['linear', 'poly', 'rbf']}

# SVR is sensitive to feature scaling. Scaling inside a Pipeline makes
# GridSearchCV re-fit the scaler on the training part of every fold, so no
# statistics from the validation fold leak into the model.
sv = SVR()
sv_pipeline = Pipeline([('scaler', StandardScaler()), ('svr', sv)])

# 10-fold cross-validated grid search, ranking candidates by negated MAE.
grid_search = GridSearchCV(sv_pipeline, param_grid, cv=10, scoring="neg_mean_absolute_error")
grid_search.fit(X, Y)

# Strip the pipeline step prefix so the result keeps the plain
# {'kernel': ...} shape.
sv_best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}

In [76]:
# Report the SVR kernel selected by the grid search.
print(
    f"SVR best kernel: {sv_best_params}"
)

SVR best kernel: {'kernel': 'rbf'}
