# Ensemble Methods, Bagging Regressor and Random Forest

## Import Libraries

In [None]:
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from keras.utils import plot_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
import numpy as np

## Read the Dataset

In [None]:

def ReadFile(s):
    """Load the CSV at ``s`` into a DataFrame indexed by the ``Id`` column.

    The first 7 lines of the file are skipped and the columns are named
    explicitly: ``Id``, ten inputs ``i1``..``i10`` and three targets
    ``Y1``..``Y3``.
    """
    input_cols = ['i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'i9', 'i10']
    target_cols = ['Y1', 'Y2', 'Y3']
    header = ['Id'] + input_cols + target_cols
    frame = pd.read_csv(s, sep=",", names=header, skiprows=7)
    # Return a new frame indexed by Id instead of mutating in place.
    return frame.set_index('Id')


In [None]:
# Load the training CSV; the path is relative to the current working directory.
data=ReadFile("Dataset_Cup/ML-CUP23-TR.csv")

In [None]:
# With 'Id' moved to the index by ReadFile, positions 0-9 are the inputs i1..i10
# and positions 10-12 are the targets Y1..Y3.
featureTrain=data.iloc[:,0:10]
TargetTrain=data.iloc[:,10:13]


## Function to compute the mean Euclidean error (MEE) score

In [None]:
# Definition of the mean Euclidean error (MEE) metric
def mean_euclidean_error(y_true, y_pred):
    """Return the mean Euclidean error (MEE) between targets and predictions.

    For each sample the Euclidean (L2) norm of ``y_true - y_pred`` is taken
    across the output dimensions; the result is the mean of those norms.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_outputs) or (n_samples,)
        Ground-truth and predicted targets. Lists, NumPy arrays and pandas
        objects are accepted. 1-D input is treated as a single output per
        sample.

    Returns
    -------
    float
        Mean of the per-sample Euclidean distances.
    """
    # Convert to plain arrays first: subtracting pandas objects directly would
    # align on index labels (yielding NaN for mismatched indexes), and plain
    # lists do not support subtraction at all.
    diff = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    if diff.ndim == 1:
        # Single-output case: one column so that axis=1 is valid.
        diff = diff[:, np.newaxis]
    errors = np.linalg.norm(diff, axis=1)
    return np.mean(errors)
# Scorer wrapper used for model selection. greater_is_better=False marks the
# metric as a loss, so scikit-learn reports it sign-flipped (scores are <= 0
# and closer to 0 is better).
custom_scoring = make_scorer(mean_euclidean_error, greater_is_better=False)

In [None]:
# Hold out 25% of the rows as an internal test set; the fixed seed keeps the
# split identical across runs. `.values` passes plain NumPy arrays downstream.
X_train, X_test, y_train, y_test = train_test_split(featureTrain.values, TargetTrain.values, test_size=0.25,random_state=42)

# Bagging Regressor

In [None]:
def get_model(estimator):
    """Build an unfitted BaggingRegressor around ``estimator``.

    All other BaggingRegressor settings are left at the library defaults so
    that a hyper-parameter search can override them.
    """
    return BaggingRegressor(estimator=estimator)

### Try SVM with the previously evaluated parameters

Use the result of the grid search on SVM.

In [None]:

# Base learner for bagging: an RBF-kernel SVR (C=500) wrapped in
# MultiOutputRegressor, which fits one regressor per target column.
Estimator = MultiOutputRegressor(SVR(C= 500,kernel= 'rbf'))

In [None]:
# Search space for the bagging ensemble built on top of `Estimator`.
parameters = dict(
    n_estimators=[50, 100, 150],  # TODO: retry with higher n_estimators
    max_samples=[0.2, 0.5, 0.8],
    max_features=[0.1, 0.5, 0.8],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

# Exhaustive 5-fold search scored with the (sign-flipped) mean Euclidean error.
clf = GridSearchCV(
    estimator=get_model(estimator=Estimator),
    param_grid=parameters,
    scoring=custom_scoring,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_result = clf.fit(X_train, y_train)

# summarize results
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")


In [None]:
# Bagging ensemble with hard-coded hyper-parameters (presumably the best
# configuration reported by the grid search above — confirm).
# NOTE(review): no random_state is passed, so results can vary between runs.
gc_r=BaggingRegressor(estimator=Estimator,n_estimators=50,max_features=0.8,max_samples=0.8,bootstrap=False,bootstrap_features=False)

In [None]:
# Split the training portion again: 75% to fit the model, 25% for validation.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.25,random_state=42)

In [None]:
# Fit the bagging ensemble on the reduced training split.
result=gc_r.fit(X_tr,y_tr)

In [None]:
# Bare expression: in a notebook this displays the value returned by fit().
result

In [None]:
# Errors on the same data the bagging model was fitted on (training split).
y_pred=gc_r.predict(X_tr)
MSE=mean_squared_error(y_tr,y_pred)
MEE=mean_euclidean_error(y_tr,y_pred)

In [None]:
# Report the training-split MSE and MEE currently held in the globals.
print("MSE train: ", MSE)
print("MEE train: ", MEE)

In [None]:
# Errors on the validation split, which was not used to fit gc_r.
y_pred=gc_r.predict(X_val)
MSE=mean_squared_error(y_val,y_pred)
MEE=mean_euclidean_error(y_val,y_pred)

In [None]:
# Report the validation-split MSE and MEE currently held in the globals.
print("MSE validation: ", MSE)
print("MEE validation: ", MEE)

In [None]:
# Errors on the internal test set held out by the first train_test_split.
y_pred=gc_r.predict(X_test)
MSE=mean_squared_error(y_test,y_pred)
MEE=mean_euclidean_error(y_test,y_pred)

In [None]:
# Report the internal-test MSE and MEE currently held in the globals.
print("MSE test: ", MSE)
print("MEE test: ", MEE)

# RandomForestRegressor

### Using RandomizedSearchCV because there are too many settings

In [None]:
# Hyper-parameter space sampled by the randomized search.
parameters={
    'n_estimators':[100,200,250],
    'criterion':['squared_error','absolute_error'],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4],
    'max_depth':[10,40,80],
    # 1.0 (use all features) replaces the removed 'auto' option, which meant
    # the same thing for regressors.
    'max_features':[1.0,'sqrt'],
    'bootstrap':[True,False],
}
# The criterion is left at its default here: the removed alias 'mse' is no
# longer a valid value, and the search overrides the criterion for every
# candidate anyway.
rf=RandomForestRegressor(random_state=42)
# random_state on the search makes the 150 sampled candidates reproducible.
# Scoring is negated MSE, so best_score_ is <= 0 and closer to 0 is better.
clf=RandomizedSearchCV(rf,parameters,cv=5,n_jobs=-1,scoring='neg_mean_squared_error',verbose=4,n_iter=150,random_state=42)
grid_result=clf.fit(X_train,y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


### Result of the Estimation
Fitting 5 folds for each of 150 candidates, totalling 750 fits
Best: -3.697801 using {'n_estimators': 250, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'criterion': 'absolute_error', 'bootstrap': False}


Increasing the number of estimators might yield better results.

In [None]:
# Random forest rebuilt with the best configuration reported by the randomized
# search. random_state matches the estimator used during the search so the
# reported metrics are reproducible.
rf_best = RandomForestRegressor(
    n_estimators=250,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=40,
    criterion='absolute_error',
    bootstrap=False,
    random_state=42,
)

In [None]:
# Same 75/25 train/validation split of the training portion (same seed, so the
# rows are identical to the split used for the bagging model).
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.25,random_state=42)

In [None]:
# Fit the random forest, then measure its errors on the data it was fitted on.
rf_best.fit(X_tr,y_tr)
y_pred=rf_best.predict(X_tr)
MSE=mean_squared_error(y_tr,y_pred)
MEE=mean_euclidean_error(y_tr,y_pred)

In [None]:
# Report the training-split MSE and MEE currently held in the globals.
print("MSE train: ", MSE)
print("MEE train: ", MEE)

In [None]:
# Errors on the validation split, which was not used to fit rf_best.
y_pred=rf_best.predict(X_val)
MSE=mean_squared_error(y_val,y_pred)
MEE=mean_euclidean_error(y_val,y_pred)

In [None]:
# Report the validation-split MSE and MEE currently held in the globals.
print("MSE validation: ", MSE)
print("MEE validation: ", MEE)

In [None]:
# Errors on the internal test set held out by the first train_test_split.
y_pred=rf_best.predict(X_test)
MSE=mean_squared_error(y_test,y_pred)
MEE=mean_euclidean_error(y_test,y_pred)

In [None]:
# Report the internal-test MSE and MEE currently held in the globals.
print("MSE test: ", MSE)
print("MEE test: ", MEE)