# Random Forest

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
%matplotlib inline

### Read data

In [None]:
# Load the cleaned dataset; the path is relative to this notebook's location.
dataset_path = '../../../../datasets/parte1/dataset_cleaned.csv'
df = pd.read_csv(dataset_path)

### X and y arrays

In [None]:
# MaxTemp is the regression target; every other column is used as a feature.
y = df['MaxTemp']
X = df.drop(columns=['MaxTemp'])

#### Train Test Split

Now let's split the data into a training set and a testing set. We will train our model on the training set and then use the test set to evaluate the model.

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2023,
)

#### Training 

Using GridSearchCV to find the best hyperparameters

In [None]:
# Base estimator; the fixed seed keeps the forest reproducible between runs.
model = RandomForestRegressor(random_state=2023)

# Candidate hyperparameter values for GridSearchCV to find the best hyperparameters
# (4 * 4 * 3 * 3 = 144 combinations in total).
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

Run the grid search and keep the best estimator

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, using all CPU cores.
# scikit-learn maximises the score, hence the negated mean squared error.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    refit=True,  # retrain the best configuration on the whole training set
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Rebind `model` to the refitted winner so that later cells predict with it.
model = grid_search.best_estimator_

Inspect the best parameters

In [None]:
# Hyperparameter combination selected by the grid search.
grid_search.best_params_

Generate predictions on the test set using the tuned model

In [None]:
# Predict on the held-out test features with the estimator bound to `model`.
predictions = model.predict(X_test)

### Model evaluation

In [None]:
# Predicted vs. actual MaxTemp on the test set.
# Points lying on the dashed diagonal are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)

# Reference line y = x spanning the combined range of actual and predicted values.
lower = min(y_test.min(), predictions.min())
upper = max(y_test.max(), predictions.max())
ax.plot([lower, upper], [lower, upper], linestyle='--', color='red')

ax.set_xlabel('Actual MaxTemp')
ax.set_ylabel('Predicted MaxTemp')
ax.set_title('Random Forest: predicted vs. actual MaxTemp');

In [None]:
# Distribution of the test-set residuals (actual minus predicted).
# The difference keeps the Series name 'MaxTemp', which seaborn would use as the
# x-axis label, so the label is set explicitly to describe what is plotted.
residuals = y_test - predictions
fig, ax = plt.subplots()
sns.histplot(residuals, bins=50, kde=True, ax=ax)
ax.set_xlabel('Residual (actual - predicted MaxTemp)')
ax.set_title('Distribution of test-set residuals');

In [None]:
# Score the test-set predictions; RMSE is derived from the MSE computed once.
r2 = metrics.r2_score(y_test, predictions)
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print('R2:', r2)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)