# Trees Model Regressions

## Packages

In [6]:
import pandas as pd

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import matplotlib.pyplot as plt



## Import Data

In [7]:
df = pd.read_csv('train_df.csv')
df.head()

Unnamed: 0,age,Pedu,traveltime,studytime,failures,famrel,gooutAlc,health,sex,addressInternet,...,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_reputation,G3
0,1.021751,-0.940091,-0.642435,-1.233786,0.895343,-1.053136,1.570863,1.039751,1,1,...,1,0,0,0,1,0,0,0,0,8
1,0.238078,-0.940091,-0.642435,-0.042232,-0.449374,0.062115,1.570863,0.320484,1,1,...,0,0,0,0,0,0,0,1,0,13
2,0.238078,-0.940091,0.791247,-0.042232,-0.449374,0.062115,0.657033,1.039751,0,1,...,1,0,0,0,0,0,1,0,0,12
3,-0.545595,1.02465,-0.642435,-1.233786,-0.449374,-1.053136,-1.170628,1.039751,1,1,...,0,1,0,0,0,1,1,0,0,0
4,1.805423,-1.922461,-0.642435,-0.042232,0.895343,0.062115,-0.256798,-0.398784,0,1,...,0,0,0,0,0,0,0,0,1,10


In [8]:
X = df.drop(['G3'], axis = 1)
y = df[['G3']]

## Decision Tree

In [9]:
model = DecisionTreeRegressor()

mse = cross_val_score(model, X, y, cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y, cv=8, scoring='r2')

print(f"Decision Tree: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

Decision Tree: 	MSE: -45.330268429487184 	R2: -1.1571315375868862


In [10]:
# Define the hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.01, 0.05],
    'ccp_alpha': [0.0, 0.01, 0.05]
}

# Use grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=8, scoring = 'r2')
grid_search.fit(X, y)


grid_search.best_params_


{'ccp_alpha': 0.01,
 'max_depth': 3,
 'max_features': 'sqrt',
 'min_impurity_decrease': 0.01,
 'min_samples_leaf': 2,
 'min_samples_split': 2}

In [11]:
# Define the hyperparameter grid
param_grid = {
    'max_depth': [1,2,3,4],
    'min_samples_split': [4, 5, 6],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.01, 0.05, 0.1, 0.15],
    'ccp_alpha': [0.0, 0.01]
}

# Use grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=8, scoring = 'r2')
grid_search.fit(X, y)


grid_search.best_params_

{'ccp_alpha': 0.01,
 'max_depth': 3,
 'max_features': 'log2',
 'min_impurity_decrease': 0.15,
 'min_samples_leaf': 1,
 'min_samples_split': 4}

In [12]:
model = DecisionTreeRegressor(
        max_depth = 1,
        max_features = None,
        min_samples_leaf = 1,
        min_samples_split = 4,
        min_impurity_decrease = 0.0,
        ccp_alpha= 0.0,)

mse = cross_val_score(model, X, y, cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y, cv=8, scoring='r2')

print(f"Decision Tree: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

Decision Tree: 	MSE: -19.17128234542287 	R2: 0.09058720982513556


Here we try the same model but using feature selected in the backward eliminaiton unsing linear models: here we can see that we obtain the same result as the previous model!

In [13]:
features = ['Pedu', 'studytime', 'failures', 'gooutAlc', 'sex', 'addressInternet', 'famsize', 'schoolsup', 'higher', 'romantic', 'Mjob_health', 'Mjob_services', 'Mjob_teacher']
model = DecisionTreeRegressor(
        max_depth = 1,
        max_features = None,
        min_samples_leaf = 1,
        min_samples_split = 4,
        min_impurity_decrease = 0.0,
        ccp_alpha= 0.0,)

mse = cross_val_score(model, X[features], y, cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X[features], y, cv=8, scoring='r2')

print(f"Decision Tree: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

# Embedded feature selection!

Decision Tree: 	MSE: -19.17128234542287 	R2: 0.09058720982513556


## Random Forest

In [14]:

model = RandomForestRegressor()

mse = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='r2')

print(f"Random Forest: \tMSE: {mse.mean()} \tR2: {r2.mean()}")



Random Forest: 	MSE: -20.692394900921474 	R2: 0.040998844753737884


In [15]:
model = RandomForestRegressor()

param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Use grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring = 'r2')
grid_search.fit(X, y.values.ravel())

grid_search.best_params_

{'max_depth': 40,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50}

In [16]:
model = RandomForestRegressor(
    max_depth = 40,
    max_features = 'log2',
    min_samples_leaf = 2,
    min_samples_split = 10,
    n_estimators = 50
)

mse = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='r2')

print(f"Random Forest: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

Random Forest: 	MSE: -18.893458900066932 	R2: 0.1251259314323543


## Gradient Boosting

In [17]:
model = GradientBoostingRegressor()

mse = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='r2')

print(f"Gradient Boosting: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

Gradient Boosting: 	MSE: -22.268362919222575 	R2: -0.07950414677187043


In [18]:
model = GradientBoostingRegressor()


param_grid = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring = 'r2')
grid_search.fit(X, y.values.ravel())

print(grid_search.best_score_)
print(grid_search.best_params_)

0.12477983017873043
{'learning_rate': 0.01, 'max_depth': 7, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [19]:
model = GradientBoostingRegressor(
    learning_rate = 0.01,
    max_depth = 5,
    max_features = 'log2',
    min_samples_leaf = 4,
    min_samples_split = 2,
    n_estimators = 200
)

mse = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='r2')

print(f"Gradient Boosting: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

Gradient Boosting: 	MSE: -18.891804319690642 	R2: 0.1126343408833183


In [20]:
model = GradientBoostingRegressor(
    learning_rate = 0.01,
    max_depth = 3,
    max_features = 'log2',
    min_samples_leaf = 4,
    min_samples_split = 2,
    n_estimators = 200
)

mse = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y.values.ravel(), cv=8, scoring='r2')

print(f"Gradient Boosting: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

Gradient Boosting: 	MSE: -18.869287101971874 	R2: 0.09672722357002891


## PCA components

In [21]:
import numpy as np

from sklearn.decomposition import PCA

### Full dataset

In [34]:
pca = PCA()
pca_result = pca.fit_transform(X)

In [23]:
X_pca = pca_result[:,:7]

In [25]:
model = GradientBoostingRegressor()


param_grid = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring = 'r2')
grid_search.fit(X_pca, y.values.ravel())

print(grid_search.best_score_)
print(grid_search.best_params_)

0.08435526275296071
{'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}


In [33]:
model = GradientBoostingRegressor(
    learning_rate = 0.01,
    max_depth = 3,
    max_features = 'sqrt',
    min_samples_leaf = 4,
    min_samples_split = 10,
    n_estimators = 100
)

mse = cross_val_score(model, X_pca, y.values.ravel(), cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X_pca, y.values.ravel(), cv=8, scoring='r2')

print(f"Gradient Boosting: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

Gradient Boosting: 	MSE: -20.29395440511029 	R2: 0.05630056858693097
