# Decision Tree and Random Forest Regressors

In [22]:
# Standard library
from pathlib import Path

# Data wrangling
import pandas as pd
import numpy as np

# Machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# object import/export
from joblib import dump, load

In [23]:
# Locations of the cached artifacts, grouped by what they hold.

# Complete fitted grid-search objects (one per model family).
DT_SEARCH = '../../search_results/dt_grid_search.joblib'
RF_SEARCH = '../../search_results/rf_grid_search.joblib'

# Best estimator extracted from each grid search.
DT_BEST = '../../classifiers/best_dt.joblib'
RF_BEST = '../../classifiers/best_rf.joblib'

First, we load the dataset and separate it into the feature matrix `X` (every column except `price`) and the target `Y` (the `price` column).

In [24]:
# Read the prepared dataset and discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- confirm).
data = pd.read_csv('../../data/ready.csv').drop(columns=['Unnamed: 0'])

# Target vector: the price to predict.
Y = data['price']
# Feature matrix: everything except the target.
X = data.drop(columns='price')

display(X)

Unnamed: 0,departure_time,stops,arrival_time,class,duration,days_left,Bangalore_source,Chennai_source,Delhi_source,Hyderabad_source,...,Delhi_dest,Hyderabad_dest,Kolkata_dest,Mumbai_dest,AirAsia,Air_India,GO_FIRST,Indigo,SpiceJet,Vistara
0,3.0,0.0,4.0,0.0,2.17,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,2.33,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,2.17,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,2.0,0.0,2.25,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,0.0,2.33,1,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300148,1.0,1.0,3.0,1.0,10.08,49,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
300149,2.0,1.0,4.0,1.0,10.42,49,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
300150,0.0,1.0,4.0,1.0,13.83,49,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
300151,0.0,1.0,3.0,1.0,10.00,49,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Then, we split the dataset into a training set (70%) and a test set (30%).

In [25]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Report the dimensions of each partition.
for label, part in (
    ("X_train shape: ", X_train),
    ("Y_train shape: ", Y_train),
    ("X_test shape: ", X_test),
    ("Y_test shape: ", Y_test),
):
    print(label, part.shape)

X_train shape:  (210107, 24)
Y_train shape:  (210107,)
X_test shape:  (90046, 24)
Y_test shape:  (90046,)


## 1. Decision Tree Regressor

In [26]:
# Base estimator handed to the grid search.
# A fixed seed is required for reproducibility: scikit-learn randomly permutes
# the features at every split, so ties between equally good splits would
# otherwise be broken differently on each run.
dt = DecisionTreeRegressor(random_state=42)

# Hyper-parameter grid: four split criteria combined with an unbounded tree
# (None) and two depth caps.
param_grid = {
    'criterion' : ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    'max_depth' : [ None , 10 , 20 ]
}

In [27]:
# Reuse the cached grid search when it exists; otherwise run it and cache it.
try:
    # NOTE: joblib.load unpickles arbitrary objects -- only load files that
    # were produced by this notebook.
    search_dt = load(DT_SEARCH)
except FileNotFoundError:
    # Only a missing cache triggers the (very expensive) search. Any other
    # failure (corrupt file, incompatible library version, interrupt) is
    # surfaced instead of being silently answered with a full re-run.

    # Create the output folders up front so that the search cannot finish
    # and then fail while saving its results.
    Path(DT_SEARCH).parent.mkdir(parents=True, exist_ok=True)
    Path(DT_BEST).parent.mkdir(parents=True, exist_ok=True)

    # No `scoring` is passed, so candidates are ranked with the estimator's
    # default score (R^2 for regressors). `refit=True` retrains the best
    # candidate on the whole training set; a metric name is only meaningful
    # for `refit` when several scorers are evaluated.
    search_dt = GridSearchCV(dt, param_grid, refit=True)
    search_dt.fit(X_train, Y_train)

    # Save the grid search results and the best regressor to file.
    dump(search_dt, DT_SEARCH)
    dump(search_dt.best_estimator_, DT_BEST)

# best_score_ is the mean cross-validated R^2 of the best parameter combination.
print("Best parameters (CV score=%0.3f):" % search_dt.best_score_)
print(search_dt.best_params_)

Best parameter (CV score=0.977):
{'criterion': 'friedman_mse', 'max_depth': 20}


## 2. Random Forest Regressor

In [None]:
# Every tree of the forest uses the depth cap and split criterion that the
# decision-tree grid search selected (max_depth=20, friedman_mse).
# max_features=1/3 makes each split consider one third of the features.
# The seed fixes the bootstrap samples and the feature subsets, so repeated
# runs of the search build the same forests.
rf = RandomForestRegressor(
    max_depth=20,
    criterion='friedman_mse',
    max_features=1/3,
    random_state=42,
)

# Only the number of trees is tuned for the forest.
param_grid = {
    'n_estimators' : [ 25, 50 ]
}

In [None]:
# Reuse the cached grid search when it exists; otherwise run it and cache it.
try:
    # NOTE: joblib.load unpickles arbitrary objects -- only load files that
    # were produced by this notebook.
    search_rf = load(RF_SEARCH)
except FileNotFoundError:
    # Only a missing cache triggers the (very expensive) search. Any other
    # failure (corrupt file, incompatible library version, interrupt) is
    # surfaced instead of being silently answered with a full re-run.

    # Create the output folders up front so that the search cannot finish
    # and then fail while saving its results.
    Path(RF_SEARCH).parent.mkdir(parents=True, exist_ok=True)
    Path(RF_BEST).parent.mkdir(parents=True, exist_ok=True)

    # No `scoring` is passed, so candidates are ranked with the estimator's
    # default score (R^2 for regressors). `refit=True` retrains the best
    # candidate on the whole training set; a metric name is only meaningful
    # for `refit` when several scorers are evaluated.
    search_rf = GridSearchCV(rf, param_grid, refit=True)
    search_rf.fit(X_train, Y_train)

    # Save the grid search results and the best regressor to file.
    dump(search_rf, RF_SEARCH)
    dump(search_rf.best_estimator_, RF_BEST)

# best_score_ is the mean cross-validated R^2 of the best parameter combination.
print("Best parameters (CV score=%0.3f):" % search_rf.best_score_)
print(search_rf.best_params_)