In [1]:
# One-time setup (kept commented out): install dtreeviz into the kernel's environment.
# %pip install dtreeviz==2.2.2

Collecting dtreeviz
  Downloading dtreeviz-2.2.2-py3-none-any.whl.metadata (2.4 kB)
Collecting graphviz>=0.9 (from dtreeviz)
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 0.0/47.0 kB ? eta -:--:--
     ---------------------------------------- 47.0/47.0 kB 1.2 MB/s eta 0:00:00
Collecting colour (from dtreeviz)
  Downloading colour-0.1.5-py2.py3-none-any.whl (23 kB)
Collecting pytest (from dtreeviz)
  Downloading pytest-7.4.3-py3-none-any.whl.metadata (7.9 kB)
Collecting iniconfig (from pytest->dtreeviz)
  Downloading iniconfig-2.0.0-py3-none-any.whl (5.9 kB)
Collecting pluggy<2.0,>=0.12 (from pytest->dtreeviz)
  Downloading pluggy-1.3.0-py3-none-any.whl.metadata (4.3 kB)
Downloading dtreeviz-2.2.2-py3-none-any.whl (91 kB)
   ---------------------------------------- 0.0/91.8 kB ? eta -:--:--
   ---------------------------------------- 91.8/91.8 kB 5.1 MB/s eta 0:00:00
Downloading pytest-7.4.3-py3-none-any.whl (325 kB)
   ------------

In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import dtreeviz
import graphviz
import graphviz.backend as be
from IPython.display import Image, display_svg, SVG
import warnings
warnings.filterwarnings( "ignore", module = "matplotlib\..*" )

In [2]:
# Notebook plotting configuration: inline figures in 'retina' format with Times New Roman text.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import matplotlib.font_manager
fm = matplotlib.font_manager
# Clear the cache of matplotlib's fontconfig font lookup.
# NOTE(review): `_get_fontconfig_fonts` is a private helper (leading underscore) and may
# not exist in every matplotlib release -- confirm against the installed version.
# Presumably done so a newly installed font is found; verify this is still needed.
fm._get_fontconfig_fonts.cache_clear()
# NOTE(review): assumes the 'Times New Roman' font is installed on this machine -- confirm.
plt.rcParams['font.family'] = 'Times New Roman'

## Loading data

In [6]:
# Read the cleaned dataset (outliers already removed) and show it as the cell output.
# NOTE(review): the rendered table lists an `Unnamed: 0` column. If that is a row index
# saved into the CSV rather than a real feature, it ends up among the predictors
# further down -- confirm, and consider reading with `index_col=0`.
df = pd.read_csv(filepath_or_buffer='../data/cleaned/data_without_outlier.csv')
df

Unnamed: 0,sqft_above,sqft_lot15,sqft_living15,waterfront,view,condition,grade,bedrooms,bathrooms,sqft_basement_cat,floors,yr_built_cat,zip,price
0,1180.0,5650.0,1340.0,0,0,3,7,3,1.00,1,1.0,1,1,221900
1,2170.0,7639.0,1690.0,0,0,3,7,3,2.25,2,2.0,1,1,538000
2,770.0,8062.0,2720.0,0,0,3,6,2,1.00,1,1.0,1,0,180000
3,1050.0,5000.0,1360.0,0,0,5,7,4,3.00,3,1.0,1,1,604000
4,1680.0,7503.0,1800.0,0,0,3,8,3,2.00,1,1.0,1,0,510000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18853,1530.0,1509.0,1530.0,0,0,3,10,6,3.25,1,2.0,1,0,925850
18854,2310.0,7200.0,1830.0,0,0,4,7,2,1.00,2,1.0,1,0,255500
18855,1020.0,2007.0,1020.0,0,0,3,10,3,2.50,1,2.0,1,0,450000
18856,1600.0,1287.0,1410.0,0,0,3,7,3,1.00,1,1.0,1,0,280000


In [8]:
# Separate the target column ("price") from the predictor columns.
y = df["price"]
X = df.drop("price", axis=1)


In [9]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# DataFrame versions of both feature splits (with the original column names),
# kept for visualizing the decision tree.
X_train_df, X_test_df = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test)
)

We will fit a decision tree and use cross-validation to assess how well the model fits.

In [12]:
# Baseline regression tree.
# max_features=6 makes every split consider a random subset of 6 features, so the
# estimator is seeded (same seed as the train/test split) to keep the
# cross-validation scores repeatable from run to run.
regr = DecisionTreeRegressor(max_depth=5,
                             criterion='squared_error',
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features=6,
                             random_state=1)

In [13]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the baseline tree on the training split.
# The returned dict (fit/score timings and per-fold test scores) is the cell output.
results = cross_validate(estimator=regr, X=X_train, y=y_train, cv=5)
results

{'fit_time': array([0.02433276, 0.02333736, 0.02519774, 0.01773262, 0.01867509]),
 'score_time': array([0.00199962, 0.00458312, 0.00199962, 0.0019989 , 0.        ]),
 'test_score': array([0.50196216, 0.44547676, 0.54386328, 0.54177872, 0.51316575])}

In [14]:
# Show the per-fold test scores, then their mean rounded to two decimals.
print(results['test_score'])
print(f"The average R2 over the folds is: {results['test_score'].mean():.2f}")

[0.50196216 0.44547676 0.54386328 0.54177872 0.51316575]
The average R2 over the folds is: 0.51


It's poorer than the value we got with linear regression.

Let's use grid search to look for better-performing hyperparameters.

In [15]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter explored by the grid search.

# Maximum tree depths to try.
max_depth_choices = [3, 5]
# Split-quality criteria to try.
criterion_choices = ['squared_error', 'absolute_error']
# Values of min_samples_split to try.
min_samples_split_choices = [2, 10]
# Values of min_samples_leaf to try.
min_samples_leaf_choices = [2, 10]

In [18]:
# Hyperparameter grid: each estimator argument name mapped to its candidate values.
grid = dict(
    max_depth=max_depth_choices,
    criterion=criterion_choices,
    min_samples_split=min_samples_split_choices,
    min_samples_leaf=min_samples_leaf_choices,
)

In [19]:
# Search over every combination in `grid`, evaluating each with 5-fold cross-validation.
model = DecisionTreeRegressor()
grid_search = GridSearchCV(model, grid, cv=5)

In [20]:
# Run the grid search on the training split only.
grid_search.fit(X=X_train, y=y_train)

In [21]:
# Best mean cross-validated score found by the grid search.
print(f"The best R2 for the best hyperparameters is {grid_search.best_score_:.2f}")

The best R2 for the best hyperparameters is 0.56


In [22]:
# Score the best estimator found by the grid search on the held-out test split.
print(f"The performace of the DecisionTree using the best gridsearchcv hyperpameters is {grid_search.best_estimator_.score(X_test, y_test):.2f}")

The performace of the DecisionTree using the best gridsearchcv hyperpameters is 0.59


In [24]:
# Show the hyperparameter combination selected by the grid search.
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': 5,
 'min_samples_leaf': 10,
 'min_samples_split': 2}

Next, let's try a randomized search.

In [27]:
import numpy as np

In [28]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search.
# Each entry lists every value in the range of interest; RandomizedSearchCV does the
# random sampling from these lists. Pre-drawing a few unseeded random integers here
# would make the search space itself change on every run and shrink it to three
# values per hyperparameter.
max_depth_choices = list(range(1, len(X.columns)))     # 1 .. (number of columns - 1)
criterion_choices = ['squared_error', 'absolute_error']  # split-quality criteria to try
min_samples_split_choices = list(range(2, 10))         # 2 .. 9
min_samples_leaf_choices = list(range(2, 10))          # 2 .. 9
max_features_choices = list(range(1, len(X.columns)))  # 1 .. (number of columns - 1)

random_grid = {'max_depth': max_depth_choices,
               'criterion': criterion_choices,
               'min_samples_split': min_samples_split_choices,
               'min_samples_leaf': min_samples_leaf_choices,
               'max_features': max_features_choices}

In [30]:
# Both the tree (random feature subsets when max_features is limited) and the search
# (random choice of hyperparameter combinations) are stochastic, so both are seeded
# to make the selected hyperparameters and scores repeatable.
model = DecisionTreeRegressor(random_state=1)
# n_iter is the number of random hyperparameter combinations that will be evaluated.
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=random_grid,
                                   n_iter=25,
                                   cv=5,
                                   n_jobs=2,
                                   random_state=1)

In [31]:
# Run the randomized search on the training split only.
random_search.fit(X=X_train, y=y_train)

In [32]:
# Show the hyperparameter combination selected by the randomized search.
random_search.best_params_

{'min_samples_split': 7,
 'min_samples_leaf': 8,
 'max_features': 8,
 'max_depth': 9,
 'criterion': 'squared_error'}

In [33]:
# Best mean cross-validated score found by the randomized search.
print(f"The best R2 according to the random search is {random_search.best_score_:.2f}")

The best R2 according to the random search is 0.60


In [38]:
# Build the final tree from the hyperparameters chosen by the randomized search.
# Taking them straight from `best_params_` avoids hand-copying mistakes (the previously
# hard-coded min_samples_split did not match the reported best value) and keeps this
# cell in sync when the search is re-run. The tree is seeded because a limited
# max_features makes split selection random.
model = DecisionTreeRegressor(**random_search.best_params_, random_state=1)


In [39]:
# Fit the final tree on the training split and report its score on the held-out
# test split, rounded to two decimals.
model.fit(X_train, y_train)
# Built-in round() works whether score() returns a NumPy scalar or a plain Python
# float; the latter has no `.round` method, so calling it would raise AttributeError.
round(model.score(X_test, y_test), 2)

0.6

In [None]:
#save model