# Data loading

In [4]:
import pandas as pd
# Feature matrix for training. Judging by the file name this is presumably the
# output of an earlier preprocessing step -- TODO confirm.
housing = pd.read_csv("../data/train/housing_train_processed.csv")

# Only the target column is needed from the raw training file, so read just
# that column instead of parsing every column and discarding the rest.
# `usecols` with a list keeps the result a one-column DataFrame.
housing_labels = pd.read_csv("../data/train/housing_train.csv",
                             usecols=["median_house_value"])

# Features and labels live in two separate files and are paired purely by row
# position, so fail fast if the two files do not have the same number of rows.
assert len(housing) == len(housing_labels), (
    f"feature/label row count mismatch: {len(housing)} vs {len(housing_labels)}"
)


# Model fitting

In [5]:
from sklearn.tree import DecisionTreeRegressor

# Baseline: a decision tree with default hyperparameters (no depth limit),
# seeded so repeated runs build the same tree. `fit` returns the estimator
# itself, so construction and training can be chained.
tree_reg = DecisionTreeRegressor(random_state=42).fit(housing, housing_labels)
tree_reg

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [9]:
from sklearn.metrics import root_mean_squared_error
# Score the tree on the same rows it was trained on. This measures how well
# the model reproduces its training data, not how well it generalises; a
# value at or near zero is a sign of overfitting rather than of accuracy.
housing_predictions = tree_reg.predict(housing)
tree_rmse = root_mean_squared_error(
    y_true=housing_labels,
    y_pred=housing_predictions,
)
tree_rmse

0.0

In [10]:
# Peek at the first five target values to get a sense of the label scale
# against which the RMSE figures should be read.
housing_labels.values[:5]

array([[458300.],
       [483800.],
       [101700.],
       [ 96100.],
       [361800.]])

# Cross validation

In [11]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the baseline tree. The scorer reports the
# *negated* RMSE (scikit-learn scorers follow "higher is better"), so the
# leading minus turns the results back into ordinary positive RMSE values.
tree_rmses = -cross_val_score(
    estimator=tree_reg,
    X=housing,
    y=housing_labels,
    cv=10,
    scoring="neg_root_mean_squared_error",
)

In [12]:
# Summarise the per-fold RMSE values (count, mean, std, quartiles). Compare the
# mean here with the RMSE measured on the training data: a large gap between
# the two means the tree fits its training rows far better than unseen folds.
pd.Series(tree_rmses).describe()

count       10.000000
mean     67380.873339
std       3015.975551
min      63769.731106
25%      65074.952342
50%      67575.853331
75%      68353.957633
max      72686.603587
dtype: float64

# Hyperparameter tuning

In [14]:
from sklearn.model_selection import GridSearchCV
# Candidate hyperparameters: 4 depths x 3 split thresholds x 3 feature
# strategies = 36 combinations, each scored with 3-fold cross-validation.
param_grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    max_features=[None, 'sqrt', 'log2'],
)

# Same negated-RMSE scorer as the cross-validation above, so the search
# maximises the score, i.e. picks the combination with the lowest RMSE.
grid_search = GridSearchCV(
    estimator=tree_reg,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
grid_search.fit(housing, housing_labels)

0,1,2
,estimator,DecisionTreeR...ndom_state=42)
,param_grid,"{'max_depth': [None, 5, ...], 'max_features': [None, 'sqrt', ...], 'min_samples_split': [2, 5, ...]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [15]:
# Hyperparameter combination with the best mean cross-validated score
# (i.e. the lowest RMSE, since the scorer is negated RMSE).
grid_search.best_params_

{'max_depth': 10, 'max_features': None, 'min_samples_split': 10}

In [16]:
# Tabulate every grid-search candidate and rank them best-first. Scores are
# negated RMSE, so sorting descending puts the lowest-error candidates on top.
cv_res = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
20,0.105218,0.000336,0.000926,4e-05,10,,10,"{'max_depth': 10, 'max_features': None, 'min_s...",-60480.187728,-60376.623779,-60790.567653,-60549.126386,175.882132,1
18,0.107912,0.001848,0.001094,0.000132,10,,2,"{'max_depth': 10, 'max_features': None, 'min_s...",-60830.501486,-61626.698539,-61577.689778,-61344.963268,364.329209,2
19,0.108925,0.001216,0.001013,1.3e-05,10,,5,"{'max_depth': 10, 'max_features': None, 'min_s...",-60948.642892,-61641.017242,-61542.258477,-61377.306204,305.780422,3
35,0.026465,0.001721,0.00114,0.000143,15,log2,10,"{'max_depth': 15, 'max_features': 'log2', 'min...",-62234.359023,-62034.952484,-63618.846206,-62629.385904,704.374194,4
22,0.02443,0.000738,0.000827,1.8e-05,10,sqrt,5,"{'max_depth': 10, 'max_features': 'sqrt', 'min...",-61709.185934,-63855.195861,-64016.527223,-63193.636339,1051.729276,5


# Model saving

In [17]:
import pickle
# Persist the *tuned* model. GridSearchCV fits clones of the estimator it is
# given, so `tree_reg` is still the default-parameter tree trained at the top
# of the notebook. The tree refitted on the full training data with the
# winning hyperparameters is exposed as `grid_search.best_estimator_`
# (available because the search runs with its default refit=True).
# NOTE(review): the file name spells "Decission"; it is left unchanged here
# because other code may load this exact path -- rename in both places together.
with open('../models/DecissionTree_model.pkl', 'wb') as f:
    pickle.dump(grid_search.best_estimator_, f)