In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Price'] = data.target

In [5]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
df.shape


(20640, 9)

In [7]:
df = df.sample(frac = 0.2)


In [8]:
df.shape

(4128, 9)

In [9]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]


In [10]:
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
7125,0.4999,36.0,2.500000,0.833333,15.0,2.500000,33.90,-118.04
19325,3.7143,49.0,6.201087,1.298913,505.0,2.744565,38.53,-122.94
13341,6.7544,11.0,6.966805,1.045643,1690.0,3.506224,34.02,-117.66
17095,3.9290,36.0,4.678241,1.002315,1117.0,2.585648,37.47,-122.24
4256,2.3250,18.0,3.681900,1.165771,2103.0,1.884409,34.10,-118.35
...,...,...,...,...,...,...,...,...
17818,3.7622,16.0,4.874797,0.980488,1606.0,2.611382,37.40,-121.90
2203,3.3239,9.0,5.610955,1.092697,1691.0,2.375000,36.82,-119.85
12471,1.7765,40.0,3.949008,1.124646,689.0,1.951841,38.58,-121.46
10582,2.1719,17.0,5.808743,1.032787,363.0,1.983607,33.73,-117.81


In [11]:
y

7125     1.625
19325    1.488
13341    2.079
17095    3.031
4256     2.500
         ...  
17818    1.500
2203     0.913
12471    1.098
10582    2.613
9972     1.844
Name: Price, Length: 4128, dtype: float64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

In [14]:
y_pred = model.predict(X_test)
y_pred

array([0.325  , 0.55   , 2.742  , 2.553  , 0.598  , 3.021  , 1.268  ,
       0.668  , 2.006  , 2.125  , 1.144  , 4.352  , 5.00001, 1.926  ,
       4.01   , 0.924  , 2.632  , 1.602  , 5.00001, 1.844  , 3.611  ,
       1.565  , 2.791  , 1.115  , 1.824  , 0.924  , 1.249  , 2.633  ,
       0.702  , 1.864  , 2.51   , 2.601  , 2.531  , 2.642  , 2.475  ,
       1.161  , 4.     , 2.056  , 2.242  , 2.345  , 0.875  , 1.698  ,
       1.599  , 1.128  , 5.00001, 0.955  , 1.602  , 1.305  , 3.975  ,
       0.981  , 1.641  , 1.569  , 4.328  , 2.575  , 0.956  , 1.92   ,
       3.224  , 1.599  , 5.00001, 3.147  , 2.717  , 2.175  , 5.00001,
       1.323  , 1.62   , 1.381  , 1.852  , 2.879  , 1.132  , 2.091  ,
       1.127  , 3.518  , 2.559  , 1.101  , 1.81   , 3.152  , 0.925  ,
       2.317  , 5.00001, 1.151  , 2.165  , 1.471  , 1.5    , 1.75   ,
       2.207  , 1.292  , 1.241  , 1.65   , 2.2    , 3.272  , 2.345  ,
       0.618  , 2.93   , 4.93   , 5.00001, 2.715  , 2.898  , 1.273  ,
       3.267  , 1.54

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred) #to improve the score u can remove the fraction part

0.4832569754343834

In [None]:
#hyerparameter tuning
params = {
    'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
    'splitter' : ['best', 'random'],
    'max_depth': [1,2,3,4,10],
    'max_features': ['auto', 'sqrt', 'log2'],
}

In [17]:
regressor = DecisionTreeRegressor(random_state=42)

In [20]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=regressor, param_grid=params, cv=5, scoring= 'neg_mean_squared_error', verbose=3)

In [21]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END criterion=mse, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 2/5] END criterion=mse, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 3/5] END criterion=mse, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 4/5] END criterion=mse, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 5/5] END criterion=mse, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 1/5] END criterion=mse, max_depth=1, max_features=auto, splitter=random;, score=nan total time=   0.0s
[CV 2/5] END criterion=mse, max_depth=1, max_features=auto, splitter=random;, score=nan total time=   0.0s
[CV 3/5] END criterion=mse, max_depth=1, max_features=auto, splitter=random;, score=nan total time=   0.0s
[CV 4/5] END criterion=mse, max_depth=1, max_features=auto, splitter=random;, score=nan tota

In [23]:
grid_search.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 10,
 'max_features': 'log2',
 'splitter': 'best'}

In [24]:
grid_search.best_score_

np.float64(-0.5821100028437753)

In [25]:
grid_search.best_estimator_.predict(X_test)

array([0.741     , 1.13042857, 2.934     , 1.21742857, 0.73212245,
       3.19390719, 1.58376316, 0.90921429, 2.02541304, 1.38227273,
       1.21056667, 2.11506667, 3.55631   , 3.12657143, 3.337     ,
       1.23508571, 3.19390719, 1.528     , 3.337     , 1.50368889,
       2.70874736, 1.86591667, 2.6418    , 1.00335385, 1.22095455,
       0.90617117, 1.59958974, 2.58666667, 0.73212245, 2.70874736,
       2.18633333, 2.16367797, 2.2253    , 3.815     , 2.52733333,
       1.43001587, 2.9       , 1.06033333, 2.02541304, 2.16367797,
       1.23508571, 2.5815    , 1.56653571, 1.56794805, 4.83951974,
       2.583     , 1.81868   , 1.56794805, 3.57341708, 0.9519697 ,
       1.561     , 1.46681818, 2.553     , 2.15323077, 1.56653571,
       1.67808   , 2.70874736, 1.82302174, 2.402125  , 1.198     ,
       2.89081818, 2.11506667, 2.70874736, 0.90921429, 1.82302174,
       1.4685625 , 0.83386792, 3.37825   , 1.52011765, 2.02541304,
       1.62532203, 2.8406    , 1.67808   , 1.50368889, 1.82302

In [26]:
y_pred = grid_search.best_estimator_.predict(X_test)
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.568431194774702