In [6]:
import numpy as np 
import pandas as pd 
import matplotlib.axes as ax
import matplotlib.pyplot as plt 

from sklearn import datasets 
import warnings 
warnings.filterwarnings("ignore")

In [10]:
data = datasets.fetch_california_housing()

In [12]:
data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

# Data

In [14]:
x = data.data
y = data.target

In [17]:
features = data.feature_names
features

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

# Train test Split

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 42)

In [19]:
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(16512, 8) (4128, 8)
(16512,) (4128,)


# Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(100)
rf = rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

In [25]:
y_pred.shape

(4128,)

In [27]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_pred, y_test)

0.2525828701198457

# K-fold Cross Validation

In [33]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle = False)

for train_index, test_index in kfold.split(x):
    print("Train -\n", train_index, "\nTest -\n", test_index)
    print("==========="*5,"\n")

Train -
 [ 4128  4129  4130 ... 20637 20638 20639] 
Test -
 [   0    1    2 ... 4125 4126 4127]

Train -
 [    0     1     2 ... 20637 20638 20639] 
Test -
 [4128 4129 4130 ... 8253 8254 8255]

Train -
 [    0     1     2 ... 20637 20638 20639] 
Test -
 [ 8256  8257  8258 ... 12381 12382 12383]

Train -
 [    0     1     2 ... 20637 20638 20639] 
Test -
 [12384 12385 12386 ... 16509 16510 16511]

Train -
 [    0     1     2 ... 16509 16510 16511] 
Test -
 [16512 16513 16514 ... 20637 20638 20639]



# Again Random Forest 

In [37]:
rf1 = RandomForestRegressor()

n_est = [100, 200, 500]

for n_estimator in n_est :
    error = []
    for train_index, test_index in kfold.split(x):
        x_train, x_test = x[train_index],x[test_index]
        y_train, y_test = y[train_index],y[test_index]

        rf1 = RandomForestRegressor(100)
            
        rf1 = rf1.fit(x_train, y_train)
        
        y_pred = rf1.predict(x_test)
        
        error.append (mean_squared_error(y_test,y_pred))    
    print("Results for n_estimators: ", n_estimator)
    print(error)
    print("Average Error = ", sum(error)/len(error))

Results for n_estimators:  100
[0.5240164745949119, 0.35165922724083737, 0.37281511070190304, 0.45204961553161466, 0.46271027195271447]
Average Error =  0.4326501400043963
Results for n_estimators:  200
[0.5185771541092332, 0.3481788755578832, 0.37535227836059015, 0.4452797311440602, 0.46730158649402875]
Average Error =  0.4309379251331591
Results for n_estimators:  500
[0.5303652356257684, 0.3452342553026066, 0.3763786112303512, 0.4518901174234504, 0.4707077694804153]
Average Error =  0.43491519781251836


In [36]:
error

[0.5320180461124817,
 0.34481537637913023,
 0.3749978383705902,
 0.44723757433048317,
 0.461729349216705]