In [0]:
import numpy as np # importing numpy for numerical computation
from sklearn.datasets import load_boston # here we are using sklearn's boston dataset
from sklearn.metrics import mean_squared_error # importing mean_squared_error metric
from sklearn.tree import DecisionTreeRegressor
import random
from prettytable import PrettyTable

In [0]:
boston = load_boston()
x=boston.data #independent variables
y=boston.target #target variable

In [25]:
x.shape

(506, 13)

In [26]:
x[:5]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00],
       [3.2370e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        6.9980e+00, 4.5800e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9463e+02, 2.9400e+00],
       [6.9050e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        7.1470e+00, 5.4200e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9690e+02, 5.3300e+00]])

In [0]:
def generating_samples(input_data, target_data):

    '''In this function, we will write code for generating 30 samples '''
    # you can use random.choice to generate random indices without replacement
    # Please have a look at this link https://docs.scipy.org/doc/numpy-1.16.0/reference/generated/numpy.random.choice.html for more details
    # Please follow above pseudo code for generating samples 
    x = input_data
    y = target_data

    total_columns = [i for i in range(0, len( x[0] ) ) ]
    # Here, taking 3 columns in each sample
    selecting_columns = np.random.choice(total_columns ,size=3, replace=False )

    # getting 303 random rows
    indx_list_1st = [i for i in range(0, len( x ) ) ]
    no_of_1st_sample = int(len( x ) * 0.60)
    #
    selecting_rows = np.random.choice(indx_list_1st ,size=no_of_1st_sample , replace=False )
    sample_data = x[ selecting_rows ]
    sample_data = sample_data[0: , selecting_columns]

    # getting 203 random rows out of sample data
    indx_list_2nd = [i for i in range(0, len( sample_data ) ) ]
    no_of_2nd_sample = 1 + int(len( x ) * 0.40)
    #
    replicating_rows = np.random.choice(indx_list_2nd ,size=no_of_2nd_sample, replace=False )
    replicated_sample_data = sample_data[replicating_rows]

    #target
    target_of_sample_data = y[ selecting_rows ]
    target_of_replicated_sample_data = target_of_sample_data[ replicating_rows ]

    #temp = target_of_sample_data.reshape(-1,1)
    final_sample_data = np.vstack(( sample_data , replicated_sample_data ))
    final_target_data = np.vstack(( target_of_sample_data.reshape(-1,1) , target_of_replicated_sample_data.reshape(-1 , 1 ) ))

    return final_sample_data , final_target_data, selecting_rows, selecting_columns

In [0]:
# Use generating_samples function to create 30 samples 
# store these created samples in a list
list_input_data =[]
list_output_data =[]
list_selected_row= []
list_selected_columns=[]

def get_30_samples():
  list_input_data =[]
  list_output_data =[]
  list_selected_row= []
  list_selected_columns=[]

  for i in range(0,30):
    a,b,c,d = generating_samples(x, y)
    list_input_data.append(a)
    list_output_data.append(b)
    list_selected_row.append(c)
    list_selected_columns.append(d)

  return list_input_data, list_output_data, list_selected_row, list_selected_columns

# get 30 samples
list_input_data, list_output_data, selected_rows, selected_columns = get_30_samples()

In [0]:
def get_std_of_samples(list_input_data):
  input_data_a = np.array(list_input_data)
  std = input_data_a.std()
  #std / np.sqrt(len(list_input_data))
  return std

def get_mean_of_samples(input_data):
  input_data_a = np.array(input_data)
  mean = input_data_a.mean()
  #sample_mean.clear()
  #sample_mean.append(mean)
  return mean

def get_list_of_all_models():
  l_input_data, l_output_data, selected_rows, selected_columns = get_30_samples()
  #reset_mean_of_samples( list_input_data )
  list_input_data.clear()
  list_output_data.clear()
  list_selected_row.clear()
  list_selected_columns.clear()

  list_input_data.append( l_input_data )
  list_output_data.append( l_output_data )
  list_selected_row.append( selected_rows )
  list_selected_columns.append( selected_columns )

  list_of_all_models = []
  model = [0 for i in range(0, len(l_input_data))]
  for indx , input_data in enumerate(l_input_data):
    #print(indx)
    model[indx] = DecisionTreeRegressor(max_depth=None,ccp_alpha=0.001)
    model[indx] = model[indx].fit(input_data , l_output_data[indx] )
    list_of_all_models.append( model[indx] )
  return list_of_all_models

In [0]:
def column_sampling(size_of_sample):
    total_columns = [i for i in range(0, len( x[0] ) ) ]
    selecting_columns = np.random.choice(total_columns , size=size_of_sample , replace=False )
    return selecting_columns

list_of_models = []

def get_y_pred_for_all_models():
  list_of_all_models = get_list_of_all_models()
  list_of_models.clear()
  list_of_models.append( list_of_all_models )
  
  y_pred_list = []
  for indx , model in enumerate(list_of_all_models):
    column_sampled = column_sampling(3)
    y_pred_list.append(  model.predict(x[0: , column_sampled ]) )
  return y_pred_list

def get_mean_squared_error():
  y_pred_list = get_y_pred_for_all_models()
  
  y_pred = np.median(y_pred_list , axis = 0)

  #https://www.kaggle.com/shreayan98c/boston-house-price-prediction

  mean_squared_error1 = mean_squared_error(y, y_pred)

  return mean_squared_error1

In [0]:

def load_not_covered_datapoints():
  total_columns = [i for i in range(0, len( x[0] ) ) ]
  total_rows = [i for i in range(0, len( x ) ) ]

  not_covered_rows = []
  not_covered_columns = []
  # for OOB score, we are considering only those datapoints which are not incuded in previous prediction
  for idx , sample_rows in enumerate( list_selected_row[0] ):

    not_covered_rows.append( set(total_rows) - set(sample_rows) )
    
    not_covered_columns.append( set(total_columns) - set( list_selected_columns[0][idx] ) )

  return not_covered_rows , not_covered_columns


def get_all_oobmodels():
  not_covered_rows , not_covered_columns = load_not_covered_datapoints()

  list_of_all_oobmodels = []
  oobmodel = [0 for i in range(0, len(not_covered_rows))]

  for indx , rows in enumerate(not_covered_rows):
    #print(indx)
    oobmodel[indx] = DecisionTreeRegressor(max_depth=None, ccp_alpha=0.001)
    oob_sample_data = x[ list(not_covered_rows[indx]) ]
    oob_sample_data = oob_sample_data[0: , list(not_covered_columns[indx])]
    
    oobmodel[indx] = oobmodel[indx].fit( oob_sample_data , y[ list(not_covered_rows[indx]) ] )
    list_of_all_oobmodels.append( oobmodel[indx] )

  return list_of_all_oobmodels


def column_sampling_oob():
    total_columns = [i for i in range(0, len( x[0] ) ) ]
    selecting_columns = np.random.choice(total_columns ,size=10, replace=False )
    return selecting_columns

def get_y_pred_for_oob():
  list_of_all_oobmodels = get_all_oobmodels()
  y_pred_oob = []

  for indx , model in enumerate(list_of_all_oobmodels):
    column_sampled = column_sampling_oob()
    y_pred_oob.append(  model.predict(x[0: , column_sampled ]) )

  return y_pred_oob


def get_ooB_score():
  y_pred_ooB_score = get_y_pred_for_oob()

  y_pred_for_oob = np.median(np.asarray( y_pred_ooB_score ), axis = 0)

  #

  oob_score = mean_squared_error(y, y_pred_for_oob)
  return oob_score

In [0]:
MSE = []
OOB = []

def get_35_times_MSE_OOB():
  MSE = []
  OOB = []
  sample_mean_list = []
  sample_std_list = []

  for i in range(0,35):
    MSE.append( get_mean_squared_error() )
    OOB.append( get_ooB_score() )
    sample_mean_list.append( get_mean_of_samples( list_input_data[0] ) )
    sample_std_list.append( get_std_of_samples( list_input_data[0] ))

  return MSE , OOB , sample_mean_list , sample_std_list

MSE , OOB , sample_mean_list , sample_std_list = get_35_times_MSE_OOB()

In [39]:
sample_size = len(list_input_data[0])
left_limit = []
right_limit = []

#refrerence Central_Limit_theorem.ipynb
def compute_CI():
  left_limit.clear()
  right_limit.clear()

  for idx in range(len(MSE)):
    sample_mean = sample_mean_list[idx]
    sample_std = sample_std_list[idx]
    #left_limit.append( np.round(sample_mean - 2*(sample_std/np.sqrt(sample_size)), 3) )
    #right_limit.append( np.round(sample_mean + 2*(sample_std/np.sqrt(sample_size)), 3))
    left_limit.append( np.round(sample_mean - 2*(MSE[idx]/np.sqrt(sample_size)), 3) )
    right_limit.append( np.round(sample_mean + 2*(MSE[idx]/np.sqrt(sample_size)), 3))

compute_CI()

print("confidence interval with mean squared error")
t = PrettyTable()
t = PrettyTable(["#samples", "Sample Size", "Sample mean", "Mean SE","Left C.I","Right C.I"])
for i in range(len(MSE)):
    row = []
    row.append(i+1)
    row.append(sample_size)
    row.append(np.round(sample_mean_list[i] , 3) )
    row.append(np.round(MSE[i] , 3) )
    row.append(left_limit[i])
    row.append(right_limit[i])
    
    t.add_row(row)
print(t)

confidence interval with mean squared error
+----------+-------------+-------------+---------+----------+-----------+
| #samples | Sample Size | Sample mean | Mean SE | Left C.I | Right C.I |
+----------+-------------+-------------+---------+----------+-----------+
|    1     |      30     |    84.098   |  78.562 |  55.411  |  112.784  |
|    2     |      30     |    71.497   |  86.048 |  40.076  |  102.917  |
|    3     |      30     |    60.873   |  84.787 |  29.914  |   91.833  |
|    4     |      30     |    64.021   | 114.937 |  22.052  |   105.99  |
|    5     |      30     |    69.853   |  91.281 |  36.522  |  103.184  |
|    6     |      30     |    78.986   |  77.314 |  50.755  |  107.217  |
|    7     |      30     |    91.12    |  85.374 |  59.946  |  122.294  |
|    8     |      30     |   102.072   | 104.494 |  63.917  |  140.228  |
|    9     |      30     |    67.221   |  79.085 |  38.344  |   96.099  |
|    10    |      30     |    56.779   |  93.411 |  22.67   |   90.8

In [43]:
sample_size = len(list_input_data[0])
left_limit_oob = []
right_limit_oob = []

#refrerence Central_Limit_theorem.ipynb
def compute_CI_for_OOB():
  left_limit.clear()
  right_limit.clear()

  for idx in range(len(OOB)):
    sample_mean = sample_mean_list[idx]
    sample_std = sample_std_list[idx]
    #left_limit.append( np.round(sample_mean - 2*(sample_std/np.sqrt(sample_size)), 3) )
    #right_limit.append( np.round(sample_mean + 2*(sample_std/np.sqrt(sample_size)), 3))
    left_limit_oob.append( np.round(sample_mean - 2*(OOB[idx]/np.sqrt(sample_size)), 3) )
    right_limit_oob.append( np.round(sample_mean + 2*(OOB[idx]/np.sqrt(sample_size)), 3))

compute_CI_for_OOB()

print("confidence interval with OOB score")
tb = PrettyTable()
tb = PrettyTable(["#samples", "Sample Size", "Sample mean", "OOB score","Left C.I","Right C.I"])
for i in range(len(OOB)):
    row = []
    row.append(i+1)
    row.append(sample_size)
    row.append(np.round(sample_mean_list[i] , 3) )
    row.append(np.round(OOB[i] , 3) )
    row.append(left_limit_oob[i])
    row.append(right_limit_oob[i])
    
    tb.add_row(row)
print(tb)

confidence interval with OOB score
+----------+-------------+-------------+-----------+----------+-----------+
| #samples | Sample Size | Sample mean | OOB score | Left C.I | Right C.I |
+----------+-------------+-------------+-----------+----------+-----------+
|    1     |      30     |    67.097   |  151.307  |  11.847  |  122.346  |
|    2     |      30     |    75.757   |  136.327  |  25.977  |  125.537  |
|    3     |      30     |    84.721   |  118.547  |  41.434  |  128.009  |
|    4     |      30     |    40.821   |   70.877  |  14.941  |   66.702  |
|    5     |      30     |    70.86    |  227.899  | -12.357  |  154.077  |
|    6     |      30     |    51.85    |  102.319  |  14.488  |   89.212  |
|    7     |      30     |    72.597   |  112.719  |  31.438  |  113.756  |
|    8     |      30     |    57.007   |  193.566  | -13.673  |  127.687  |
|    9     |      30     |    73.238   |   61.636  |  50.731  |   95.744  |
|    10    |      30     |    46.729   |   89.092  | 

In [49]:
xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60]
xq = np.array(xq).reshape(1,-1)

y_prediction = []

for idx , each_model in enumerate(list_of_models[0]):
  column_sampled = column_sampling(3)
  y_prediction.append( each_model.predict( xq[ 0: ,column_sampled ] ) )


y_pred_median = np.median(np.asarray( y_prediction ), axis = 0)
print("predicted price for given query point is = {0}".format(y_pred_median[0]) )

predicted price for given query point is = 23.799999999999997
