Generate a random dataset of 20 Samples.  Each sample should have two inputs and 1 output label either 0 or 1.

In [1]:
# imports 
import random
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
 


In [2]:
# generate random data
np.random.seed(23)

# dataset of 2 features and y as the target
X = np.random.rand(20, 2)
y = np.random.choice([0, 1], size=(20, 1))


# dataset with 2 features and y as the target
data_set = np.concatenate((X, y.reshape(-1, 1)), axis=1)

print(data_set)


[[5.17297884e-01 9.46962604e-01 1.00000000e+00]
 [7.65459759e-01 2.82395844e-01 1.00000000e+00]
 [2.21045363e-01 6.86222085e-01 0.00000000e+00]
 [1.67139203e-01 3.92442466e-01 1.00000000e+00]
 [6.18052347e-01 4.11930095e-01 0.00000000e+00]
 [2.46488120e-03 8.84032182e-01 1.00000000e+00]
 [8.84947538e-01 3.00409689e-01 0.00000000e+00]
 [5.89581865e-01 9.78426916e-01 1.00000000e+00]
 [8.45093822e-01 6.50754391e-02 1.00000000e+00]
 [2.94744465e-01 2.87934441e-01 0.00000000e+00]
 [8.22466339e-01 6.26183038e-01 1.00000000e+00]
 [1.10477714e-01 5.28811169e-04 1.00000000e+00]
 [9.42166233e-01 1.41500758e-01 1.00000000e+00]
 [4.21596526e-01 3.46489440e-01 0.00000000e+00]
 [8.69785084e-01 4.28601812e-01 0.00000000e+00]
 [8.28751484e-01 7.17851838e-01 0.00000000e+00]
 [1.19226694e-01 5.96384173e-01 1.00000000e+00]
 [1.29756298e-01 7.75340917e-02 0.00000000e+00]
 [8.31205256e-01 4.64385615e-01 1.00000000e+00]
 [1.62012479e-01 5.47975292e-01 0.00000000e+00]]


Output the log-odds of the dataset. This will be the predicted output of the 0th decision tree

In [3]:
# output the log-odds of the dataset 

def log_odds(data_set):
    # calculate the log-odds
    # note p is the probability of the target being 1
    p = np.mean(data_set[:, -1])
    log_odds = np.log(p / (1 - p))
    return log_odds

print(log_odds(data_set))


    

0.2006706954621514


 Calculate and output the residual terms for each training data.

In [4]:
# calculate the residuals of the dataset
def residuals(data_set):
    # calculate the residuals
    log_odds_value = log_odds(data_set)
    residuals = data_set[:, -1] - log_odds_value
    return residuals

for i in range(0, 20):
    print(f"Sample {i}: {residuals(data_set)[i]}")

Sample 0: 0.7993293045378486
Sample 1: 0.7993293045378486
Sample 2: -0.2006706954621514
Sample 3: 0.7993293045378486
Sample 4: -0.2006706954621514
Sample 5: 0.7993293045378486
Sample 6: -0.2006706954621514
Sample 7: 0.7993293045378486
Sample 8: 0.7993293045378486
Sample 9: -0.2006706954621514
Sample 10: 0.7993293045378486
Sample 11: 0.7993293045378486
Sample 12: 0.7993293045378486
Sample 13: -0.2006706954621514
Sample 14: -0.2006706954621514
Sample 15: -0.2006706954621514
Sample 16: 0.7993293045378486
Sample 17: -0.2006706954621514
Sample 18: 0.7993293045378486
Sample 19: -0.2006706954621514


Fit a decision tree (decision tree 1) to the residuals. Fix the max depth of tree to 2(i.e., root is at level 0 and the leaves are at a maximum depth of 2). Output γj1 for each leaf node


Choose at least two samples from each leaf node and output their predicted values
using decision tree 1



In [5]:
# fit a decision tree to the residuals
X = data_set[:, :-1]
y = residuals(data_set)

# fit a decision tree to the residuals
model = DecisionTreeRegressor(max_depth=2)
model.fit(X, y)

# output the residuals for each leaf node
leaf_nodes = model.apply(X)
leaf_values = {}

# output the residuals for each leaf node
for i in np.unique(leaf_nodes):
    avg_residual = np.mean(y[leaf_nodes == i])
    leaf_values[i] = avg_residual
    
    print(f"Leaf {i}: {avg_residual}")
    

# (e) Choose at least two samples from each leaf node and output their predicted values
# using decision tree 1
for leaf_index, leaf_value in leaf_values.items():
    # select two samples from the leaf node
    samples = np.random.choice(np.where(leaf_nodes == leaf_index)[0], 2)
    for sample in samples:
        print(f"Leaf {leaf_index}, Sample {sample}: {leaf_value}")

Leaf 1: 0.7993293045378486
Leaf 3: -0.03400402879548476
Leaf 4: 0.4356929409014849
Leaf 1, Sample 5: 0.7993293045378486
Leaf 1, Sample 5: 0.7993293045378486
Leaf 3, Sample 13: -0.03400402879548476
Leaf 3, Sample 3: -0.03400402879548476
Leaf 4, Sample 15: 0.4356929409014849
Leaf 4, Sample 7: 0.4356929409014849


continue the process and train 9 more decision trees. For each decision tree k, output the values γjk corresponding to the leaf nodes.

For an example sample, show (in your python file) how you can predict using the decision trees combine their results using the γjk values

In [18]:
# train nine more decision trees on the residuals
models = []

for i in range(1,10):
    
    r = residuals(data_set) - model.predict(X)
    
    new_tree = DecisionTreeRegressor(max_depth=2)
    
    new_tree.fit(X, r)
    
    models.append(new_tree)
    
    leaf_nodes = new_tree.apply(X)
    
    for leaf_index in np.unique(leaf_nodes):
        avg_residual = np.mean(r[leaf_nodes == leaf_index])
        leaf_values[leaf_index] = avg_residual
        
        print(f"Leaf {leaf_index}: {avg_residual}")
        
        
# predict using the decision trees and combine the results using the residuals
predictions = np.zeros(data_set.shape[0])

for model in models:
    predictions += model.predict(X)
    
final_predictions = log_odds(data_set) + predictions

print(final_predictions)

        

Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: -0.6363636363636364
Leaf 6: 0.36363636363636365
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: 0.36363636363636365
Leaf 6: -0.6363636363636364
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: -0.6363636363636364
Leaf 6: 0.36363636363636365
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: -0.6363636363636364
Leaf 6: 0.36363636363636365
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: 0.36363636363636365
Leaf 6: -0.6363636363636364
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: 0.36363636363636365
Leaf 6: -0.6363636363636364
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: 0.36363636363636365
Leaf 6: -0.6363636363636364
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: -0.6363636363636364
Leaf 6: 0.36363636363636365
Leaf 2: 0.012121212121212132
Leaf 3: 0.36363636363636365
Leaf 5: 0.36363636363636365
Leaf 6: -0.