In [None]:
import numpy as np
import pandas as pd

# Toy regression dataset: a single feature column 'X' and a target column 'y'.
# Rows are ordered by X so that neighbouring values can be averaged into
# candidate split points.
data = pd.DataFrame(
    {'X': [1, 2, 3, 4, 5], 'y': [10, 12, 20, 30, 45]}
).sort_values('X')

# Mean Squared Error of a set of target values around their own mean
def mse(y):
    """Return the mean squared deviation of ``y`` from its own mean.

    The prediction for a node is taken to be the mean of ``y``, so this is
    the average squared error of that constant prediction.

    Parameters
    ----------
    y : array-like supporting element-wise arithmetic
        Target values (e.g. a pandas Series or NumPy array).

    Returns
    -------
    float
        Mean of ``(y - mean(y)) ** 2``.
    """
    centre = np.mean(y)
    squared_dev = (y - centre) ** 2
    return np.mean(squared_dev)




In [None]:
# Step 1: Candidate split points are the midpoints between neighbouring
# unique X values. unique() keeps order of appearance, so this expects
# `data` to already be ordered by X.
X_values = data['X'].unique()
midpoints = [(lo + hi) / 2 for lo, hi in zip(X_values[:-1], X_values[1:])]

print(f"All possible splits: {[float(x) for x in midpoints]}")

In [None]:
results = []

# Step 2 + 3: Try all splits and evaluate each (calculate total MSE)
for split in midpoints:
    # Partition the targets by the candidate threshold.
    left = data.loc[data['X'] <= split, 'y']
    right = data.loc[data['X'] > split, 'y']

    left_mse = mse(left)
    right_mse = mse(right)

    # Size-weighted average of the two child errors.
    total_mse = (len(left) * left_mse + len(right) * right_mse) / len(data)

    # Keep full precision here: the best split is selected from these values,
    # and rounding first could make distinct errors tie and pick the wrong one.
    results.append({
        'split': split,
        'left_samples': len(left),
        'right_samples': len(right),
        'left_mean': left.mean(),
        'right_mean': right.mean(),
        'left_mse': left_mse,
        'right_mse': right_mse,
        'total_mse': total_mse
    })

# Convert results to DataFrame; round only the copy that is displayed
results_df = pd.DataFrame(results)
results_df.sort_values('total_mse').round(2)

In [None]:
# step 4 - Choose the best split
# Find the row label with the smallest total_mse, then read its threshold.
best_row = results_df['total_mse'].idxmin()
best_split = results_df.loc[best_row, 'split']
print(f"Best split: {best_split}")

In [None]:
# step 5 - repeat recursively
#           X <= 3.5
#       /             \
#      T               F
#     /                 \
# mean: 14              37.5
# X: 1,2,3              4,5
#



In [None]:
# Repeat steps 1-4 on the left branch (the rows with X <= 3.5)

In [None]:
# Rows of the left branch (X: 1, 2, 3) and their targets.
# Sorted by X so that unique() below, which preserves order of appearance,
# yields the values in ascending order as the midpoint step requires.
data_l1 = pd.DataFrame({
    'X': [1, 2, 3],
    'y': [10, 12, 20]
}).sort_values('X')

# Step 1: Find all possible split points (midpoints between unique sorted values)
X_values_l1 = data_l1['X'].unique()
midpoints_l1 = [(lo + hi) / 2 for lo, hi in zip(X_values_l1[:-1], X_values_l1[1:])]

print(f"All possible splits: {[float(x) for x in midpoints_l1]}")

In [None]:
results_l1 = []

# Step 2 + 3: Try all splits and evaluate each (calculate total MSE)
for split in midpoints_l1:
    # Partition the targets by the candidate threshold.
    left = data_l1.loc[data_l1['X'] <= split, 'y']
    right = data_l1.loc[data_l1['X'] > split, 'y']

    left_mse = mse(left)
    right_mse = mse(right)

    # Size-weighted average of the two child errors.
    total_mse = (len(left) * left_mse + len(right) * right_mse) / len(data_l1)

    # Keep full precision here: the best split is selected from these values,
    # and rounding first could make distinct errors tie and pick the wrong one.
    results_l1.append({
        'split': split,
        'left_samples': len(left),
        'right_samples': len(right),
        'left_mean': left.mean(),
        'right_mean': right.mean(),
        'left_mse': left_mse,
        'right_mse': right_mse,
        'total_mse': total_mse
    })

# Convert results to DataFrame; round only the copy that is displayed
results_df_l1 = pd.DataFrame(results_l1)
results_df_l1.sort_values('total_mse').round(2)

In [None]:
# step 4 - Choose the best split
# Find the row label with the smallest total_mse, then read its threshold.
best_row_l1 = results_df_l1['total_mse'].idxmin()
best_split_l1 = results_df_l1.loc[best_row_l1, 'split']
print(f"Best split: {best_split_l1}")

In [None]:
# step 5 - repeat recursively
#                      X <= 3.5
#                  /             \
#                 T               F
#                /                 \
#             X <= 2.5          [mean: 37.5]
#             /     \           [X: 4,5]
#            T       F
#       [mean: 11]  [mean: 20]
#       [X: 1,2]    [X: 3]

