# Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Helper Functions

In [2]:
def create_leaf(y, ml_task):
    
    if ml_task == "regression":
        leaf = float(np.mean(y))
    else:
        counts = y.value_counts().reset_index()
        leaf = counts.iloc[0,0]
    
    return leaf


def get_potential_splits(data):
    
    X = data.drop(columns='target')
    potential_splits = {}
    columns = X.columns.tolist()
    for column in columns:

        values = X[[column]]
        unique_values = np.unique(values)
        
        potential_splits[column] = unique_values - 1
    
    return potential_splits


def calculate_gini(y):
    
    counts = y.value_counts().to_numpy()
    probabilities = counts / counts.sum()
    gini = np.sum(probabilities*(1-probabilities))
     
    return gini


def calculate_mse(y):
    
    if len(y) == 0:
        mse = 0
    else:
        mse = np.mean((y - np.mean(y)) **2)
    
    return mse


def total_impurity(data_left, data_right, metric_function):\

    n = len(data_left) + len(data_right)
    prop_left = len(data_left) / n
    prop_right = len(data_right) / n

    overall_metric =  (prop_left * metric_function(data_left['target']) 
                     + prop_right * metric_function(data_right['target']))
    
    return overall_metric


def split_data(data, column_types, split_column, split_value):
    
    type_of_feature = column_types[split_column]

    if type_of_feature == "continuous":
        data_left = data[data[split_column] <= split_value]
        data_right = data[data[split_column] >  split_value]
    
    else:
        data_left = data[data[split_column] == split_value]
        data_right = data[data[split_column] != split_value]
    
    return data_left, data_right


def determine_best_split(data, column_types, potential_splits, ml_task):

    best_overall_metric = np.inf
    for column, splits in potential_splits.items():
        for split in splits:
            
            data_left, data_right = split_data(data, column_types, split_column=column, split_value=split)
            
            if ml_task == "regression":
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_mse)
            else:
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_gini)
            
            if node_impurity <= best_overall_metric:
                best_overall_metric = node_impurity
                best_split_column = column
                best_split_value = split
    
    return best_split_column, best_split_value

# Algorithm

In [12]:
def decision_tree_algorithm(df, column_types, ml_task, min_samples=2, max_depth=5):
    
    leaves = []
    path = 'root'
    datasets = [(df,path)]
    split_conditions = []
    for current_depth in range(max_depth+1):
        next_set = []
        for dataset in datasets:
            data = dataset[0]
            path = dataset[1]
            
            if (len(data.target.unique()) == 1) or (len(data) < min_samples):
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue

            potential_splits = get_potential_splits(data)
            split_column, split_value = determine_best_split(data, column_types, potential_splits, ml_task)
            data_left, data_right = split_data(data, column_types, split_column, split_value)

            if len(data_left) == 0 or len(data_right) == 0:
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue
            print(len(data_left),len(data_right))
            split_conditions.append((path,split_column,split_value))
            next_set.append((data_left,path+',l'))
            next_set.append((data_right,path+',r'))

        datasets = next_set

    for dataset in datasets:
        data = dataset[0]
        path = dataset[1]
        leaf = create_leaf(data[['target']], ml_task)
        leaves.append((path,leaf))

    return leaves, split_conditions

# Make predictions with decision tree

def make_predictions(df, column_types, leaves, split_conditions):

    df['path'] = 'root'
    df['value'] = 0
    
    for split_condition in split_conditions:
        path = split_condition[0]
        column = split_condition[1]
        value = split_condition[2]

        if column_types[column] == "continuous":
            df.loc[(df['path']==path)&(df[column]<= value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]> value),'path'] = path+',r'
        else:
            df.loc[(df['path']==path)&(df[column]== value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]!= value),'path'] = path+',r'

    df['prediction'] = df['path'].map(dict(leaves))

    return df


def calculate_accuracy(df, column_types, ml_task, leaves, split_conditions):
    predictions = make_predictions(df, column_types, leaves, split_conditions).prediction
    
    if ml_task == 'regression':    
        predictions_array = predictions.values
        target_array = df.target.values
        metric = np.sqrt(sum((predictions_array - target_array)**2) / len(predictions_array))
        
    else:
        predictions_correct = predictions == df.target
        metric = predictions_correct.mean()
    
    return  metric

# Data Loading & Preprocessing

In [31]:
## Read csvs
train_df = pd.read_csv('500_Person_Gender_Height_Weight_Index.csv', index_col=0)

In [32]:
train_df = train_df.fillna(0)
train_df = train_df.rename(columns={'Height':'target'})
train, val = train_test_split(train_df, test_size = 0.2)
column_types = {'Gender':'categorical','Weight':'continuous','Index':'categorical'}
ml_task = 'regression'

# Model Training

In [33]:
leaves, split_conditions = decision_tree_algorithm(train, column_types, ml_task, min_samples=2, max_depth=6)

17 383
9 8
7 376
1 8
6 2
56 320
3

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


 5
2 4
22 34
96 224
2 1
2 3
2 2
8 14
13 21
35

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


 61
58 166
1 1
2 1
3 5
9 5
9 4
20 1
14 21
23

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


 38
26 32
77 89
1 2
2 3
8 1
4 1
2 7
1 3
15 5
11 3
12 9
8

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


 15
35 3
12 14
16 16
3 74
14 75


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=

In [34]:
dict(leaves)

{'root,r,l': 191.0,
 'root,l,l,l': 168.0,
 'root,l,r,r': 195.5,
 'root,l,r,l,l': 194.0,
 'root,l,l,r,l,r': 179.0,
 'root,l,l,r,r,l': 186.0,
 'root,l,r,l,r,l': 191.0,
 'root,l,r,l,r,r': 190.5,
 'root,l,l,r,l,l,l': 178.0,
 'root,l,l,r,l,l,r': 176.0,
 'root,l,l,r,r,r,l': 180.0,
 'root,l,l,r,r,r,r': 185.0,
 'root,r,r,l,r,r,r': 168.0,
 'root,r,r,l,l,l,l,l': 150.0,
 'root,r,r,l,l,l,l,r': 149.0,
 'root,r,r,l,l,l,r,l': 151.0,
 'root,r,r,l,l,l,r,r': 152.33333333333334,
 'root,r,r,l,l,r,l,l': 161.125,
 'root,r,r,l,l,r,l,r': 162.0,
 'root,r,r,l,l,r,r,l': 167.0,
 'root,r,r,l,l,r,r,r': 170.0,
 'root,r,r,l,r,l,l,l': 178.5,
 'root,r,r,l,r,l,l,r': 180.42857142857142,
 'root,r,r,l,r,l,r,l': 187.0,
 'root,r,r,l,r,l,r,r': 182.33333333333334,
 'root,r,r,l,r,r,l,l': 189.0,
 'root,r,r,l,r,r,l,r': 195.0,
 'root,r,r,r,l,l,l,l': 147.0,
 'root,r,r,r,l,l,l,r': 152.33333333333334,
 'root,r,r,r,l,l,r,l': 158.16666666666666,
 'root,r,r,r,l,l,r,r': 164.66666666666666,
 'root,r,r,r,l,r,l,l': 171.75,
 'root,r,r,r,l,r,

# Prediction

In [35]:
predict = make_predictions(val, column_types, leaves, split_conditions)
predict

Unnamed: 0_level_0,target,Weight,Index,path,value,prediction
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,164,160,5,"root,r,r,r,r,r,r,r",0,164.853333
Male,193,130,4,"root,r,r,r,l,r,r,l",0,185.685714
Male,151,154,5,"root,r,r,r,r,r,r,r",0,164.853333
Female,141,136,5,"root,r,r,r,r,r,l,r",0,156.959459
Female,154,96,5,"root,r,r,r,r,r,l,r",0,156.959459
...,...,...,...,...,...,...
Male,155,57,2,"root,r,r,r,r,l,l,r",0,165.500000
Male,187,62,1,"root,l,r,l,r,l",0,191.000000
Female,183,96,3,"root,r,r,l,r,r,l,l",0,189.000000
Male,144,108,5,"root,r,r,r,r,r,l,r",0,156.959459


In [36]:
RMSE = calculate_accuracy(val, column_types, ml_task, leaves, split_conditions)
RMSE

12.307771159922709