# Import

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

# Helper Functions

In [3]:
def create_leaf(y, ml_task):
    
    if ml_task == "regression":
        leaf = float(np.mean(y))
    else:
        counts = y.value_counts().reset_index()
        leaf = counts.iloc[0,0]
    
    return leaf


def get_potential_splits(data):
    
    X = data.drop(columns='target')
    potential_splits = {}
    columns = X.columns.tolist()
    for column in columns:

        values = X[[column]]
        unique_values = np.unique(values)
        
        potential_splits[column] = unique_values - 1
    
    return potential_splits


def calculate_gini(y):
    
    counts = y.value_counts().to_numpy()
    probabilities = counts / counts.sum()
    gini = np.sum(probabilities*(1-probabilities))
     
    return gini


def calculate_mse(y):
    
    if len(y) == 0:
        mse = 0
    else:
        mse = np.mean((y - np.mean(y)) **2)
    
    return mse


def total_impurity(data_left, data_right, metric_function):\

    n = len(data_left) + len(data_right)
    prop_left = len(data_left) / n
    prop_right = len(data_right) / n

    overall_metric =  (prop_left * metric_function(data_left['target']) 
                     + prop_right * metric_function(data_right['target']))
    
    return overall_metric


def split_data(data, column_types, split_column, split_value):
    
    type_of_feature = column_types[split_column]

    if type_of_feature == "continuous":
        data_left = data[data[split_column] <= split_value]
        data_right = data[data[split_column] >  split_value]
    
    else:
        data_left = data[data[split_column] == split_value]
        data_right = data[data[split_column] != split_value]
    
    return data_left, data_right


def determine_best_split(data, column_types, potential_splits, ml_task):

    best_overall_metric = np.inf
    for column, splits in potential_splits.items():
        for split in splits:
            
            data_left, data_right = split_data(data, column_types, split_column=column, split_value=split)
            
            if ml_task == "regression":
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_mse)
            else:
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_gini)
            
            if node_impurity <= best_overall_metric:
                best_overall_metric = node_impurity
                best_split_column = column
                best_split_value = split
    
    return best_split_column, best_split_value

# Algorithm

In [4]:
def decision_tree_algorithm(df, column_types, ml_task, min_samples=2, max_depth=5):
    
    leaves = []
    path = 'root'
    datasets = [(df,path)]
    split_conditions = []
    for current_depth in range(max_depth+1):
        next_set = []
        for dataset in datasets:
            data = dataset[0]
            path = dataset[1]
            
            if (len(data.target.unique()) == 1) or (len(data) < min_samples):
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue

            potential_splits = get_potential_splits(data)
            split_column, split_value = determine_best_split(data, column_types, potential_splits, ml_task)
            data_left, data_right = split_data(data, column_types, split_column, split_value)

            if len(data_left) == 0 or len(data_right) == 0:
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue
            print(len(data_left),len(data_right))
            split_conditions.append((path,split_column,split_value))
            next_set.append((data_left,path+',l'))
            next_set.append((data_right,path+',r'))

        datasets = next_set

    for dataset in datasets:
        data = dataset[0]
        path = dataset[1]
        leaf = create_leaf(data[['target']], ml_task)
        leaves.append((path,leaf))

    return leaves, split_conditions

# Make predictions with decision tree

def make_predictions(df, column_types, leaves, split_conditions):

    df['path'] = 'root'
    df['value'] = 0
    
    for split_condition in split_conditions:
        path = split_condition[0]
        column = split_condition[1]
        value = split_condition[2]

        if column_types[column] == "continuous":
            df.loc[(df['path']==path)&(df[column]<= value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]> value),'path'] = path+',r'
        else:
            df.loc[(df['path']==path)&(df[column]== value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]!= value),'path'] = path+',r'

    df['prediction'] = df['path'].map(dict(leaves))

    return df


def calculate_accuracy(df, column_types, ml_task, leaves, split_conditions):
    predictions = make_predictions(df, column_types, leaves, split_conditions).prediction
    
    if ml_task == 'regression':    
        predictions_array = predictions.values
        target_array = df.target.values
        metric = np.sqrt(sum((predictions_array - target_array)**2) / len(predictions_array))
        
    else:
        predictions_correct = predictions == df.target
        metric = predictions_correct.mean()
    
    return  metric

# Data Loading & Preprocessing

In [20]:
## Read csvs
train_df = pd.read_csv('500_Person_Gender_Height_Weight_Index.csv')

# Filling  NA
train_df = train_df.fillna(0)
train_df = train_df.rename(columns={'Height':'target'})

# categorical variable encoding
labelencoder = LabelEncoder()
train_df['Gender'] = labelencoder.fit_transform(train_df['Gender'])

# train-test split
train, test = train_test_split(train_df, test_size = 0.2, random_state=50)
column_types = {'Gender':'categorical','Weight':'continuous','Index':'categorical'}
ml_task = 'regression'

# Model Training

In [22]:
leaves, split_conditions = decision_tree_algorithm(train, column_types, ml_task, min_samples=2, max_depth=6)

19 381
11 8
96 285
4 7
3 5
36 60
48 237
1 3
2 5
2 1
4 1
11 25
17 43
20 28
10 227


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


1 2
2 3
3 1
6 5
13 12
14 3
25 18
9 11
26 2
4 6
24 203
1 1
2 1
4 2
3 2
8 5
7 5
6 8
2 1
14 11
13 5
1 8
3

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


 8
15 11
1 1
3 1
3 3
8 16
57 146
1 3
1 1
2 1
1 1
5 3
2 3
2 5
2 3
2

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


 4
5 3
9 5
10 1
6 7
3 2
4 4
2 1
6 2
7 8
6 5


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


1 2
1 2
1 7
11 5
25 32
58 88


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=

In [23]:
dict(leaves)

{'root,l,l,l,l': 168.0,
 'root,l,l,r,l': 179.0,
 'root,l,r,l,l': 194.0,
 'root,l,r,l,r': 195.0,
 'root,l,r,r,r': 196.0,
 'root,l,l,l,r,l': 178.0,
 'root,l,l,r,r,l': 180.0,
 'root,l,r,r,l,l': 189.66666666666666,
 'root,l,r,r,l,r': 188.0,
 'root,l,l,l,r,r,l': 170.0,
 'root,l,l,l,r,r,r': 176.0,
 'root,l,l,r,r,r,l': 186.0,
 'root,l,l,r,r,r,r': 185.0,
 'root,r,l,r,l,r,l': 176.5,
 'root,r,l,r,l,r,r': 181.0,
 'root,r,r,l,l,l,l': 150.0,
 'root,r,r,l,r,r,l': 192.0,
 'root,r,r,l,r,r,r': 199.0,
 'root,r,r,r,l,l,l': 190.33333333333334,
 'root,r,r,r,l,l,r': 198.0,
 'root,r,l,l,l,l,l,l': 144.0,
 'root,r,l,l,l,l,l,r': 148.33333333333334,
 'root,r,l,l,l,l,r,l': 142.0,
 'root,r,l,l,l,l,r,r': 146.0,
 'root,r,l,l,l,r,l,l': 153.5,
 'root,r,l,l,l,r,l,r': 153.0,
 'root,r,l,l,l,r,r,l': 144.0,
 'root,r,l,l,l,r,r,r': 154.0,
 'root,r,l,l,r,l,l,l': 164.6,
 'root,r,l,l,r,l,l,r': 153.33333333333334,
 'root,r,l,l,r,l,r,l': 148.5,
 'root,r,l,l,r,l,r,r': 163.66666666666666,
 'root,r,l,l,r,r,l,l': 174.5,
 'root,r,l,l,

# Prediction

In [24]:
predict = make_predictions(test, column_types, leaves, split_conditions)
predict

Unnamed: 0,Gender,target,Weight,Index,path,value,prediction
331,1,142,71,4,"root,r,l,l,l,l,r,r",0,146.000000
374,1,174,95,4,"root,r,l,l,r,l,r,r",0,163.666667
434,1,165,62,2,"root,r,r,l,l,r,r,l",0,171.500000
354,1,190,50,0,"root,r,r,r,l,l,r",0,198.000000
345,0,184,106,4,"root,r,l,r,l,l,l,l",0,179.500000
...,...,...,...,...,...,...,...
115,1,148,60,3,"root,r,r,r,r,l,l,r",0,150.857143
102,1,161,155,5,"root,r,r,r,r,r,r,r",0,166.306818
65,0,179,158,5,"root,r,r,r,r,r,r,r",0,166.306818
87,1,145,117,5,"root,r,r,r,r,r,r,l",0,160.206897


In [25]:
RMSE = calculate_accuracy(test, column_types, ml_task, leaves, split_conditions)
RMSE

10.787737305387969

# Comparison with sklearn

In [26]:
X_train = train[['Index','Weight','Gender']]
y_train = train[['target']]
X_test = test[['Index','Weight','Gender']]
y_test = test[['target']]

In [27]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train, sample_weight=None, check_input=True)
y_pred = clf.predict(X_test)
rmse_sklearn = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE with decision tree: {:.2f}\nRMSE with decision tree in sklearn: {:.2f}'.format(RMSE,rmse_sklearn))

RMSE with decision tree: 10.79
RMSE with decision tree in sklearn: 13.82
