# Import

In [73]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Helper Functions

In [27]:
def create_leaf(y, ml_task):
    
    if ml_task == "regression":
        leaf = float(np.mean(y))
    else:
        counts = y.value_counts().reset_index()
        leaf = counts.iloc[0,0]
    
    return leaf


def get_potential_splits(data):
    
    X = data.drop(columns='target')
    potential_splits = {}
    columns = X.columns.tolist()
    for column in columns:

        values = X[[column]]
        unique_values = np.unique(values)
        
        potential_splits[column] = unique_values - 1
    
    return potential_splits


def calculate_gini(y):
    
    counts = y.value_counts().to_numpy()
    probabilities = counts / counts.sum()
    gini = np.sum(probabilities*(1-probabilities))
     
    return gini


def calculate_mse(y):
    
    if len(y) == 0:
        mse = 0
    else:
        mse = np.mean((y - np.mean(y)) **2)
    
    return mse


def total_impurity(data_left, data_right, metric_function):\

    n = len(data_left) + len(data_right)
    prop_left = len(data_left) / n
    prop_right = len(data_right) / n

    overall_metric =  (prop_left * metric_function(data_left['target']) 
                     + prop_right * metric_function(data_right['target']))
    
    return overall_metric


def split_data(data, column_types, split_column, split_value):
    
    type_of_feature = column_types[split_column]

    if type_of_feature == "continuous":
        data_left = data[data[split_column] <= split_value]
        data_right = data[data[split_column] >  split_value]
    
    else:
        data_left = data[data[split_column] == split_value]
        data_right = data[data[split_column] != split_value]
    
    return data_left, data_right


def determine_best_split(data, column_types, potential_splits, ml_task):

    best_overall_metric = np.inf
    for column, splits in potential_splits.items():
        for split in splits:
            
            data_left, data_right = split_data(data, column_types, split_column=column, split_value=split)
            
            if ml_task == "regression":
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_mse)
            else:
                node_impurity = total_impurity(data_left, data_right, metric_function=calculate_gini)
            
            if node_impurity <= best_overall_metric:
                best_overall_metric = node_impurity
                best_split_column = column
                best_split_value = split
    
    return best_split_column, best_split_value

# Algorithm

In [28]:
def decision_tree_algorithm(df, column_types, ml_task, min_samples=2, max_depth=5):
    
    leaves = []
    path = 'root'
    datasets = [(df,path)]
    split_conditions = []
    for current_depth in range(max_depth+1):
        next_set = []
        for dataset in datasets:
            data = dataset[0]
            path = dataset[1]
            
            if (len(data.target.unique()) == 1) or (len(data) < min_samples):
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue

            potential_splits = get_potential_splits(data)
            split_column, split_value = determine_best_split(data, column_types, potential_splits, ml_task)
            data_left, data_right = split_data(data, column_types, split_column, split_value)

            if len(data_left) == 0 or len(data_right) == 0:
                leaf = create_leaf(data[['target']], ml_task)
                leaves.append((path,leaf))
                continue
            print(len(data_left),len(data_right))
            split_conditions.append((path,split_column,split_value))
            next_set.append((data_left,path+',l'))
            next_set.append((data_right,path+',r'))

        datasets = next_set

    for dataset in datasets:
        data = dataset[0]
        path = dataset[1]
        leaf = create_leaf(data[['target']], ml_task)
        leaves.append((path,leaf))

    return leaves, split_conditions

# Make predictions with decision tree

def make_predictions(df, column_types, leaves, split_conditions):

    df['path'] = 'root'
    df['value'] = 0
    
    for split_condition in split_conditions:
        path = split_condition[0]
        column = split_condition[1]
        value = split_condition[2]

        if column_types[column] == "continuous":
            df.loc[(df['path']==path)&(df[column]<= value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]> value),'path'] = path+',r'
        else:
            df.loc[(df['path']==path)&(df[column] == value),'path'] = path+',l'
            df.loc[(df['path']==path)&(df[column]!= value),'path'] = path+',r'

    df['prediction'] = df['path'].map(dict(leaves))

    return df


def calculate_accuracy(df, column_types, ml_task, leaves, split_conditions):
    predictions = make_predictions(df, column_types, leaves, split_conditions).prediction
    
    if ml_task == 'regression':    
        predictions_array = predictions.values
        target_array = df.target.values
        metric = np.sqrt(sum((predictions_array - target_array)**2) / len(predictions_array))
        
    else:
        predictions_correct = predictions == df.target
        metric = predictions_correct.mean()
    
    return  metric

# Data Loading

In [67]:
## Read csvs
train_df = pd.read_csv('500_Person_Gender_Height_Weight_Index.csv')

# Filling  NA
train_df = train_df.fillna(0)
train_df = train_df.rename(columns={'Index':'target'})

# categorical variable encoding
labelencoder = LabelEncoder()
train_df['Gender'] = labelencoder.fit_transform(train_df['Gender'])

# train-test split
train, test = train_test_split(train_df, test_size = 0.2, random_state=50)
column_types = {'Gender':'categorical','Height':'continuous','Weight':'continuous'}
ml_task = 'classifier'

# Model Training

In [68]:
leaves, split_conditions = decision_tree_algorithm(train, column_types, ml_task, min_samples=2, max_depth=6)

256 144
118 138
116 28
53 65
52 86
94 22
13 15
20 33
37 28
20 32
45 41
7 15
12 3
7 13
3 30
13 24
3 25
15 5
28 4
9 36
33 8
3 4
9 3
3 4
10 3
19 11
4 9
7 17
19 6
8 7
1 3
4 5
31 5
7 26
3 1
7 2
1 2
1 2
11 8
9 2
7 10
3 16
5 1
4 3
2 2
5 26
2 3
4 3
18 8
2 1
4 3


In [69]:
dict(leaves)

{'root,r,l,l': 5,
 'root,r,r,l': 4,
 'root,r,l,r,r': 5,
 'root,r,r,r,r': 4,
 'root,l,l,l,r,l': 5,
 'root,l,l,r,r,l': 3,
 'root,l,r,l,l,r': 4,
 'root,l,r,l,r,l': 5,
 'root,l,r,r,r,r': 4,
 'root,r,l,r,l,l': 4,
 'root,r,r,r,l,r': 5,
 'root,l,l,l,l,l,r': 3,
 'root,l,l,l,l,r,l': 2,
 'root,l,l,r,l,l,l': 1,
 'root,l,l,r,l,l,r': 0,
 'root,l,l,r,l,r,l': 2,
 'root,l,r,l,l,l,l': 5,
 'root,l,r,l,r,r,l': 4,
 'root,l,r,l,r,r,r': 5,
 'root,l,r,r,l,l,r': 3,
 'root,r,l,r,l,r,r': 4,
 'root,r,r,r,l,l,r': 4,
 'root,l,l,l,l,l,l,l': 3,
 'root,l,l,l,l,l,l,r': 2,
 'root,l,l,l,l,r,r,l': 0,
 'root,l,l,l,l,r,r,r': 2,
 'root,l,l,l,r,r,l,l': 4,
 'root,l,l,l,r,r,l,r': 4,
 'root,l,l,l,r,r,r,l': 3,
 'root,l,l,l,r,r,r,r': 4,
 'root,l,l,r,l,r,r,l': 1,
 'root,l,l,r,l,r,r,r': 1,
 'root,l,l,r,r,r,l,l': 2,
 'root,l,l,r,r,r,l,r': 2,
 'root,l,l,r,r,r,r,l': 3,
 'root,l,l,r,r,r,r,r': 2,
 'root,l,r,l,l,l,r,l': 4,
 'root,l,r,l,l,l,r,r': 5,
 'root,l,r,r,l,l,l,l': 4,
 'root,l,r,r,l,l,l,r': 2,
 'root,l,r,r,l,r,l,l': 4,
 'root,l,r,r

In [70]:
split_conditions

[('root', 'Weight', 122),
 ('root,l', 'Weight', 84),
 ('root,r', 'Height', 185),
 ('root,l,l', 'Height', 166),
 ('root,l,r', 'Height', 164),
 ('root,r,l', 'Height', 178),
 ('root,r,r', 'Weight', 139),
 ('root,l,l,l', 'Weight', 65),
 ('root,l,l,r', 'Weight', 69),
 ('root,l,r,l', 'Weight', 97),
 ('root,l,r,r', 'Height', 181),
 ('root,r,l,r', 'Weight', 139),
 ('root,r,r,r', 'Height', 196),
 ('root,l,l,l,l', 'Height', 152),
 ('root,l,l,l,r', 'Height', 141),
 ('root,l,l,r,l', 'Weight', 55),
 ('root,l,l,r,r', 'Height', 172),
 ('root,l,r,l,l', 'Height', 155),
 ('root,l,r,l,r', 'Height', 160),
 ('root,l,r,r,l', 'Weight', 95),
 ('root,l,r,r,r', 'Weight', 114),
 ('root,r,l,r,l', 'Weight', 129),
 ('root,r,r,r,l', 'Weight', 150),
 ('root,l,l,l,l,l', 'Weight', 59),
 ('root,l,l,l,l,r', 'Height', 162),
 ('root,l,l,l,r,r', 'Height', 160),
 ('root,l,l,r,l,l', 'Height', 180),
 ('root,l,l,r,l,r', 'Height', 178),
 ('root,l,l,r,r,r', 'Weight', 82),
 ('root,l,r,l,l,l', 'Height', 145),
 ('root,l,r,l,r,r', 'W

# Prediction

In [71]:
predict = make_predictions(test, column_types, leaves, split_conditions)
predict

Unnamed: 0,Gender,Height,Weight,target,path,value,prediction
331,1,142,71,4,"root,l,l,l,r,r,l,l",0,4
374,1,174,95,4,"root,l,r,r,l,l,r",0,3
434,1,165,62,2,"root,l,l,l,l,r,r,r",0,2
354,1,190,50,0,"root,l,l,r,l,l,r",0,0
345,0,184,106,4,"root,l,r,r,r,l,r,l",0,3
...,...,...,...,...,...,...,...
115,1,148,60,3,"root,l,l,l,l,l,r",0,3
102,1,161,155,5,"root,r,l,l",0,5
65,0,179,158,5,"root,r,l,r,r",0,5
87,1,145,117,5,"root,l,r,l,r,l",0,5


In [72]:
accuracy = calculate_accuracy(test, column_types, ml_task, leaves, split_conditions)
accuracy

0.81

# Comparison with sklearn

In [75]:
X_train = train[['Height','Weight','Gender']]
y_train = train[['target']]
X_test = test[['Height','Weight','Gender']]
y_test = test[['target']]


In [77]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train, sample_weight=None, check_input=True)
y_pred = clf.predict(X_test)
accuracy_sklearn = accuracy_score(y_test, y_pred)
print('Accuracy with decision tree: {:.2f}\nAccuracy with decision tree in sklearn: {:.2f}'.format(accuracy,accuracy_sklearn))

Accuracy with decision tree: 0.81
Accuracy with decision tree in sklearn: 0.85
