In [8]:
import pprint

import numpy as np
import pandas as pd

In [9]:
# Toy dataset: 13 people, three categorical attributes and the label column
# 'sex'. Each column is written as a whitespace-separated string and split
# into a list, so position i across all four lists describes person i.
data = {
    'Over170cm': 'N Y N N Y N Y Y Y N Y Y N'.split(),
    'Eyecolor': 'Blue Brown Blue Blue Brown Blue Brown Blue Red Red Red Red Blue'.split(),
    'Hairlength': 'short long long short short long short long short short long short long'.split(),
    'sex': 'm f f f m f f m m m m f f'.split(),
}
df = pd.DataFrame(data)

In [10]:
def entropy(target_col):
    """Return the Shannon entropy (in bits) of the values in ``target_col``.

    Args:
        target_col: Array-like of labels; anything ``np.unique`` accepts.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value.
    """
    # Only the per-value counts matter; the distinct values themselves do not.
    _, occurrences = np.unique(target_col, return_counts=True)
    probabilities = occurrences / np.sum(occurrences)
    return -np.sum(probabilities * np.log2(probabilities))

In [11]:
def information_gain(data, split_attribute_name, target_name):
    """Return the information gain from splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropies inside each subset that shares one value of the
    split attribute. Each weight is that subset's share of the rows.

    Args:
        data: DataFrame holding both the split attribute and the target.
        split_attribute_name: Column to split on.
        target_name: Column holding the class labels.

    Returns:
        ``entropy(target) - sum(weight_v * entropy(target | attribute == v))``.
    """
    total_entropy = entropy(data[target_name])

    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    weights = counts / np.sum(counts)

    # Select the target values with a boolean mask. The earlier
    # ``data.where(mask).dropna()`` form also discarded rows that had a NaN in
    # any *other* column, so the subset entropy could be computed on fewer
    # rows than the weight (taken from ``counts``) assumed.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[data[split_attribute_name] == value, target_name])
        for weight, value in zip(weights, vals)
    ])

    info_gain = total_entropy - weighted_entropy
    return info_gain

In [12]:
def find_best_split(data, target_name, features):
    """Return the feature with the largest information gain on ``data``.

    Args:
        data: DataFrame containing the candidate features and the target.
        target_name: Column holding the class labels.
        features: Iterable of candidate column names.

    Returns:
        The candidate whose ``information_gain`` is highest. Ties go to the
        candidate that appears first in ``features``.

    Raises:
        ValueError: If ``features`` is empty (raised by ``max``).
    """
    def gain_of(candidate):
        return information_gain(data, candidate, target_name)

    # max() keeps the first maximal element, so earlier features win ties.
    return max(features, key=gain_of)

In [13]:
def build_tree(data, target_name, features, depth=0, max_depth=None):
    """Recursively build a decision tree as nested dicts.

    Each internal node is ``{feature: {value: subtree_or_label, ...}}`` and
    each leaf is a single class label.

    Args:
        data: DataFrame containing the feature columns and the target column.
        target_name: Column holding the class labels.
        features: Candidate feature columns still available for splitting.
        depth: Depth of the node being built (0 for the root).
        max_depth: Maximum depth at which splitting is still allowed, or
            ``None`` for no limit.

    Returns:
        A nested dict for an internal node, or a class label for a leaf.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    labels, counts = np.unique(data[target_name], return_counts=True)
    # Leaves predict the most frequent label. argmax returns the first maximum
    # and np.unique sorts its output, so ties resolve to the smallest label.
    majority_label = labels[np.argmax(counts)]

    candidates = list(features)
    depth_reached = max_depth is not None and depth >= max_depth
    # Stop on a pure node, at the depth cap, or when no feature is left.
    # Without the last check, rows with identical features but different
    # labels would recurse forever when max_depth is None.
    if depth_reached or len(labels) == 1 or not candidates:
        return majority_label

    best_split = find_best_split(data, target_name, candidates)
    # A feature used on this path has a single value in every child subset,
    # so it is removed from the candidates passed down.
    unused = [name for name in candidates if name != best_split]

    tree = {best_split: {}}
    for value in np.unique(data[best_split]):
        # Boolean-mask selection keeps rows that have NaN in unrelated
        # columns, which where().dropna() would silently discard.
        sub_data = data[data[best_split] == value]
        tree[best_split][value] = build_tree(sub_data, target_name, unused, depth + 1, max_depth)
    return tree

In [14]:
# Fit a depth-limited tree that predicts 'sex' from the three categorical
# attributes, then pretty-print the resulting nested dict.
target_name = 'sex'
features = ['Over170cm', 'Eyecolor', 'Hairlength']
max_depth = 3
decision_tree = build_tree(df, target_name, features, max_depth=max_depth)

# pprint is imported in the notebook's top import cell.
pprint.pprint(decision_tree)

{'Eyecolor': {'Blue': {'Over170cm': {'N': {'Hairlength': {'long': 'f',
                                                          'short': 'f'}},
                                     'Y': 'm'}},
              'Brown': {'Hairlength': {'long': 'f',
                                       'short': {'Over170cm': {'Y': 'f'}}}},
              'Red': {'Over170cm': {'N': 'm',
                                    'Y': {'Hairlength': {'long': 'm',
                                                         'short': 'f'}}}}}}
