In [None]:
import numpy as np
import pandas as pd
import graphviz
import pydotplus
from IPython.display import Image

In [None]:
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Any sequence ``np.unique`` accepts (list, ndarray, DataFrame column).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An empty or single-valued input yields ``0.0``.
    """
    _, counts = np.unique(target_col, return_counts=True)
    # Normalise once for all classes instead of re-summing the counts per term.
    probabilities = counts / counts.sum()
    # Negating an all-zero sum gives IEEE negative zero (printed as "-0.0");
    # adding 0.0 turns it into a plain 0.0 without affecting any other value.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

In [None]:
# Load the dataset here: this cell and the information-gain cell below use
# `data` before the later cell that reads it to build the tree.
data = pd.read_csv('data.csv')

# Entropy of each attribute column (one line per attribute).
for column in ['Temperature', 'Wind', 'Humidity']:
    print(entropy(data[column]))


1.584962500721156


In [None]:
def infoGain(data, split_attribute_name, target_name="class"):
    """Return the information gain from splitting ``data`` on one attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the split column and the target column.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_name : str, default "class"
        Column whose entropy is being reduced.

    Returns
    -------
    float
        Entropy of the target column minus the size-weighted average entropy
        of the target within each partition.
    """
    total_entropy = entropy(data[target_name])

    split_col = data[split_attribute_name]
    vals, counts = np.unique(split_col, return_counts=True)
    weights = counts / counts.sum()

    # Pick each partition's target values with a boolean mask. Masking with
    # where() and then dropna() would also discard rows that merely have a
    # missing value in some unrelated column, so the partition would no longer
    # match the weight computed from `counts`.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[split_col == val, target_name])
        for val, weight in zip(vals, weights)
    ])
    return total_entropy - weighted_entropy

In [None]:
# Each attribute is passed as both the split column and the target, so every
# partition holds a single target value and the printed gain equals that
# column's own entropy.
for attribute in ('Temperature', 'Wind', 'Humidity'):
    print(infoGain(data, attribute, attribute))

1.584962500721156
0.9182958340544896
1.0


In [None]:
def _majority_class(target_col):
    """Return the most frequent value in ``target_col`` (first in sorted order on ties)."""
    classes, counts = np.unique(target_col, return_counts=True)
    return classes[np.argmax(counts)]


def split(data, originaldata, features, target_attribute_name="class", parent_node_class=None):
    """Recursively build a decision tree as nested dicts.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reach the current node.
    originaldata : pandas.DataFrame
        The full training table; its majority class labels an empty partition.
    features : list
        Attribute columns still available for splitting. Not modified.
    target_attribute_name : str, default "class"
        Name of the label column.
    parent_node_class : optional
        Majority class of the parent node, returned when no features remain.

    Returns
    -------
    dict or label
        ``{best_feature: {value: subtree_or_label, ...}}`` for an internal
        node, otherwise a single class label.
    """
    # This has to come before the purity check: an empty partition has zero
    # distinct classes, which also satisfies "<= 1" but has no element [0].
    if len(data) == 0:
        return _majority_class(originaldata[target_attribute_name])

    classes = np.unique(data[target_attribute_name])
    if len(classes) <= 1:
        return classes[0]

    if len(features) == 0:
        return parent_node_class

    # From here on this node's own majority class is what children fall back to.
    parent_node_class = _majority_class(data[target_attribute_name])

    gains = [infoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [feature for feature in features if feature != best_feature]

    branches = {}
    for value in np.unique(data[best_feature]):
        # Boolean indexing keeps exactly the rows with this value; where() +
        # dropna() would also drop rows with missing values in other columns.
        sub_data = data[data[best_feature] == value]
        branches[value] = split(
            sub_data, originaldata, remaining_features, target_attribute_name, parent_node_class
        )
    return {best_feature: branches}

In [None]:
def predict(query, tree, default=1):
    """Classify one query by walking a nested-dict decision tree.

    Parameters
    ----------
    query : mapping
        Attribute name -> value; anything with ``keys()`` and ``[]`` lookup
        (a dict or a DataFrame row).
    tree : dict or label
        ``{feature: {value: subtree_or_label, ...}}``; a non-dict value is
        treated as an already-decided label.
    default : optional
        Returned whenever the tree cannot be followed for this query.

    Returns
    -------
    The label found at the leaf, or ``default``.
    """
    # The tree builder returns a bare label when the training data has a
    # single class; there is nothing to walk in that case.
    if not isinstance(tree, dict):
        return tree

    for key in query.keys():
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # The attribute value has no branch (or cannot be used as a dict key).
            return default
        if isinstance(result, dict):
            # Pass `default` down so a miss deeper in the tree returns the
            # caller's fallback rather than the signature default.
            return predict(query, result, default)
        return result

    # None of the query's attributes appears at this node.
    return default


In [None]:
# Read the training table; the final column is used as the label and every
# other column as an attribute.
data = pd.read_csv('data.csv')
column_names = data.columns
target = column_names[-1]
features = column_names[:-1].tolist()

# Grow the decision tree from the full table.
tree = split(data, data, features, target)

# Test data
query_rows = [
    ['Hot', 'High', 'Weak'],
    ['Mild', 'Normal', 'Strong'],
    ['Cool', 'Normal', 'Weak'],
]
queries = pd.DataFrame(query_rows, columns=['Temperature', 'Humidity', 'Wind'])

# Classify every query row; 1 is the fallback passed to predict as its default.
predictions = [predict(query, tree, 1) for _, query in queries.iterrows()]
print(predictions)


def print_tree(tree, dot_object=None, parent_node=None, edge_label=''):
    """Draw a nested-dict decision tree into a Graphviz graph and return the graph.

    Parameters
    ----------
    tree : dict or label
        ``{feature: {value: subtree_or_label, ...}}``; anything that is not a
        dict is drawn as a leaf labelled ``str(tree)``.
    dot_object : graphviz.Digraph, optional
        Graph to draw into. A new ``graphviz.Digraph`` is created when omitted.
    parent_node : str, optional
        Name of the node this subtree hangs from; ``None`` for the root.
    edge_label : str, default ''
        Label of the edge from ``parent_node`` to this subtree.

    Returns
    -------
    The graph that was drawn into (the new one if none was passed).
    """
    if dot_object is None:
        dot_object = graphviz.Digraph()

    def add_node(label):
        # Node names must be unique: naming nodes after their label would merge
        # every leaf with the same class (and every repeated feature) into one
        # node. The body grows with each statement added, so its current length
        # has not been used as a name before.
        name = 'n{}'.format(len(dot_object.body))
        dot_object.node(name, label=str(label))
        if parent_node is not None:
            dot_object.edge(parent_node, name, label=edge_label)
        return name

    if not isinstance(tree, dict):
        add_node(tree)
        return dot_object

    for feature, branches in tree.items():
        feature_node = add_node(feature)
        if isinstance(branches, dict):
            for value, subtree in branches.items():
                print_tree(subtree, dot_object, feature_node, str(value))
        else:
            # A feature mapped straight to a label: attach it with no edge text.
            print_tree(branches, dot_object, feature_node)
    return dot_object

# Build the Graphviz graph object for the tree grown above.
dot_object = print_tree(tree)
# Render the graph and open it in a viewer; the value this returns is shown as
# the cell output (the rendered file's name in the recorded run).
dot_object.view()


['No', 'No', 'Yes']


'Digraph.gv.pdf'