In [88]:
import numpy as np
import pandas as pd
import json

In [89]:
# Initialize the dataset: 14 weather records stored column-wise.
# 'PlayTennis' is the last column, which the later cells use as the class label.
data = {
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
        'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Wind': [
        'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
        'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
    ],
    'PlayTennis': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}

In [90]:
# Build the DataFrame from the column-wise records and display it
df = pd.DataFrame(data=data)
df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [91]:
# Use the first ten records as the training subset, group them by the
# class label (last column) and show the records of the first label group
train = df.iloc[:10]
nrow, ncol = train.shape
colNames = train.columns
group = train.groupby(colNames[-1])
rowNames = group.first().index
no = group.get_group(rowNames[0])
no

Unnamed: 0,Outlook,Temperature,Humidity,Wind
0,Sunny,Hot,High,Weak
1,Sunny,Hot,High,Strong
5,Rain,Cool,Normal,Strong
7,Sunny,Mild,High,Weak


In [92]:
# Count the training records whose PlayTennis label is 'Yes'
isYes = train['PlayTennis'] == 'Yes'
nyes = isYes.sum()
nyes

6

In [93]:
# Keep the records from position 10 onwards as the test subset
# and display the record labelled 10
test = df.iloc[10:15]
test.loc[10]

Outlook         Sunny
Temperature      Mild
Humidity       Normal
Wind           Strong
PlayTennis        Yes
Name: 10, dtype: object

In [110]:
# Tree node used to build and export the decision tree
class Node:
    """A single node of the decision tree.

    Attributes (all start empty):
        childs: list of child nodes.
        value:  leaf result; a node whose value is not '' is exported as a leaf.
        name:   label used as the dictionary key when the node is exported
                as an inner node.
        by:     label under which the parent lists this node when exporting.
    """

    def __init__(self):
        self.childs = []
        self.value = ''
        self.name = ''
        self.by = ''

    def addChild(self, node):
        """Append ``node`` to this node's list of children."""
        self.childs.append(node)

    def toDict(root):
        """Return the subtree rooted at ``root`` as plain Python data.

        A node with a non-empty ``value`` becomes that value; any other node
        becomes ``{name: [{child.by: <child as dict>}, ...]}``.
        Can be called as ``Node.toDict(node)`` or ``node.toDict()``.
        """
        if root.value == '':
            branches = [{kid.by: Node.toDict(kid)} for kid in root.childs]
            return {root.name: branches}
        return root.value

    def toJSON(root):
        """Return the subtree rooted at ``root`` as a JSON string (indent 2)."""
        asDict = Node.toDict(root)
        return json.dumps(asDict, indent=2)

In [111]:
# Entropy of a dataset with respect to its last column
def entropy1(data):
    """Return the entropy (base 2) of the values in the last column of ``data``.

    ``data`` is a DataFrame; the distinct values of its last column are the
    classes. Each class contributes ``p * log2(p)`` with
    ``p = class size / number of rows``, and the negated sum is returned.
    """
    total, width = data.shape
    classSizes = data.groupby(data.columns[width - 1]).size()
    acc = 0

    for size in classSizes:
        frac = size / total
        acc += frac * np.log2(frac)

    return -acc

# Entropy of a dataset after partitioning it by one column
def entropy2(data, colName):
    """Return the size-weighted entropy of ``data`` split by ``colName``.

    ``data`` is grouped by the values of ``colName``; each group contributes
    ``entropy1(group)`` weighted by the fraction of rows that fall in it.
    """
    total = len(data)
    weighted = 0

    for _, part in data.groupby(colName):
        share = len(part) / total
        weighted += share * entropy1(part)

    return weighted
    
# Calculate the information gains of all columns in a dataset
def infoGains(data):
    """Return the information gain of every column except the last one.

    The gain of a column is ``entropy1(data) - entropy2(data, column)``.
    Gains are returned as a list in column order; the last column (the one
    ``entropy1`` measures) is skipped.
    """
    target = data.columns[-1]
    baseline = entropy1(data)
    return [baseline - entropy2(data, col)
            for col in data.columns
            if col != target]

In [112]:
# Information gain of each non-label column of the training subset, in column order
infoGains(train)

[0.3219280948873623,
 0.09546184423832171,
 0.12451124978365313,
 0.0912774462416801]

In [113]:
# Create a decision tree
def decisionTree(data, root):
    """Grow a decision tree for ``data`` below ``root`` and return ``root``.

    The last column of ``data`` is the class label; every other column is an
    attribute. The attribute with the largest information gain (the first one
    wins ties) is chosen as the split. The split column is removed from the
    subsets passed to the children, so each attribute is used at most once
    per path and the recursion terminates even when records with identical
    attributes carry different labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Records to learn from, class label in the last column.
    root : Node
        Node to fill in. A leaf gets ``value``; an inner node gets ``name``
        (the split column) and one child per distinct value of that column,
        each child's ``by`` holding that value.

    Returns
    -------
    Node
        The same ``root`` object, modified in place.
    """
    nrow, ncol = data.shape

    # If the dataset is empty, return 'Unclassified'.
    if nrow == 0 or ncol == 0:
        root.value = 'Unclassified'
        return root

    # Select the label column by position so the result is always a Series.
    labels = data.iloc[:, -1]

    # If only the label column is left, return the label that occurs most often.
    if ncol == 1:
        bestCount = 0
        majority = ''

        for val in pd.unique(labels):
            count = (labels == val).sum()
            # Strict comparison: the label seen first wins a tie.
            if count > bestCount:
                bestCount = count
                majority = val

        root.value = majority

    # If the dataset is pure, return the value.
    elif entropy1(data) == 0:
        root.value = labels.iloc[0]

    else:
        gains = infoGains(data)
        # list.index returns the first position of the maximum, so ties go
        # to the left-most column.
        best = gains.index(max(gains))
        root.name = data.columns[best]

        for key, subset in data.groupby(root.name):
            # Drop the column just used so it cannot be chosen again further
            # down; errors='ignore' tolerates subsets that already lack it.
            remaining = subset.drop(columns=root.name, errors='ignore')
            child = decisionTree(remaining, Node())
            child.by = key
            root.addChild(child)

    return root

In [114]:
# Build the decision tree for the full dataset; decisionTree returns the node it was given
root = decisionTree(df, Node())

In [115]:
# Print the tree as an indented outline, whatever its shape
def printTree(node, depth=1):
    """Print every branch below ``node``, one tab of indent per tree level.

    Each line reads ``-- <branch value> --> <leaf value or split column>``.
    A blank line separates the branches directly below the starting node.
    """
    for position, child in enumerate(node.childs):
        if depth == 1 and position > 0:
            print()
        # A leaf carries its result in ``value``; an inner node carries the
        # column it splits on in ``name``.
        label = child.value if child.value != '' else child.name
        print('\t' * depth + '--', child.by, '-->', label)
        printTree(child, depth + 1)

# If the root is itself a leaf, show its value instead of an empty name
print(root.name if root.value == '' else root.value)
printTree(root)

Outlook
	-- Overcast --> Yes

	-- Rain --> Wind
		-- Strong --> No
		-- Weak --> Yes

	-- Sunny --> Humidity
		-- High --> No
		-- Normal --> Yes


In [116]:
# Serialize the whole tree to JSON and show it
jsdata = Node.toJSON(root)
print(jsdata)


{
  "Outlook": [
    {
      "Overcast": "Yes"
    },
    {
      "Rain": {
        "Wind": [
          {
            "Strong": "No"
          },
          {
            "Weak": "Yes"
          }
        ]
      }
    },
    {
      "Sunny": {
        "Humidity": [
          {
            "High": "No"
          },
          {
            "Normal": "Yes"
          }
        ]
      }
    }
  ]
}
