In [2]:
ASSETS_DIR = "/content/drive/MyDrive/Colab Notebooks/assets"

In [251]:
import numpy as np
import pandas as pd

from dataclasses import dataclass, field

from pprint import pprint

from typing import List, Any, Dict

In [423]:
@dataclass
class Node:
  indexes: List[int] = field(default_factory=list)
  childs: List[Node] = field(default_factory=list)
  branch: float = None
  key: str = ""


  def __show(self, indexes, branch, childs, key, r=0, c=0, x=0):
    print(c * '\t', indexes, end=" ")
    print("#" + str(branch) if branch else "", end=" ")
    print("@" + str(key) if key else "", end="\n")

    if len(childs) == 0:
      return
    else:
      c += 1

    for i in childs:
      r += 1
      x = c * r + 1
      self.__show(i.indexes, i.branch, i.childs, i.key, r, c, x)

  def show(self):
    self.__show(self.indexes, self.branch, self.childs, self.key)

In [5]:
def entropy(s):
    p = -s.value_counts() \
          .div(s.count()) \
          .agg(lambda x: x*np.log2(x)) \
          .sum()

    return p

In [149]:
def gain(s, col, target):
    p_target = entropy(s[target])
    keys = s[col].drop_duplicates()
    entropies = keys.apply(lambda _: (_, entropy(s[target][s[col][s[col] == _].index])))
    ratios = s[col].value_counts().div(len(s))
    res = np.subtract(p_target, entropies.apply(lambda _: ratios.loc[_[0]] * _[1]).sum())

    return col, res

In [409]:
def id3(s, *, df=None, target, exclude, verbose=False):
    node = Node()

    # stopping case
    if df is not None and len(df) == 0:
        return

    # store first dataframe
    if df is None:
        df = s.copy()

    node.indexes = list(s.index)

    # remove excluded columns from dataframe
    if len(exclude) > 0:
        s = s.loc[:, ~s.columns.isin(exclude)]

    columns = s.columns.drop(labels=[target])

    gains = columns.map(lambda x: gain(s, x, target)) \
                   .to_frame(index=False) \
                   .set_index(0)

    if verbose:
      print("\nSET:\n\n", df.loc[s.index])
      print("\nGAINS: \n", gains)

    if gains.empty or len(df[target].value_counts()) == 1:
        if verbose:
          print("\n\nLEAF<<<\n\n")
        node.branch = df[target].values[0]
        return node

    next_branch = gains.idxmax().values[0]
    node.branch = next_branch

    if verbose:
      print("\nNEXT BRANCH: ", next_branch)

    next_S = df[next_branch].drop_duplicates() \
                            .apply(lambda _: df.loc[df[next_branch][df[next_branch] == _] \
                                              .index])

    for k, v in enumerate(next_S):
      if verbose:
        print(f"epoch @{k}")
      #print(v[next_branch].values[0])
      n = id3(v, df=df.loc[v.index], target=target, exclude=[*exclude, next_branch])
      n.key = v[next_branch].values[0]
      node.childs.append(n)

    return node


In [424]:
df = pd.read_csv(ASSETS_DIR + "/weather.csv")
target = "play" #@param {type: "string"}
exclude = ["id"] #@param

# 1st epoch
tree = id3(df, exclude=exclude, target=target)
tree.show()

 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] #outlook 
	 [0, 1, 7, 8, 10] #humidity @sunny
		 [0, 1, 7] #no @high
		 [8, 10] #yes @normal
	 [2, 6, 11, 12] #yes @overcast
	 [3, 4, 5, 9, 13] #windy @rainy
		 [3, 4, 9] #yes 
		 [5, 13] #no @True


In [425]:
df = pd.read_csv(ASSETS_DIR + "/job.csv")
target = "risk" #@param {type: "string"}
exclude = ["customer"] #@param

# 1st epoch
tree = id3(df, exclude=exclude, target=target, verbose=False)
tree.show()

 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] #debt 
	 [0, 1, 2] #Bad @High
	 [3, 4, 5, 6, 7, 8, 9] #revenue @Low
		 [3, 4, 7, 8] #status @Low
			 [3, 7] #Good @Employee
			 [4, 8] #Bad @Employeer
		 [5, 6, 9] #Good @High


In [427]:
df = pd.read_csv(ASSETS_DIR + "/monks.csv")
target = "has_tie" #@param {type: "string"}
exclude = [] #@param

# 1st epoch
tree = id3(df, exclude=exclude, target=target)
tree.show()

 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221

In [426]:
df = pd.read_csv(ASSETS_DIR + "/tennis.csv")
target = "play" #@param {type: "string"}
exclude = [] #@param

# 1st epoch
tree = id3(df, exclude=exclude, target=target)
tree.show()

 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] #outlook 
	 [0, 1, 7, 8, 10] #humidity @sunny
		 [0, 1, 7] #no @high
		 [8, 10] #yes @normal
	 [2, 6, 11, 12] #yes @overcast
	 [3, 4, 5, 9, 13] #windy @rainy
		 [3, 4, 9] #yes @weak
		 [5, 13] #no @strong
