In [2]:
# Folder that holds the CSV datasets read by the cells below
# (ASSETS_DIR + "/weather.csv", ASSETS_DIR + "/job.csv").
# NOTE(review): this looks like a Google Drive path as mounted inside Colab;
# confirm the drive is mounted before running the notebook.
ASSETS_DIR = "/content/drive/MyDrive/Colab Notebooks/assets"

In [6]:
import numpy as np
import pandas as pd

In [5]:
def entropy(s):
    """Return the Shannon entropy, in bits, of the values held in a Series.

    Parameters
    ----------
    s : pandas.Series
        Observations of a categorical variable. Missing values are ignored:
        both ``value_counts()`` and ``count()`` skip them.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value. The result is ``0.0`` when all observations share a
        single value.
    """
    probabilities = s.value_counts() / s.count()
    weighted_logs = probabilities * np.log2(probabilities)

    # Negating an all-zero sum yields IEEE negative zero, which used to be
    # printed as "-0.0" for pure subsets; adding 0.0 normalises it to +0.0
    # without altering any other value.
    return -weighted_logs.sum() + 0.0

In [7]:
def gain(s, col, target):
    """Compute the information gain obtained by splitting ``s`` on ``col``.

    The gain is the entropy of ``target`` over all rows minus the weighted
    entropy of ``target`` inside each group of rows sharing a value of
    ``col``; each group is weighted by its share of ``len(s)``.

    Parameters
    ----------
    s : pandas.DataFrame
        Frame containing both ``col`` and ``target``.
    col : hashable
        Label of the candidate split column.
    target : hashable
        Label of the class column.

    Returns
    -------
    tuple
        ``(col, gain)``. The column label is returned alongside the value so
        callers can collect the pairs into a table.

    Notes
    -----
    Rows whose ``col`` value is missing form no group, so they contribute
    nothing to the conditional entropy (they previously raised ``KeyError``).
    """
    total_entropy = entropy(s[target])
    n_rows = len(s)

    # groupby partitions rows by position, so the result is correct even when
    # the frame's index contains duplicate labels. sort=False keeps groups in
    # first-appearance order, matching the order of drop_duplicates().
    conditional_entropy = sum(
        len(subset) / n_rows * entropy(subset)
        for _, subset in s.groupby(col, sort=False)[target]
    )

    return col, total_entropy - conditional_entropy

In [121]:
def id3(s, *, df=None, target, exclude, verbose=True):
    """Walk an ID3 decision tree over ``s`` recursively, tracing every step.

    At each node the information gain of every remaining attribute is
    computed with ``gain``; the attribute with the highest gain is chosen, the
    rows are partitioned by its distinct values and the procedure recurses
    into each partition. A node is a leaf when no attribute is left or when no
    attribute yields a (numerically) non-zero gain.

    The function returns nothing: its only observable effect is the trace
    printed when ``verbose`` is true.

    Parameters
    ----------
    s : pandas.DataFrame
        Rows of the current node.
    df : pandas.DataFrame, optional
        Rows of the current node with all their columns; child partitions are
        sliced from this frame so the printed rows still show excluded
        columns. Leave as ``None`` on the initial call, in which case ``s``
        itself is used.
    target : hashable
        Label of the class column.
    exclude : list
        Column labels that must not be used as split candidates. The chosen
        split attribute is appended to this list for the recursive calls.
    verbose : bool, default True
        When true, print the gains table, the chosen attribute and each
        partition, and wait for Enter (``input``) before descending into a
        partition. When false, run silently and without pausing.

    Returns
    -------
    None
    """
    # Gains at or below this magnitude are floating-point residue, not signal.
    zero_gain_tol = 1e-12

    # stopping case: a recursive call handed over an empty partition
    if df is not None and len(df) == 0:
        return

    # initial call: keep a handle on the rows before columns are excluded
    if df is None:
        df = s

    # remove excluded columns from the frame used to evaluate gains
    if len(exclude) > 0:
        s = s.loc[:, ~s.columns.isin(exclude)]

    columns = s.columns.drop(labels=[target])

    # One (attribute, gain) row per candidate. The integer column labels 0 and
    # 1 are kept so the printed table looks the same as before.
    gains = pd.DataFrame(
        [gain(s, column, target) for column in columns],
        columns=[0, 1],
    ).set_index(0)

    if verbose:
        print("\nGAINS: ", gains)

    # Leaf: nothing left to split on, or no attribute is informative. A
    # tolerance is used instead of an exact comparison with 0 because weighted
    # entropies can differ from the parent entropy by rounding error only.
    no_gain = gains.empty or bool(
        np.isclose(gains[1], 0.0, rtol=0.0, atol=zero_gain_tol).all()
    )
    if no_gain:
        if verbose:
            print("leaf<<<")
        return

    next_branch = gains[1].idxmax()

    if verbose:
        print("\nNEXT BRANCH: ", next_branch)

    # One partition per distinct value of the chosen attribute, in order of
    # first appearance. Boolean masks select by position, so duplicate index
    # labels cannot duplicate rows.
    branch_values = df[next_branch].drop_duplicates()
    partitions = [df[df[next_branch] == value] for value in branch_values]

    for k, v in enumerate(partitions):
        if verbose:
            print(f"epoch @{k}")
            print(v)
            # interactive pause so each step can be inspected before descending
            input(">>>")
        id3(
            v,
            df=v,
            target=target,
            exclude=[*exclude, next_branch],
            verbose=verbose,
        )


In [111]:
# Load the weather dataset from the assets folder.
df = pd.read_csv(ASSETS_DIR + "/weather.csv")
# Class column to predict; the trailing `#@param` marks it as a Colab form field.
target = "play" #@param {type: "string"}
# Columns that must never be used as split attributes (here the row identifier).
exclude = ["id"] #@param

# Start the ID3 recursion on the full dataset; id3 prints its trace and
# waits for Enter at every ">>>" prompt.
id3(df, exclude=exclude, target=target)


GAINS:                      1
0                    
outlook      0.246750
temperature  0.029223
humidity     0.151836
windy        0.048127

NEXT BRANCH:  outlook
epoch @0
    id outlook temperature humidity  windy play
0    1   sunny         hot     high  False   no
1    2   sunny         hot     high   True   no
7    8   sunny        mild     high  False   no
8    9   sunny        cool   normal  False  yes
10  11   sunny        mild   normal   True  yes
>>>

GAINS:                      1
0                    
temperature  0.570951
humidity     0.970951
windy        0.019973

NEXT BRANCH:  humidity
epoch @0
   id outlook temperature humidity  windy play
0   1   sunny         hot     high  False   no
1   2   sunny         hot     high   True   no
7   8   sunny        mild     high  False   no
>>>

GAINS:                 1
0               
temperature -0.0
windy       -0.0
leaf<<<
epoch @1
    id outlook temperature humidity  windy play
8    9   sunny        cool   normal  False  yes
1

In [123]:
# Load the job/credit-risk dataset from the assets folder.
# NOTE: this rebinds the notebook-level `df`, `target` and `exclude` names
# that the weather cell also assigns.
df = pd.read_csv(ASSETS_DIR + "/job.csv")
# Class column to predict; the trailing `#@param` marks it as a Colab form field.
target = "risk" #@param {type: "string"}
# Columns that must never be used as split attributes (here the customer number).
exclude = ["customer"] #@param

# Start the ID3 recursion on the full dataset; id3 prints its trace and
# waits for Enter at every ">>>" prompt.
id3(df, exclude=exclude, target=target)


GAINS:                  1
0                
debt     0.395816
revenue  0.029049
status   0.029049

NEXT BRANCH:  debt
epoch @0
   customer  debt revenue     status risk
0         1  High    High  Employeer  Bad
1         2  High    High   Employee  Bad
2         3  High     Low   Employee  Bad
>>>

GAINS:             1
0           
revenue -0.0
status  -0.0
leaf<<<
epoch @1
   customer debt revenue     status  risk
3         4  Low     Low   Employee  Good
4         5  Low     Low  Employeer   Bad
5         6  Low    High  Employeer  Good
6         7  Low    High   Employee  Good
7         8  Low     Low   Employee  Good
8         9  Low     Low  Employeer   Bad
9        10  Low    High  Employeer  Good
>>>

GAINS:                  1
0                
revenue  0.291692
status   0.291692

NEXT BRANCH:  revenue
epoch @0
   customer debt revenue     status  risk
3         4  Low     Low   Employee  Good
4         5  Low     Low  Employeer   Bad
7         8  Low     Low   Employee  Good
8