In [38]:
import pandas as pd

# Toy weather dataset: four categorical attributes plus the class label
# `play`, one list entry per observation (14 observations in total).
# Bound to a descriptive name instead of `__`, which IPython uses for its
# cached-output history and should not be shadowed by user data.
weather_records = {
    "outlook": [
        "sunny", "sunny", "overcast", "rainy", "rainy", "rainy", "overcast",
        "sunny", "sunny", "rainy", "sunny", "overcast", "overcast", "rainy"
    ],
    "temperature": [
        "hot", "hot", "hot", "mild", "cool", "cool", "cool",
        "mild", "cool", "mild", "mild", "mild", "hot", "mild"
    ],
    "humidity": [
        "high", "high", "high", "high", "normal", "normal", "normal",
        "high", "normal", "normal", "normal", "high", "high", "high"
    ],
    "wind": [
        "weak", "strong", "weak", "weak", "weak", "strong", "strong",
        "weak", "weak", "weak", "strong", "strong", "weak", "strong"
    ],
    "play": [
        "no", "no", "yes", "yes", "yes", "no", "yes",
        "no", "yes", "yes", "yes", "yes", "yes", "no"
    ]
}

df = pd.DataFrame(weather_records)
# The class label is taken to be the last column of the frame.
class_att = df[df.columns[-1]]
# Distinct class labels, in the order they first appear in the column.
classes: list[str] = class_att.unique().tolist()

# Last expression of the cell: display the class column.
class_att

0      no
1      no
2     yes
3     yes
4     yes
5      no
6     yes
7      no
8     yes
9     yes
10    yes
11    yes
12    yes
13     no
Name: play, dtype: object

In [39]:
def data_as_dict(df: pd.DataFrame):
    """Tabulate class counts for every value of every attribute.

    The last column of ``df`` is treated as the class label; all other
    columns are attributes.

    Returns a nested mapping ``{attribute: {value: counts}}`` where
    ``counts[i]`` is the number of rows having that attribute value and
    the i-th class label. Class labels are indexed in the order they
    first appear in the class column.
    """
    label_col = df.columns[-1]
    labels = df[label_col]
    label_order: list[str] = labels.unique().tolist()
    n_labels = len(label_order)
    features = df.drop(columns=label_col)

    table: dict[str, dict[str, list[int]]] = {}
    for column in features.columns:
        tally: dict[str, list[int]] = {}
        table[column] = tally
        for value, label in zip(features[column], labels):
            # First sighting of a value starts it with an all-zero count row.
            row = tally.setdefault(value, [0] * n_labels)
            row[label_order.index(label)] += 1
    return table

# Display the per-attribute class-count table built from the full dataset.
data_as_dict(df)

{'outlook': {'sunny': [3, 2], 'overcast': [0, 4], 'rainy': [2, 3]},
 'temperature': {'hot': [2, 2], 'mild': [2, 4], 'cool': [1, 3]},
 'humidity': {'high': [4, 4], 'normal': [1, 5]},
 'wind': {'weak': [2, 6], 'strong': [3, 3]}}

In [40]:
def gini_split(att: dict[str, list[int]], total: int):
    """Weighted Gini impurity of splitting on a single attribute.

    ``att`` maps each attribute value to its per-class row counts and
    ``total`` is the number of rows being split. Each value's impurity
    (1 minus the sum of squared class proportions) is weighted by the
    share of rows that fall under that value. Values with no rows are
    ignored.

    The result is printed and then returned.
    """
    weighted_impurity = 0.0

    for class_counts in att.values():
        branch_size = sum(class_counts)
        if branch_size != 0:
            impurity = 1.0
            for count in class_counts:
                proportion = count / branch_size
                impurity -= proportion * proportion

            weighted_impurity += (branch_size * 1.0 / total) * impurity
    print(weighted_impurity)
    return weighted_impurity

def best_split(df: pd.DataFrame):
    """Find the attribute whose split has the lowest weighted Gini impurity.

    Builds the class-count table for ``df`` with ``data_as_dict`` and
    scores every attribute with ``gini_split`` against the row count of
    ``df``. Returns ``(attribute, gini)``; on ties the attribute seen
    first wins. If there are no attributes the result is
    ``(None, float("inf"))``.
    """
    counts_by_att = data_as_dict(df)
    n_rows = len(df)
    winner = (None, float("inf"))

    for name, value_counts in counts_by_att.items():
        candidate = gini_split(value_counts, n_rows)
        # Strict comparison keeps the earliest attribute when scores tie.
        if candidate < winner[1]:
            winner = (name, candidate)
    return winner

In [41]:
# Choose the root split: the attribute whose split has the lowest weighted Gini impurity.
root_att, root_gini = best_split(df)
# Bare tuple as the cell's last expression so the notebook displays the result.
root_att, root_gini

0.34285714285714286
0.44047619047619047
0.4047619047619047
0.42857142857142855


('outlook', 0.34285714285714286)