In [96]:
import numpy as np
import pandas as pd

In [97]:
data = {
    'Stream': ['false', 'true', 'true', 'false', 'false', 'true', 'true'],
    'Slope': ['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
    'Elevation': ['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
    'Vegetation': ['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal']
}

df = pd.DataFrame(data)

In [98]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X['Stream']

0    false
1     true
2     true
3    false
4    false
5     true
6     true
Name: Stream, dtype: object

In [107]:
data = {
    "Elevation": [300, 1200, 1500, 3000, 3900, 4450, 5000],
    'Vegetation': ['riparian', 'chapparal', 'riparian', 'chapparal', 'chapparal', 'conifer', 'conifer']
}

df = pd.DataFrame(data)

d ="Elevation"
df.sort_values(by=[d])
print(df)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

def entropy(X, y):
    """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
    proba = lambda t: len(X.loc[y==t]) / len(X)
    return -sum([proba(t) * np.log2(proba(t)) for t in y.unique()])

def rem(X, y, d):
    weight = lambda t: len(X.loc[X[d]==t]) / len(X)
    return sum([weight(t) * entropy(X.loc[X[d]==t], y.loc[X[d]==t]) for t in X[d].unique()])

def information_gain(X, y, d):
    print(f"{d} = {entropy(X, y):.3f} - {rem(X, y, d):.3f} = {entropy(X, y) - rem(X, y, d):.3f}") 
    return entropy(X, y) - rem(X, y, d)

thresholds = []
prev_target = y.values[0]
for i, target in enumerate(y.values):
    if target != prev_target:
        print(X[d].iloc[i], X[d].iloc[i-1])
        thresholds.append(np.average([X[d].iloc[i], X[d].iloc[i-1]]))
    prev_target = target

print(thresholds)

gains = []
for t in thresholds:
    binned_feature = pd.cut(X[d], bins=[0, t, X[d].max()], labels=['A', 'B'])
    D = X.copy()
    D[d] = binned_feature.values
    gains.append(information_gain(D, y, d))

best_split = thresholds[np.argmax(gains)]
best_split

D = X.copy()
binned_feature = pd.cut(X[d], bins=[0, best_split, X[d].max()], labels=['A', 'B'])
D[d] = binned_feature.values
D

   Elevation Vegetation
0        300   riparian
1       1200  chapparal
2       1500   riparian
3       3000  chapparal
4       3900  chapparal
5       4450    conifer
6       5000    conifer
1200 300
1500 1200
3000 1500
4450 3900
[750.0, 1350.0, 2250.0, 4175.0]
Elevation = 1.557 - 1.251 = 0.306
Elevation = 1.557 - 1.373 = 0.184
Elevation = 1.557 - 0.965 = 0.592
Elevation = 1.557 - 0.694 = 0.863


Unnamed: 0,Elevation
0,A
1,A
2,A
3,A
4,A
5,B
6,B


In [None]:
{k: X[k].unique() for k in X.columns}

{'Elevation': array([ 300, 1200, 1500, 3000, 3900, 4450, 5000])}

In [None]:
# Base Case 1
a = y.loc[X['Elevation']=='low']
len(a.unique()) == 1

False

In [None]:
# Base Case 2
data = {
    'A':[0,0],
    'B':[1,1]
}

a = pd.DataFrame(data)
print(a)

print(all((a[d] == a[d].iloc[0]).all() for d in a.columns))

data = {
    'A':[0,1],
    'B':[0,1]
}

a = pd.DataFrame(data)
print(a)
all((a[d] == a[d].iloc[0]).all() for d in a.columns)

   A  B
0  0  1
1  0  1
True
   A  B
0  0  0
1  1  1


False

In [None]:
# Base Case 3
a = pd.DataFrame({})
a.empty

True