In [None]:
import collections
from io import StringIO

import numpy as np
import pandas as pd
import pydotplus  
import seaborn as sns
from ipywidgets import Image
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree

from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (10, 8)

Part 1. Toy dataset “Will They? Won’t They?

In [3]:
# Create dataframe with dummy variables
def create_df(dic, feature_list):
    out = pd.DataFrame(dic)
    out = pd.concat([out, pd.get_dummies(out[feature_list])], axis=1)
    out.drop(feature_list, axis=1, inplace=True)
    return out


# Some feature values are present in train and absent in test and vice-versa.
def intersect_features(train, test):
    common_feat = list(set(train.keys()) & set(test.keys()))
    return train[common_feat], test[common_feat]

In [4]:
features = ["Looks", "Alcoholic_beverage", "Eloquence", "Money_spent"]

training data

In [5]:
#training data
df_train = {}
df_train["Looks"] = [
    "handsome",
    "handsome",
    "handsome",
    "repulsive",
    "repulsive",
    "repulsive",
    "handsome",
]
df_train["Alcoholic_beverage"] = ["yes", "yes", "no", "no", "yes", "yes", "yes"]
df_train["Eloquence"] = ["high", "low", "average", "average", "low", "high", "average"]
df_train["Money_spent"] = ["lots", "little", "lots", "little", "lots", "lots", "lots"]
df_train["Will_go"] = LabelEncoder().fit_transform(["+", "-", "+", "-", "-", "+", "+"])

df_train = create_df(df_train, features)
df_train

Unnamed: 0,Will_go,Looks_handsome,Looks_repulsive,Alcoholic_beverage_no,Alcoholic_beverage_yes,Eloquence_average,Eloquence_high,Eloquence_low,Money_spent_little,Money_spent_lots
0,0,True,False,False,True,False,True,False,False,True
1,1,True,False,False,True,False,False,True,True,False
2,0,True,False,True,False,True,False,False,False,True
3,1,False,True,True,False,True,False,False,True,False
4,1,False,True,False,True,False,False,True,False,True
5,0,False,True,False,True,False,True,False,False,True
6,0,True,False,False,True,True,False,False,False,True


test data

In [6]:
df_test = {}
df_test["Looks"] = ["handsome", "handsome", "repulsive"]
df_test["Alcoholic_beverage"] = ["no", "yes", "yes"]
df_test["Eloquence"] = ["average", "high", "average"]
df_test["Money_spent"] = ["lots", "little", "lots"]
df_test = create_df(df_test, features)
df_test

Unnamed: 0,Looks_handsome,Looks_repulsive,Alcoholic_beverage_no,Alcoholic_beverage_yes,Eloquence_average,Eloquence_high,Money_spent_little,Money_spent_lots
0,True,False,True,False,True,False,False,True
1,True,False,False,True,False,True,True,False
2,False,True,False,True,True,False,False,True


In [7]:
# Some feature values are present in train and absent in test and vice-versa.
y = df_train["Will_go"]
df_train, df_test = intersect_features(train=df_train, test=df_test)
df_train

Unnamed: 0,Alcoholic_beverage_yes,Alcoholic_beverage_no,Money_spent_lots,Looks_handsome,Looks_repulsive,Eloquence_average,Money_spent_little,Eloquence_high
0,True,False,True,True,False,False,False,True
1,True,False,False,True,False,False,True,False
2,False,True,True,True,False,True,False,False
3,False,True,False,False,True,True,True,False
4,True,False,True,False,True,False,False,False
5,True,False,True,False,True,False,False,True
6,True,False,True,True,False,True,False,False


In [8]:
df_test

Unnamed: 0,Alcoholic_beverage_yes,Alcoholic_beverage_no,Money_spent_lots,Looks_handsome,Looks_repulsive,Eloquence_average,Money_spent_little,Eloquence_high
0,False,True,True,True,False,True,False,False
1,True,False,False,True,False,False,True,True
2,True,False,True,False,True,True,False,False


Q1: What is the entropy S0 of the initial system? By system states, we mean values of the binary feature “Will_go” - 0 or 1 - two states in total.

In [None]:
import math

probs = y.value_counts(normalize=True)
S0 = -sum(p * math.log2(p) for p in probs)
S0

0.9852281360342515

Q2.Let’s split the data by the feature “Looks_handsome”. What is the entropy S1 of the left group - the one with “Looks_handsome”. What is the entropy S2 in the opposite group? What is the information gain (IG) if we consider such a split?

In [None]:
import numpy as np


def entropy(labels):
    probs = labels.value_counts(normalize=True)
    return -(probs * np.log2(probs)).sum()


# Overall entropy S0 for Will_go
y  # y must already be defined from df_train["Will_go"] before intersect_features
S0 = entropy(y)

# Split by Looks_handsome
mask_left = df_train["Looks_handsome"] == True
y_left = y[mask_left]
y_right = y[~mask_left]

S1 = entropy(y_left)   # entropy of group with Looks_handsome = True
S2 = entropy(y_right)  # entropy of group with Looks_handsome = False

S_split = (len(y_left) / len(y)) * S1 + (len(y_right) / len(y)) * S2
IG = S0 - S_split

S0, S1, S2, IG

In [None]:
y

In [None]:
def entropy(labels):
    probs = labels.value_counts(normalize=True)
    return -(probs * np.log2(probs)).sum()

# y must already be defined as your Will_go labels (from cell 8)
S0 = entropy(y)

# Split by Looks_handsome
mask_left = df_train["Looks_handsome"] == True
y_left = y[mask_left]
y_right = y[~mask_left]

S1 = entropy(y_left)   # entropy of group with Looks_handsome = True
S2 = entropy(y_right)  # entropy of group with Looks_handsome = False

S_split = (len(y_left)/len(y)) * S1 + (len(y_right)/len(y)) * S2
print(S_split)

In [None]:
IG = S0 - S_split

S0, S1, S2, IG