# Quiz for Maximizing Information Gain

For the following quiz, consider the data found in [this file](data.csv), which describes twenty-four made-up insects by their species, color, and length.

Which of the following splitting criteria provides the most information gain for discriminating Mobugs from Lobugs?

* [ ] Color = Brown
* [ ] Color = Blue
* [ ] Color = Green
* [x] Length < 17.0mm
* [ ] Length < 20.0mm

In [1]:
import pandas as pd

# Read the insect measurements. The path is relative, so it is resolved
# against the kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

# Show the first five rows as a quick sanity check of the columns.
data.head(5)

Unnamed: 0,Species,Color,Length (mm)
0,Mobug,Brown,11.6
1,Mobug,Blue,16.3
2,Lobug,Blue,15.1
3,Lobug,Green,23.7
4,Lobug,Blue,18.4


In [2]:
import numpy as np

def calculate_entropy(p, total):
    """Return the binary Shannon entropy (in bits) of a two-class node.

    Parameters
    ----------
    p : int or float
        Number of items in the node that belong to the class of interest.
    total : int or float
        Total number of items in the node; the other class has ``total - p``.

    Returns
    -------
    float
        ``-(q * log2(q) + (1 - q) * log2(1 - q))`` with ``q = p / total``.
        A pure node (``p == 0`` or ``p == total``) has entropy 0, following
        the usual convention that ``0 * log2(0) == 0``.  An empty node
        (``total == 0``) also yields 0.0 so that it contributes nothing to a
        weighted average of child entropies.

    Raises
    ------
    ValueError
        If the counts are impossible, i.e. not ``0 <= p <= total``.
    """
    if not 0 <= p <= total:
        raise ValueError(f"expected 0 <= p <= total, got p={p}, total={total}")

    # Nothing to measure in an empty node; also avoids dividing by zero.
    if total == 0:
        return 0.0

    entropy = 0.0
    for class_count in (p, total - p):
        # Skip empty classes: log2(0) is -inf and 0 * -inf would give nan.
        if class_count > 0:
            fraction = class_count / total
            entropy -= fraction * np.log2(fraction)
    return entropy

In [3]:
# Entropy of the unsplit dataset; every split below is compared against it.
total_count = len(data)

# Mobug
is_mobug = data['Species'] == 'Mobug'
mobug = data.loc[is_mobug]
mobug_count = len(mobug)

parent_entropy = calculate_entropy(mobug_count, total_count)

In [4]:
# Color: Brown
# Information gain of splitting the data into "Color == Brown" vs. the rest.

is_brown = data['Color'] == 'Brown'

brown = data[is_brown]
brown_count = brown.shape[0]

brown_mobug = brown[brown['Species'] == 'Mobug']
brown_mobug_count = brown_mobug.shape[0]

# The complement is taken with the negated mask. Building it through a
# concat + drop_duplicates(keep=False) round-trip would also discard any rows
# that happen to be duplicated within `data` itself.
non_brown = data[~is_brown]
non_brown_count = non_brown.shape[0]

non_brown_mobug = non_brown[non_brown['Species'] == 'Mobug']
non_brown_mobug_count = non_brown_mobug.shape[0]

# Child entropies weighted by the share of rows that fall into each child.
children_brown_entropy = (
    (brown_count / total_count) * calculate_entropy(brown_mobug_count, brown_count)
    + (non_brown_count / total_count) * calculate_entropy(non_brown_mobug_count, non_brown_count)
)

# NOTE: despite its name this holds the information gain (parent - children).
brown_entropy = parent_entropy - children_brown_entropy

In [5]:
# Color: Blue
# Information gain of splitting the data into "Color == Blue" vs. the rest.

is_blue = data['Color'] == 'Blue'

blue = data[is_blue]
blue_count = blue.shape[0]

blue_mobug = blue[blue['Species'] == 'Mobug']
blue_mobug_count = blue_mobug.shape[0]

# The complement is taken with the negated mask. Building it through a
# concat + drop_duplicates(keep=False) round-trip would also discard any rows
# that happen to be duplicated within `data` itself.
non_blue = data[~is_blue]
non_blue_count = non_blue.shape[0]

non_blue_mobug = non_blue[non_blue['Species'] == 'Mobug']
non_blue_mobug_count = non_blue_mobug.shape[0]

# Child entropies weighted by the share of rows that fall into each child.
children_blue_entropy = (
    (blue_count / total_count) * calculate_entropy(blue_mobug_count, blue_count)
    + (non_blue_count / total_count) * calculate_entropy(non_blue_mobug_count, non_blue_count)
)

# NOTE: despite its name this holds the information gain (parent - children).
blue_entropy = parent_entropy - children_blue_entropy

In [6]:
# Color: Green
# Information gain of splitting the data into "Color == Green" vs. the rest.

is_green = data['Color'] == 'Green'

green = data[is_green]
green_count = green.shape[0]

green_mobug = green[green['Species'] == 'Mobug']
green_mobug_count = green_mobug.shape[0]

# The complement is taken with the negated mask. Building it through a
# concat + drop_duplicates(keep=False) round-trip would also discard any rows
# that happen to be duplicated within `data` itself.
non_green = data[~is_green]
non_green_count = non_green.shape[0]

non_green_mobug = non_green[non_green['Species'] == 'Mobug']
non_green_mobug_count = non_green_mobug.shape[0]

# Child entropies weighted by the share of rows that fall into each child.
children_green_entropy = (
    (green_count / total_count) * calculate_entropy(green_mobug_count, green_count)
    + (non_green_count / total_count) * calculate_entropy(non_green_mobug_count, non_green_count)
)

# NOTE: despite its name this holds the information gain (parent - children).
green_entropy = parent_entropy - children_green_entropy

In [7]:
# Length: 17mm
# Information gain of splitting the data into "Length < 17" vs. "Length >= 17".

is_small = data['Length (mm)'] < 17

small = data[is_small]
small_count = small.shape[0]

small_mobug = small[small['Species'] == 'Mobug']
small_mobug_count = small_mobug.shape[0]

# The complement is taken with the negated mask. Building it through a
# concat + drop_duplicates(keep=False) round-trip would also discard any rows
# that happen to be duplicated within `data` itself.
non_small = data[~is_small]
non_small_count = non_small.shape[0]

non_small_mobug = non_small[non_small['Species'] == 'Mobug']
non_small_mobug_count = non_small_mobug.shape[0]

# Child entropies weighted by the share of rows that fall into each child.
children_small_entropy = (
    (small_count / total_count) * calculate_entropy(small_mobug_count, small_count)
    + (non_small_count / total_count) * calculate_entropy(non_small_mobug_count, non_small_count)
)

# NOTE: despite its name this holds the information gain (parent - children).
small_entropy = parent_entropy - children_small_entropy

In [8]:
# Length: 20mm
# Information gain of splitting the data at a length of 20. The selection
# below is "Length >= 20", the complement of the quiz option "Length < 20.0mm";
# a binary split yields the same gain whichever side is selected first.

is_big = data['Length (mm)'] >= 20

big = data[is_big]
big_count = big.shape[0]

big_mobug = big[big['Species'] == 'Mobug']
big_mobug_count = big_mobug.shape[0]

# The complement is taken with the negated mask. Building it through a
# concat + drop_duplicates(keep=False) round-trip would also discard any rows
# that happen to be duplicated within `data` itself.
non_big = data[~is_big]
non_big_count = non_big.shape[0]

non_big_mobug = non_big[non_big['Species'] == 'Mobug']
non_big_mobug_count = non_big_mobug.shape[0]

# Child entropies weighted by the share of rows that fall into each child.
children_big_entropy = (
    (big_count / total_count) * calculate_entropy(big_mobug_count, big_count)
    + (non_big_count / total_count) * calculate_entropy(non_big_mobug_count, non_big_count)
)

# NOTE: despite its name this holds the information gain (parent - children).
big_entropy = parent_entropy - children_big_entropy

In [9]:
# Side-by-side report of the value computed for each candidate split
# (parent entropy minus weighted child entropy); the largest one wins.
gain_report = f"""
Brown: {brown_entropy}
Blue:  {blue_entropy}
Green: {green_entropy}
Small: {small_entropy}
Big:   {big_entropy}
"""
print(gain_report)


Brown: 0.06157292259666325
Blue:  0.000589596275060833
Green: 0.042776048498108565
Small: 0.11260735516748976
Big:   0.10073322588651734

