In [4]:
# Brian Garten

import pandas as pd
from math import log2

# functions
# calculates the information gain (IG) of a certain feature
def infogain_calc(data, options, parent, parent_entropy):
    """Return the information gain obtained by splitting `parent` on `data`.

    Parameters
    ----------
    data : sequence or pandas.Series of int-like
        Category index of each sample. Every value must be a valid index
        into a list of length `options`.
    options : int
        Number of category slots to allocate for `data`.
    parent : sequence or pandas.Series of int-like
        Class label of each sample. When both `data` and `parent` are
        Series, `parent` is aligned to the index of `data`.
    parent_entropy : float
        Entropy (in bits) of `parent` before the split.

    Returns
    -------
    float
        `parent_entropy` minus the weighted average entropy of the
        categories.

    Raises
    ------
    IndexError
        If a category index in `data` does not fit in `options` slots.
    ValueError
        If a category or label cannot be converted to int (e.g. NaN).
    """
    # dataframe setup (column-by-column assignment keeps the index alignment
    # between the feature values and the class labels)
    datafr = pd.DataFrame(columns=["data", "num"])
    datafr["data"] = data
    datafr["num"] = parent

    # one {class label: count} table per category; only classes that actually
    # occur get an entry, so any number of classes is supported and a zero
    # count can never reach log2()
    freq = [{} for _ in range(options)]
    for category, label in zip(datafr["data"], datafr["num"]):
        bucket = freq[int(category)]
        label = int(label)
        bucket[label] = bucket.get(label, 0) + 1

    # calculating weighted avg entropy
    avg_weighted_entropy = 0
    for bucket in freq:
        size = sum(bucket.values())
        if size == 0:
            # a category nobody falls into carries zero weight
            continue
        indv_entropy = 0
        # sorted() fixes the summation order (class 0, 1, 2, ...)
        for label in sorted(bucket):
            count = bucket[label]
            indv_entropy += (-count/size * log2(count/size))
        avg_weighted_entropy += (size/len(parent)) * indv_entropy

    # calculate infogain
    return parent_entropy - avg_weighted_entropy

# takes in data and categorizes it based on the bounds given
def categorize_data(data, *args):
    """Bin every value of `data` using the upper bounds given in `args`.

    A value `v` is assigned bin `i`, where `i` is the first position with
    `v <= args[i]`. Values greater than every bound get bin `len(args)`.
    With no bounds at all, every value lands in bin 0. The bounds are
    expected in ascending order.

    Missing values (NaN/None) are kept as NaN in the result, so the output
    always has exactly one entry per input value and stays positionally
    aligned with the input.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to categorize.
    *args : numbers
        Upper bounds of the bins, ascending.

    Returns
    -------
    pandas.Series
        Bin index for each value, in input order, with a fresh 0..n-1 index.
    """
    bins = []
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed
    for _, value in data.items():
        if pd.isna(value):
            # keep a placeholder instead of silently dropping the row
            bins.append(float("nan"))
            continue
        # first bound that contains the value, else the overflow bin
        bins.append(next((i for i, bound in enumerate(args) if value <= bound),
                         len(args)))

    return pd.Series(bins)

In [5]:
# load the spreadsheet
df = pd.read_excel("ClevelandData.xlsx")

# cleaning: force every column to numeric (cells that cannot be parsed become
# NaN), then discard rows with any missing value.
# reset_index() without drop=True keeps the old row labels as an extra leading
# column, which shifts the original columns one position to the right.
df = df.apply(pd.to_numeric, errors='coerce')
df = df.dropna().reset_index()

# calculate parent entropy
# the last column of df is taken as the target; its values are collapsed into
# three severity classes: 0 = none, 1 = mild (1 <= value < 3), 2 = severe (rest)
parent = df[df.columns[-1]]
plst = [
    0 if severity == 0 else (1 if 1 <= severity < 3 else 2)
    for severity in parent
]

parent = pd.Series(plst)
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
p_freq = parent.value_counts() # frequencies of values
p_total = p_freq.sum()         # number of samples, computed once

# Shannon entropy in bits: sum over classes of -p * log2(p)
lst = [-value/p_total * log2(value/p_total) for value in p_freq]

p_entropy = sum(lst)

In [6]:
# children information gain (IG)
# gains are appended in this order: age, sex, cp, trestbps, chol, fbs, restecg,
# thalach, exang, oldpeak, slope, ca, thal
infogain = []


def _slot_count(binned):
    """Number of slots needed to index every observed bin (highest index + 1).

    Counting distinct values instead would come up short whenever a lower bin
    is unpopulated, leaving the highest bin index out of range.
    """
    return 0 if binned.empty else int(binned.max()) + 1


# age IG
# categorizing by age
# 0 = <=45
# 1 = 45-55
# 2 = 55-65
# 3 = >65
ages = categorize_data(df["age"], 45, 55, 65)
infogain.append(infogain_calc(ages, _slot_count(ages), parent, p_entropy))

# sex IG
infogain.append(infogain_calc(df["sex"], 2, parent, p_entropy))


# cp IG
# converting data to index form (shift the codes down by one so they start at 0)
cp = df["cp"] - 1
infogain.append(infogain_calc(cp, 4, parent, p_entropy))


# trestbps IG
# categorizing
# 0 = <=115
# 1 = 115-135
# 2 = 135-150
# 3 = >150
trestbps = categorize_data(df["trestbps"], 115, 135, 150)
infogain.append(infogain_calc(trestbps, _slot_count(trestbps), parent, p_entropy))


# chol IG
# categorizing
# 0 = <=240
# 1 = 240-280
# 2 = 280-320
# 3 = >320
chols = categorize_data(df["chol"], 240, 280, 320)
infogain.append(infogain_calc(chols, _slot_count(chols), parent, p_entropy))


# fbs IG
infogain.append(infogain_calc(df["fbs"], 2, parent, p_entropy))


# restecg IG
infogain.append(infogain_calc(df["restecg"], 3, parent, p_entropy))


# thalach IG
# categorizing
# 0 = <=120
# 1 = 120-140
# 2 = 140-160
# 3 = 160-180
# 4 = >180
thalachs = categorize_data(df["thalach"], 120, 140, 160, 180)
infogain.append(infogain_calc(thalachs, _slot_count(thalachs), parent, p_entropy))


# exang IG
infogain.append(infogain_calc(df["exang"], 2, parent, p_entropy))


# oldpeak IG
# categorizing
# 0 = <=0.7
# 1 = 0.7-1.4
# 2 = 1.4-2.1
# 3 = >2.1
oldpeaks = categorize_data(df["oldpeak"], 0.7, 1.4, 2.1)
infogain.append(infogain_calc(oldpeaks, _slot_count(oldpeaks), parent, p_entropy))


# slope IG
# converting data to index form (shift the codes down by one so they start at 0)
slope = df["slope"] - 1
infogain.append(infogain_calc(slope, 3, parent, p_entropy))


# ca IG
infogain.append(infogain_calc(df["ca"], 4, parent, p_entropy))


# thal IG
# categorizing
# 0 = 3
# 1 = 6
# 2 = 7
thals = categorize_data(df["thal"], 3, 6)
infogain.append(infogain_calc(thals, _slot_count(thals), parent, p_entropy))



# printing all the children's information gain
# feature names are listed explicitly, in the order their gains were appended,
# instead of being looked up by a shifted position in df.columns
feature_names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
                 "thalach", "exang", "oldpeak", "slope", "ca", "thal"]
assert len(feature_names) == len(infogain), "one gain expected per feature"

print("Parent Entropy: ",p_entropy)
print("\nInformation Gain by Feature:")
for name, gain in zip(feature_names, infogain):
    print(name + ":\t\t" + str(gain))

Parent Entropy:  1.4266862232499145

Information Gain by Feature:
age:		0.07683372749733386
sex:		0.05790418685764487
cp:		0.20523511988474952
trestbps:		0.01508048096833492
chol:		0.02185187458621174
fbs:		0.0024173945881544867
restecg:		0.03462471670945888
thalach:		0.15189579986305035
exang:		0.135272044383959
oldpeak:		0.17190644041307213
slope:		0.12654409558413948
ca:		0.21364862304841403
thal:		0.22377472992848824
