In [72]:
import pandas as pd
import numpy as np
heart_df = pd.read_csv('framingham.csv')
heart_df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [73]:
heart_df.drop(['education'],axis=1,inplace=True)
heart_df.rename(columns={'male':'Sex_male'},inplace=True)
heart_df.isnull().sum()

Sex_male             0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [74]:
shape=heart_df.shape
print(shape)

(4240, 15)


In [75]:
from sklearn.preprocessing import Imputer 
  
# Using Imputer function to replace NaN 
# values with mean of that parameter value 
imputer = Imputer(missing_values = "NaN", 
                  strategy = "mean", axis = 0) 
                    
# Fitting the data, function learns the stats 
imputer = imputer.fit(heart_df['glucose'].values.reshape(-1,1)) 
  
heart_df['glucose'] = imputer.fit_transform(heart_df['glucose'].values.reshape(-1,1)) 

In [76]:
heart_df.isnull().sum()

Sex_male            0
age                 0
currentSmoker       0
cigsPerDay         29
BPMeds             53
prevalentStroke     0
prevalentHyp        0
diabetes            0
totChol            50
sysBP               0
diaBP               0
BMI                19
heartRate           1
glucose             0
TenYearCHD          0
dtype: int64

In [77]:
#droping null values as they are very less. 
heart_df  = heart_df.dropna(axis=0)

In [78]:
heart_df.head()
n = x_train.shape[1]
total = x_train.shape[0]
cols = ['age','Sex_male','cigsPerDay','totChol','sysBP','glucose']
print(n)
print(total)

5
3272


In [123]:
heart_df = heart_df[['age','Sex_male','cigsPerDay','totChol','sysBP','glucose','TenYearCHD']]
heart_df.head()

Unnamed: 0,age,Sex_male,cigsPerDay,totChol,sysBP,glucose,TenYearCHD
0,39,1,0.0,195.0,129.0,77.0,0
1,46,0,0.0,250.0,129.0,76.0,0
2,48,1,20.0,245.0,129.0,70.0,0
3,61,0,30.0,225.0,199.0,103.0,1
4,46,0,23.0,285.0,199.0,85.0,0


In [80]:
from math import log2 as log
def information_gain(df):
  Class = df.keys()[-1]
  P_type = df.loc[df[Class]== 0].count().sum()/n
  N_type = df.loc[df[Class]== 1].count().sum()/n
  Info_gain = - (P_type/(P_type+N_type))*(log(P_type/(P_type+N_type))) - (N_type/(N_type+P_type))*(log(N_type/(N_type+P_type)))
  return Info_gain

In [81]:
def find_entropy_attribute(df,col):
    values = df[col].unique()
    Class = df.keys()[-1]
    entropy = 0
    for value in values:
        P_type = df.loc[(df[Class]== 0) & (df[col] == (value))].count().sum()/n
        N_type = df.loc[(df[Class]== 1) & (df[col] == (value))].count().sum()/n
        if(P_type==0 or N_type==0):
            Info_gain = 0
        else:
            Info_gain = - (P_type/(P_type+N_type))*(log(P_type/(P_type+N_type))) - (N_type/(N_type+P_type))*(log(N_type/(N_type+P_type)))
        Prob = (df.loc[df[col] == (value)].count().sum()/n)/total
        entropy = entropy + Prob*Info_gain
    return entropy

In [82]:
def find_best(df):
  Gain = {}
  for keys in df.keys()[:-1]:
    Gain[keys]= information_gain(df)-find_entropy_attribute(df,keys)
  return max(Gain, key=Gain.get)

In [83]:
def get_subtable(df, node,value):
  return df[df[node] == value].reset_index(drop=True)

In [87]:
def buildTree(df,tree=None): 
    Class = df.keys()[-1]  
    node = find_best(df)
    attValue = np.unique(df[node])
     
    if tree is None:                    
        tree={}
        tree[node] = {}

    for value in attValue:
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable['TenYearCHD'],return_counts=True)                        
        if len(counts)==1:
            tree[node][value] = clValue[0]                                                    
        else:
            
            tree[node][value] = buildTree(subtable)                    
    return tree

In [88]:
def predict_outcome(inst,tree):
    for nodes in tree.keys():        
        value = inst[nodes]
        tree = tree[nodes][value]
        prediction = 0
        if type(tree) is dict:
            prediction = predict_outcome(inst, tree)
        else:
            prediction = tree
            break                                
    return prediction

In [109]:
import pprint
tree = buildTree(heart_df)
pprint.pprint(tree)

{'sysBP': {83.5: {'age': {35: 1, 58: 0}},
           85.0: 0,
           85.5: 1,
           90.0: 0,
           92.0: 0,
           92.5: {'age': {34: 0, 63: 1}},
           93.0: 0,
           93.5: 0,
           94.0: 0,
           95.0: 0,
           95.5: 0,
           96.0: 0,
           96.5: 0,
           97.0: 0,
           97.5: {'age': {39: 0, 46: 1, 50: 0}},
           98.0: {'totChol': {149.0: 1,
                              160.0: 1,
                              172.0: 0,
                              214.0: 0,
                              215.0: 0,
                              220.0: 0,
                              234.0: 1,
                              236.0: 0,
                              244.0: 1,
                              250.0: 0,
                              260.0: 0,
                              270.0: 0}},
           98.5: 0,
           99.0: 0,
           99.5: 0,
           100.0: {'age': {35: 0,
                           36: 0,
                 

                               157.0: 0,
                               170.0: 0,
                               173.0: 0,
                               174.0: 0,
                               175.0: 0,
                               188.0: 0,
                               189.0: 0,
                               192.0: 0,
                               193.0: 0,
                               203.0: 0,
                               206.0: 0,
                               207.0: 0,
                               209.0: {'age': {40: 1, 42: 0}},
                               211.0: {'age': {44: 0, 48: 1}},
                               212.0: 0,
                               213.0: 0,
                               214.0: 0,
                               216.0: 0,
                               217.0: 0,
                               218.0: 0,
                               219.0: 0,
                               220.0: 0,
                               221.0: 0,
             

                           57: 0,
                           60: 0,
                           61: 0,
                           62: 0}},
           174.0: {'age': {44: 0, 46: 0, 48: 0, 49: 0, 55: 0, 56: 1, 57: 0}},
           174.5: {'age': {53: 1, 61: 0}},
           175.0: {'totChol': {150.0: 0,
                               207.0: 1,
                               208.0: 0,
                               221.0: 0,
                               232.0: 1,
                               238.0: 0,
                               262.0: 0,
                               270.0: 0,
                               282.0: 0,
                               312.0: 1,
                               350.0: 0,
                               380.0: 1}},
           175.5: 0,
           176.0: {'totChol': {194.0: 1,
                               195.0: 0,
                               198.0: 0,
                               214.0: 0,
                               246.0: 0,
                     

In [114]:
test = {'age' : 39 ,'Sex_male' : 1 ,'cigsPerDay' : 0.0 ,'totChol' : 195.0 ,'sysBP' : 106.0 ,'glucose' :7.0}
test_data = pd.Series(test)
test_prediction = predict_outcome(test_data,tree)
print(test_prediction)
print(heart_df['TenYearCHD'][0])

0
0


In [124]:
heart_df.describe()

Unnamed: 0,age,Sex_male,cigsPerDay,totChol,sysBP,glucose,TenYearCHD
count,4090.0,4090.0,4090.0,4090.0,4090.0,4090.0,4090.0
mean,49.499511,0.434719,8.995355,236.659658,132.474328,81.886963,0.149389
std,8.542193,0.495781,11.913388,44.467756,22.097728,22.870846,0.356515
min,32.0,0.0,0.0,113.0,83.5,40.0,0.0
25%,42.0,0.0,0.0,206.0,117.0,72.0,0.0
50%,49.0,0.0,0.0,234.0,128.25,79.0,0.0
75%,56.0,1.0,20.0,263.0,144.0,85.0,0.0
max,70.0,1.0,70.0,696.0,295.0,394.0,1.0


In [120]:
import pprint
tree = buildTree(heart_df)
pprint.pprint(tree)

{'sysBP': {83.5: {'age': {35: 1, 58: 0}},
           85.0: 0,
           85.5: 1,
           90.0: 0,
           92.0: 0,
           92.5: {'age': {34: 0, 63: 1}},
           93.0: 0,
           93.5: 0,
           94.0: 0,
           95.0: 0,
           95.5: 0,
           96.0: 0,
           96.5: 0,
           97.0: 0,
           97.5: {'age': {39: 0, 46: 1, 50: 0}},
           98.0: {'totChol': {149.0: 1,
                              160.0: 1,
                              172.0: 0,
                              214.0: 0,
                              215.0: 0,
                              220.0: 0,
                              234.0: 1,
                              236.0: 0,
                              244.0: 1,
                              250.0: 0,
                              260.0: 0,
                              270.0: 0}},
           98.5: 0,
           99.0: 0,
           99.5: 0,
           100.0: {'age': {35: 0,
                           36: 0,
                 

                           47: 0,
                           48: 0,
                           50: 0,
                           52: 0,
                           53: 1,
                           54: {'cigsPerDay': {0.0: 0, 5.0: 1}},
                           55: 0,
                           56: 0,
                           58: 0,
                           59: 1,
                           62: 0,
                           66: 1}},
           133.0: {'totChol': {185.0: 0,
                               187.0: 0,
                               189.0: 1,
                               190.0: 0,
                               192.0: 0,
                               194.0: {'age': {45: 1, 57: 0}},
                               196.0: 0,
                               200.0: 0,
                               204.0: 0,
                               205.0: 0,
                               207.0: 0,
                               210.0: 0,
                               213.0: 0,
    