# Implementation of an ID3 decision tree from scratch
# with Mean Squared Error and k-fold validation

### Author: Sunwoo Choi

### Data
Previously cleaned data, loaded below from `cleanDf.csv`.

### Data Reference
Repository URL: https://github.com/kkehoe1985/ga_data_science_final_project/blob/master/combined_data.csv

Raw data URL: https://raw.githubusercontent.com/kkehoe1985/ga_data_science_final_project/master/combined_data.csv

In [None]:
# Seed NumPy's global random generator (seed 1337) so that the
# np.random.shuffle calls further down produce a repeatable row order.
import numpy as np

np.random.seed(1337)

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV read later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML/cleanDf.csv')

# Columns that are not used as tree features.
# 'State' and 'County' are categorical variables, but they are dropped too
# because splitting on them makes the tree overfit.
unusedColumns = ['Votes','UnemploymentRate2015','DeepPovAll','AgeTotalPop','MaleRate','FemaleRate','TurnoutRate',
                 'NomalizePopulation','NomalizeMedHHInc2014','NomalizeToalArea','NomalizePopDensity','State','County','NomalizePerCapitaInc','Democrat']
cleanDf = df.drop(unusedColumns, axis=1)

# Derive two two-valued categorical features from the original frame:
#   GenderRate - 'Male' when MaleRate is greater than FemaleRate, else 'Female'
#   YoungOrOld - 'Old' when Young is less than Old, else 'Young'
genderRate = ['Male' if male > female else 'Female'
              for male, female in zip(df['MaleRate'], df['FemaleRate'])]
youngOrOld = ['Old' if young < old else 'Young'
              for young, old in zip(df['Young'], df['Old'])]

cleanDf['GenderRate'] = genderRate
cleanDf['YoungOrOld'] = youngOrOld

# The Old/Young/Adult columns are dropped once YoungOrOld has been derived
cleanDf.drop(['Old','Young','Adult'], axis=1, inplace=True)

# Every remaining column except the label is a candidate split feature
headers = cleanDf.columns.tolist()
features = list(headers)
features.remove('PovertyAllAgesPct2014')
print(features)

cleanDf

['Education', 'Religion', 'EthnicMale', 'EthnicFemale', 'GenderRate', 'YoungOrOld']


Unnamed: 0,PovertyAllAgesPct2014,Education,Religion,EthnicMale,EthnicFemale,GenderRate,YoungOrOld
0,0.162,high school diploma,Other Misc,White,White,Female,Young
1,0.099,high school diploma,Catholic,White,White,Female,Young
2,0.100,college,Christian Generic,White,White,Female,Young
3,0.238,high school diploma,Catholic,White,White,Female,Young
4,0.095,college,Catholic,White,White,Female,Young
...,...,...,...,...,...,...,...
3140,0.098,college,Catholic,White,White,Male,Young
3141,0.077,bachelor or higher,Catholic,White,White,Male,Young
3142,0.100,high school diploma,Mormon,White,White,Male,Young
3143,0.126,college,Christian Generic,White,White,Male,Young


In [None]:
# Shuffle the rows in place, then use the first 70% for training and the
# remaining 30% for validation
sampleArr = cleanDf.to_numpy()
np.random.shuffle(sampleArr)

splitIdx = int(len(sampleArr)*0.7)
trainSet, validSet = sampleArr[:splitIdx], sampleArr[splitIdx:]

# sanity check: the two splits together must account for every row
print(len(trainSet)+len(validSet) == len(sampleArr))

# rebuild DataFrames so the tree code can address columns by name
trainDf = pd.DataFrame(trainSet, columns = headers)
validDf = pd.DataFrame(validSet, columns = headers)
trainDf['PovertyAllAgesPct2014']

True


0       0.174
1       0.264
2       0.135
3       0.205
4       0.178
        ...  
2196    0.195
2197     0.25
2198    0.169
2199    0.196
2200    0.099
Name: PovertyAllAgesPct2014, Length: 2201, dtype: object

In [None]:
# Standard deviation of the label over the whole training split
trainDf['PovertyAllAgesPct2014'].std()

0.06431949313202583

In [None]:
def getSD(labelCol):
  """Return the standard deviation of a label column.

  Delegates to ``labelCol.std()``. With a pandas Series this is the sample
  standard deviation, which is NaN when the column holds a single value.
  """
  spread = labelCol.std()
  return spread

def getCondSD(cols, colFeature, labelVec):
  """Return the standard deviation of the label conditioned on one feature.

  The rows of ``cols`` are grouped by the value of ``colFeature``; the result
  is the sum over groups of (group size / total rows) * (label SD of group).

  Args:
    cols: DataFrame containing at least ``colFeature`` and ``labelVec``.
    colFeature: name of the categorical column to group by.
    labelVec: name of the numeric label column.

  Returns:
    The size-weighted sum of per-group standard deviations (0 when ``cols``
    has no rows).
  """
  totalRows = len(cols)
  weightedSD = 0
  for category, count in cols[colFeature].value_counts().items():
    if count > 1:
      groupSD = cols.loc[cols[colFeature] == category, labelVec].std()
    else:
      # A single-row group has no spread. Series.std() would give NaN here,
      # which would turn the whole weighted sum into NaN.
      groupSD = 0.0
    weightedSD += (count / totalRows) * groupSD
  return weightedSD

def selectAttribute(dft, colFeatures, labelVec):
  """Return the feature whose split reduces the label's standard deviation most.

  For each candidate feature the score is
  ``getSD(label) - getCondSD(label | feature)``; the feature with the highest
  score is returned. On ties ``max`` keeps the first candidate in
  ``colFeatures`` order.

  Args:
    dft: DataFrame holding the candidate features and the label column.
    colFeatures: list of candidate feature column names.
    labelVec: name of the numeric label column.

  Returns:
    The name of the best feature, or 0 when ``colFeatures`` is empty.
  """
  # The original test `colFeatures is []` compared identity with a brand-new
  # list and was therefore never true; check for emptiness instead.
  if len(colFeatures) == 0:
    return 0
  totalSD = getSD(dft[labelVec])
  sdReduction = {}
  for colFeature in colFeatures:
    condSD = getCondSD(dft[[colFeature, labelVec]], colFeature, labelVec)
    sdReduction[colFeature] = totalSD - condSD
  return max(sdReduction, key=sdReduction.get)

In [None]:
def ID3(trainDf, features, labelVec):
  """Recursively build a regression tree as nested dictionaries.

  Internal nodes map ``(feature, value)`` keys to sub-trees. A leaf is a
  one-entry dict ``{('Label', labelVec): mean}`` holding the mean label of the
  rows that reached it.

  A node becomes a leaf when any of the following holds:
    * it has fewer than two rows,
    * the ratio std/mean of its labels is below 0.1,
    * there are no features left to split on.

  Args:
    trainDf: DataFrame with the feature columns and the label column.
    features: list of feature column names still available for splitting.
      The list is not modified.
    labelVec: name of the numeric label column.

  Returns:
    The (sub-)tree as a nested dict.
  """
  labels = trainDf[labelVec]
  leaf = {('Label', labelVec): labels.mean()}

  # With a single row std() is NaN, so the ratio test below can never succeed
  # and the recursion would only add a chain of one-branch nodes.
  if len(trainDf) < 2:
    return leaf
  # Labels are already close together relative to their mean.
  # NOTE(review): this ratio presumes a positive label mean - confirm for other data.
  if labels.std()/labels.mean() < 0.1:
    return leaf
  # no more features to split
  if not features:
    return leaf

  splitAttribute = selectAttribute(trainDf, features, labelVec)

  # The remaining feature list is the same for every branch; build it once.
  remaining = features.copy()
  remaining.remove(splitAttribute)

  tree = {}
  for value in trainDf[splitAttribute].unique():
    branchRows = trainDf.loc[trainDf[splitAttribute] == value]
    tree[(splitAttribute, value)] = ID3(branchRows, remaining, labelVec)
  return tree


# Name of the label (target) column. predict() reads this module-level name.
labelVec = 'PovertyAllAgesPct2014'

# Fit the tree on the training split and display the nested-dict result
trainedTree = ID3(trainDf, features,labelVec)
trainedTree

{('EthnicFemale',
  'Black'): {('Education',
   'bachelor or higher'): {('Religion',
    'Catholic'): {('Label',
     'PovertyAllAgesPct2014'): 0.28350000000000003}, ('Religion',
    'Christian Generic'): {('GenderRate',
     'Female'): {('YoungOrOld',
      'Young'): {('EthnicMale',
       'Black'): {('Label',
        'PovertyAllAgesPct2014'): 0.17900000000000002}, ('EthnicMale',
       'White'): {('Label',
        'PovertyAllAgesPct2014'): 0.19499999999999998}}}}}, ('Education',
   'college'): {('Religion',
    'Catholic'): {('EthnicMale',
     'White'): {('GenderRate',
      'Female'): {('YoungOrOld',
       'Young'): {('Label', 'PovertyAllAgesPct2014'): 0.07200000000000001}}}},
   ('Religion',
    'Christian Generic'): {('EthnicMale',
     'Black'): {('GenderRate',
      'Female'): {('YoungOrOld',
       'Young'): {('Label',
        'PovertyAllAgesPct2014'): 0.2622222222222222}}}, ('EthnicMale',
     'White'): {('Label', 'PovertyAllAgesPct2014'): 0.2255}}}, ('Education',
   'high s

In [None]:
def predict(row, tree):
  """Return the squared error of the tree's prediction for one row.

  Walks ``tree`` from the root, following the branch whose ``(feature, value)``
  key matches the row. On reaching a ``('Label', ...)`` leaf it returns
  ``(row[labelVec] - leaf value) ** 2``, where ``labelVec`` is the
  module-level label column name.

  Returns 0 when no branch at some level matches the row (or the tree is
  empty), so such rows add nothing to an error sum.
  """
  for key, child in tree.items():
    featureName, featureValue = key[0], key[1]
    if featureName == 'Label':
      # leaf reached: child is the stored mean label
      return (row[labelVec] - child) ** 2
    if row[featureName] == featureValue:
      return predict(row, child)
  return 0

def checkAccuracy(checkDf, tree):
  """Return the mean squared error of ``tree`` over the rows of ``checkDf``.

  Each row's squared error comes from ``predict``; the total is divided by
  the number of rows in ``checkDf``.
  """
  squaredErrors = (predict(row, tree) for _, row in checkDf.iterrows())
  return sum(squaredErrors)/len(checkDf) # sum of error / n : Mean square error


# NOTE: despite the "SucRate" names, these values are mean squared errors
# returned by checkAccuracy (lower is better).
trainedSucRate = checkAccuracy(trainDf, trainedTree)
print("MSE of training set:",trainedSucRate)

validSucRate = checkAccuracy(validDf, trainedTree)
print("MSE of validation set:",validSucRate)



MSE of training set: 0.0027798233838869997
MSE of validation set: 0.0026717512985704782


In [None]:

def kFoldValid(cleanDf, headers, k=5):
  """Run k-fold cross validation of the ID3 tree and print the MSE per fold.

  The rows are shuffled once with ``np.random.shuffle`` and cut into ``k``
  consecutive folds of ``len(cleanDf) // k`` rows. Each fold is used once as
  the validation set while all other rows form the training set. When the row
  count is not a multiple of ``k``, the leftover rows at the end are only
  ever used for training.

  Args:
    cleanDf: DataFrame holding the features and the label column
      'PovertyAllAgesPct2014'.
    headers: list of the column names of ``cleanDf``; it must contain the
      label column name.
    k: number of folds (default 5).

  Raises:
    ValueError: if ``k`` is smaller than 1 or larger than the number of rows.

  Returns:
    None. The per-fold MSE and their average are printed.
  """
  labelVec = 'PovertyAllAgesPct2014'

  if k < 1:
    raise ValueError("k must be at least 1")
  # Integer division: int(n * (1/k)) can round below the true quotient
  # (for example 49 * (1/49) < 1), which would give empty folds.
  foldSize = len(cleanDf) // k
  if foldSize < 1:
    raise ValueError("k must not exceed the number of rows")

  # every column except the label is a candidate feature; same for all folds
  foldFeatures = headers.copy()
  foldFeatures.remove(labelVec)

  # shuffle a copy of the data as an array
  sampleArr = cleanDf.to_numpy()
  np.random.shuffle(sampleArr)

  validSucRate = []
  for count in range(k):
    start = count * foldSize
    stop = start + foldSize
    validSet = sampleArr[start:stop] # the current fold
    trainSet = np.append(sampleArr[stop:], sampleArr[:start], axis=0) # everything else
    foldTrainDf = pd.DataFrame(trainSet, columns = headers)
    foldValidDf = pd.DataFrame(validSet, columns = headers)
    foldTree = ID3(foldTrainDf, foldFeatures, labelVec)
    validSucRate.append(checkAccuracy(foldValidDf, foldTree))
    print("MSE of %d set: %f" %(count+1, validSucRate[count]))

  print("The average MSE of %d-fold cross validation: %f" %(k, sum(validSucRate)/k))

# Run cross validation with the default k=5 on the full cleaned data set
foldHeaders = cleanDf.columns.tolist() 
kFoldValid(cleanDf,foldHeaders)

MSE of 1 set: 0.002644
MSE of 2 set: 0.002835
MSE of 3 set: 0.002904
MSE of 4 set: 0.002673
MSE of 5 set: 0.003043
The average MSE of 5-fold cross validation: 0.002820
