**ID3 Decision Tree Algorithm Implementation with Post Pruning**

In [None]:
# ---------------------------------
# Machine Learning Project work
# Taha Heidari
# 2021
# ID3 Decision Tree Algorithm Implementation and Analysis with Post Pruning
# Dataset: Adult-Income Dataset
# ---------------------------------
# Objectives:
# Task 1-a : ID3 Decision Tree Algorithm Implementation without using ready-made Python Libraries
# Task 1-b : ID3 Decision Tree and Post Pruning without using ready-made Python Libraries

**Import libraries**

In [None]:
import numpy as np
import pandas as pd
from pprint import pprint

**Import Dataset**

In [None]:
# For Google Colab
# This cell only works inside Google Colab: it mounts the user's Google Drive
# and changes the working directory so that the relative dataset file names
# used by the following cells resolve.
# NOTE(review): the path below is specific to the author's Drive layout;
# adjust it (or skip this cell) when running elsewhere.
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Codes for Projects/Machin Learning/HW1/Adult/')

In [None]:
# Import
# Both files are headerless CSVs, so the same column names are supplied
# explicitly for each; the files are read in train-then-test order from the
# current working directory.
data_train, data_test = (
    pd.read_csv(
        csv_path,
        names=['50k', 'Workclass', 'Education', 'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native-Country'],
    )
    for csv_path in ('adult.train.10k.discrete', 'adult.test.10k.discrete')
)

In [None]:
# Change class column position
# The label column is read in as '50k' in first position; rename it to
# 'class' and move it to the end so that the feature columns can be taken
# with `iloc[:, :-1]` / `columns[:-1]` further down.
cols = [
    'Workclass', 'Education', 'Marital-Status', 'Occupation',
    'Relationship', 'Race', 'Sex', 'Native-Country',
    'class',
]
data_train = data_train.rename(columns={'50k': 'class'})[cols]
data_test = data_test.rename(columns={'50k': 'class'})[cols]
# Optional integer encoding of the label (left disabled):
# data_train['class'] = pd.factorize(data_train['class'])[0]
# data_test['class'] = pd.factorize(data_test['class'])[0]

In [None]:
# Display the training DataFrame (bare last expression -> rich cell output).
data_train

In [None]:
# Display the test DataFrame (bare last expression -> rich cell output).
data_test

**Tree Functions**

In [None]:
# Entropy
def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Class labels (anything ``np.unique`` accepts).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value; zero for an empty or single-valued input.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    # Relative frequency of each distinct label.
    probabilities = class_counts / np.sum(class_counts)
    return -np.sum(probabilities * np.log2(probabilities))

# Information Gain
def InfoGain(data,split_attribute_name,target_name="class"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropies inside each group of rows sharing the same value
    of ``split_attribute_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain both named columns.
    split_attribute_name : str
        Column whose distinct values define the candidate split.
    target_name : str, default "class"
        Column holding the class labels.

    Returns
    -------
    numpy.float64
        ``entropy(target) - sum_v (n_v / n) * entropy(target | attribute == v)``.
    """
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    n_rows = np.sum(counts)
    # Select each group's target labels with a boolean mask. The previous
    # `data.where(...).dropna()` also dropped rows that had a missing value
    # in *any* other column, so the subset entropy no longer matched the
    # weight computed from the full column (and every numeric column was
    # cast to float along the way).
    Weighted_Entropy = np.sum([
        (count / n_rows) * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

In [None]:
def _majority_class(labels):
    """Return the most frequent value in ``labels`` (first in sorted order on ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def ID3(data,originaldata,features,target_attribute_name="class",parent_node_class = None):
    """Grow an ID3 decision tree and return it as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows reaching the current node.
    originaldata : pandas.DataFrame
        The full training set; its majority class is used when ``data`` is empty.
    features : sequence of str
        Column names still available for splitting.
    target_attribute_name : str, default "class"
        Column holding the class labels.
    parent_node_class : label, optional
        Majority class of the parent node, returned when no features remain.

    Returns
    -------
    dict or label
        ``{feature: {value: subtree_or_label, ...}}`` for an internal node,
        or a bare class label for a leaf.
    """
    target = target_attribute_name

    # The empty check must come first: an empty target column has zero
    # unique values, so testing "at most one class" first would index into
    # an empty array and raise IndexError.
    if len(data) == 0:
        return _majority_class(originaldata[target])

    labels = np.unique(data[target])
    if len(labels) == 1:
        # Pure node.
        return labels[0]

    if len(features) == 0:
        # Out of attributes: use the parent's majority class; at the root
        # (no parent) fall back to this node's own majority instead of
        # returning a None leaf.
        if parent_node_class is not None:
            return parent_node_class
        return _majority_class(data[target])

    parent_node_class = _majority_class(data[target])

    features = list(features)
    gains = [InfoGain(data, feature, target) for feature in features]
    best_feature = features[int(np.argmax(gains))]
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps dtypes intact and does not discard
        # rows that merely have missing values in other columns (which
        # `data.where(...).dropna()` did).
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = ID3(
            sub_data, originaldata, remaining_features, target, parent_node_class
        )
    return tree

In [None]:
def predict(query,tree,default = 1):
    """Classify one record by walking a nested-dict decision tree.

    Parameters
    ----------
    query : dict
        Maps attribute name -> attribute value for the record.
    tree : dict or label
        ``{attribute: {value: subtree_or_label}}`` as built by ``ID3``;
        a non-dict is treated as a leaf and returned as-is.
    default : label, default 1
        Returned when the record cannot be routed to a leaf (an attribute
        value not present in the tree, or no attribute of ``query`` matching
        the current node).

    Returns
    -------
    label
        The class stored at the reached leaf, or ``default``.
    """
    # A tree built from pure data is just a label, not a dict.
    if not isinstance(tree, dict):
        return tree

    for key in query:
        if key in tree:
            try:
                result = tree[key][query[key]]
            except (KeyError, TypeError):
                # Value never seen for this attribute (or not usable as a key).
                return default
            if isinstance(result, dict):
                # Pass `default` down; previously the recursive call fell
                # back to the signature default (1) regardless of the
                # caller's choice.
                return predict(query, result, default)
            return result

    # No attribute of the query matches this node.
    return default

In [None]:
def train_test_split(dataset, train_size, random_state=None):
    """Shuffle ``dataset`` and split it into a training and a testing part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to shuffle and split.
    train_size : float
        Fraction (0.0 - 1.0) of the rows that go into the training part.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` to make the shuffle reproducible;
        ``None`` (default) gives a different shuffle on every call.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(training_data, testing_data)``, each with a fresh 0..n-1 index.
    """
    dataset_shuffled = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Size the split from the dataset passed in, not from the global
    # `data_train`, so the function is correct for frames of any length.
    split_at = int(len(dataset) * train_size)
    training_data = dataset_shuffled.iloc[:split_at].reset_index(drop=True)
    testing_data = dataset_shuffled.iloc[split_at:].reset_index(drop=True)
    return training_data, testing_data

In [None]:
def test(data,tree):
    """Return the accuracy (in percent) of ``tree`` on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Records to classify. Every column except the last is passed to
        ``predict`` as the query; the true labels are read from the
        ``"class"`` column.
    tree : dict or label
        Decision tree as produced by ``ID3``.

    Returns
    -------
    numpy.float64
        Percentage of rows whose prediction equals the true label.
    """
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    # Collect predictions in a list; growing a DataFrame one `.loc` row at a
    # time is very slow for thousands of rows.
    predictions = [predict(query, tree, 1.0) for query in queries]
    # Compare position by position so the result does not depend on `data`
    # carrying a default 0..n-1 index.
    actual = data["class"].to_numpy()
    correct = np.sum([bool(pred == truth) for pred, truth in zip(predictions, actual)])
    return (correct/len(data))*100

# **1-a**

In [None]:
# Task 1-a: grow five trees, each on a fresh random 25% sample of the
# training set, and score every tree on the (shuffled) full test set.
acc_list = []
print('---------------------')
for count in range(5):
    # Only the training half of the first split is used ...
    training_data, _ = train_test_split(data_train, train_size=0.25)
    # ... and with train_size=0.0 the second half is the whole test set.
    _, testing_data = train_test_split(data_test, train_size=0.0)

    tree = ID3(training_data, training_data, training_data.columns[:-1])
    acc = test(testing_data, tree)
    acc_list.append(acc)

    print('Tree #', count + 1)
    print('Tree Accuracy:', round(acc, 2), '%')
    print('---------------------')

print('Avrage Tree Accuracy:', round(sum(acc_list) / len(acc_list), 2), '%')
print('---------------------')


---------------------
Tree # 1
Tree Accuracy: 76.32 %
---------------------
Tree # 2
Tree Accuracy: 75.85 %
---------------------
Tree # 3
Tree Accuracy: 74.68 %
---------------------
Tree # 4
Tree Accuracy: 75.08 %
---------------------
Tree # 5
Tree Accuracy: 75.36 %
---------------------
Avrage Tree Accuracy: 75.46 %
---------------------


# **1-b**

In [None]:
# Task 1-b: average test accuracy over five trees for several training-set
# fractions, followed by a final run on 100% of the training data.
train_sizes = [0.25, 0.35, 0.45, 0.55, 0.65, 0.75]
ave_acc_list = []

print('---------------------')
for train_size in train_sizes:
    acc_list = []
    print('Training Size:', str(int(train_size*100)), '%')
    print('---------------------')
    for count in range(5):
        training_data, _ = train_test_split(data_train, train_size)
        # train_size=0.0 puts every (shuffled) test row in the second part.
        _, testing_data = train_test_split(data_test, train_size=0.0)
        tree = ID3(training_data,training_data,training_data.columns[:-1])
        acc = test(testing_data, tree)
        acc_list.append(acc)
        print('Tree #', str(count + 1), 'Accuracy:', str(round(acc, 2)),'%')
    print('---------------------')
    ave_acc_list.append(round(sum(acc_list) / len(acc_list), 2))
    print('Avrage Tree Accuracy for Training Size', str(int(train_size*100)), '% :', str(round(sum(acc_list) / len(acc_list), 2)), '%')
    print('---------------------')

# Train 100%
print('Training Size:', '100', '%')
acc_list = []
for count in range(5):
    training_data, _ = train_test_split(data_train, train_size=1.0)
    _, testing_data = train_test_split(data_test, train_size=0.0)
    tree = ID3(training_data,training_data,training_data.columns[:-1])
    acc = test(testing_data, tree)
    acc_list.append(acc)
    print('Tree #', str(count + 1), 'Accuracy:', str(round(acc, 2)),'%')
print('---------------------')
ave_acc_list.append(round(sum(acc_list) / len(acc_list), 2))
# Report the size actually used here (100%). The loop variable `train_size`
# still holds the last value from the loop above (0.75), so reusing it
# mislabelled this summary as 75%.
print('Avrage Tree Accuracy for Training Size', '100', '% :', str(round(sum(acc_list) / len(acc_list), 2)), '%')
print('---------------------')

---------------------
Training Size: 25 %
---------------------
Tree # 1 Accuracy: 75.29 %
Tree # 2 Accuracy: 76.24 %
Tree # 3 Accuracy: 75.27 %
Tree # 4 Accuracy: 75.57 %
Tree # 5 Accuracy: 76.09 %
---------------------
Avrage Tree Accuracy for Training Size 25 % : 75.69 %
---------------------
Training Size: 35 %
---------------------
Tree # 1 Accuracy: 76.06 %
Tree # 2 Accuracy: 76.58 %
Tree # 3 Accuracy: 76.26 %
Tree # 4 Accuracy: 75.99 %
Tree # 5 Accuracy: 76.22 %
---------------------
Avrage Tree Accuracy for Training Size 35 % : 76.22 %
---------------------
Training Size: 45 %
---------------------
Tree # 1 Accuracy: 77.27 %
Tree # 2 Accuracy: 76.86 %
Tree # 3 Accuracy: 76.85 %
Tree # 4 Accuracy: 77.09 %
Tree # 5 Accuracy: 76.61 %
---------------------
Avrage Tree Accuracy for Training Size 45 % : 76.94 %
---------------------
Training Size: 55 %
---------------------
Tree # 1 Accuracy: 76.71 %
Tree # 2 Accuracy: 76.87 %
Tree # 3 Accuracy: 76.66 %
Tree # 4 Accuracy: 76.41 %
Tre