## Importing libraries

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Reading data

###### since dataset does not contain header, adding header manually

In [9]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num','marital-status', 'occupation', 'relationship', 'race', 'gender','capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income']
data = pd.read_csv("adult.data",names=column_names)

In [10]:
data.shape

(32561, 15)

In [11]:
pd.set_option('display.max_columns', None)

In [12]:
data.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## Handling missing values

###### in our data, none value is represented using ?, replacing ? with none value

In [13]:
data.replace(' ?',np.nan, inplace = True)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   workclass        30725 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education        32561 non-null  object
 4   educational-num  32561 non-null  int64 
 5   marital-status   32561 non-null  object
 6   occupation       30718 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   gender           32561 non-null  object
 10  capital-gain     32561 non-null  int64 
 11  capital-loss     32561 non-null  int64 
 12  hours-per-week   32561 non-null  int64 
 13  native-country   31978 non-null  object
 14  income           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [15]:
missing_data = pd.DataFrame({'total_missing': data.isnull().sum(), 'percentage_missing': (data.isnull().sum()/32561)*100})
missing_data

Unnamed: 0,total_missing,percentage_missing
age,0,0.0
workclass,1836,5.638647
fnlwgt,0,0.0
education,0,0.0
educational-num,0,0.0
marital-status,0,0.0
occupation,1843,5.660146
relationship,0,0.0
race,0,0.0
gender,0,0.0


###### analysing null values in diffrent columns

In [16]:
data[data['workclass'].isnull()].head(50)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
27,54,,180211,Some-college,10,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,,293936,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,40,,<=50K
69,25,,200681,Some-college,10,Never-married,,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,,212759,10th,6,Married-civ-spouse,,Husband,White,Male,0,0,2,United-States,<=50K
106,17,,304873,10th,6,Never-married,,Own-child,White,Female,34095,0,32,United-States,<=50K
128,35,,129305,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,<=50K
149,43,,174662,Some-college,10,Divorced,,Not-in-family,White,Female,0,0,40,United-States,<=50K
154,52,,252903,HS-grad,9,Divorced,,Not-in-family,White,Male,0,0,45,United-States,>50K
160,68,,38317,1st-4th,2,Divorced,,Not-in-family,White,Female,0,0,20,United-States,<=50K
187,53,,135105,Bachelors,13,Divorced,,Not-in-family,White,Female,0,0,50,United-States,<=50K


###### both workclass and occupation columns have null value together , therefore its best to remove rows with null value becuase in a single row, we have two null values

In [17]:
data = data.dropna(axis=0, subset=['occupation','workclass'])

In [18]:
data.shape

(30718, 15)

In [19]:
data[data['native-country'].isnull()].head(50)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,,>50K
38,31,Private,84154,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,38,,>50K
51,18,Private,226956,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,30,,<=50K
93,30,Private,117747,HS-grad,9,Married-civ-spouse,Sales,Wife,Asian-Pac-Islander,Female,0,1573,35,,<=50K
245,56,Private,203580,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,35,,<=50K
249,45,Private,153141,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,,<=50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,,>50K
453,42,Private,197583,Assoc-acdm,12,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,40,,>50K
557,31,Private,323069,HS-grad,9,Separated,Adm-clerical,Unmarried,White,Female,0,0,20,,<=50K
712,36,Private,271767,Bachelors,13,Separated,Prof-specialty,Not-in-family,White,Male,0,0,40,,<=50K


###### deleting rows with null value in native country because we have very less null values in native country column, deleting these will not effect  our prediction much

In [20]:
data = data.dropna(axis=0, subset=['native-country'])

In [21]:
data.shape

(30162, 15)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              30162 non-null  int64 
 1   workclass        30162 non-null  object
 2   fnlwgt           30162 non-null  int64 
 3   education        30162 non-null  object
 4   educational-num  30162 non-null  int64 
 5   marital-status   30162 non-null  object
 6   occupation       30162 non-null  object
 7   relationship     30162 non-null  object
 8   race             30162 non-null  object
 9   gender           30162 non-null  object
 10  capital-gain     30162 non-null  int64 
 11  capital-loss     30162 non-null  int64 
 12  hours-per-week   30162 non-null  int64 
 13  native-country   30162 non-null  object
 14  income           30162 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Transforming data

In [23]:
numeric_columns = ['age','fnlwgt','educational-num','capital-gain','capital-loss','hours-per-week']
categorical_columns = data.select_dtypes(include=['object', 'category']).columns

In [24]:
categorical_columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country', 'income'],
      dtype='object')

In [25]:
data[categorical_columns] = data[categorical_columns].astype('category')

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   age              30162 non-null  int64   
 1   workclass        30162 non-null  category
 2   fnlwgt           30162 non-null  int64   
 3   education        30162 non-null  category
 4   educational-num  30162 non-null  int64   
 5   marital-status   30162 non-null  category
 6   occupation       30162 non-null  category
 7   relationship     30162 non-null  category
 8   race             30162 non-null  category
 9   gender           30162 non-null  category
 10  capital-gain     30162 non-null  int64   
 11  capital-loss     30162 non-null  int64   
 12  hours-per-week   30162 non-null  int64   
 13  native-country   30162 non-null  category
 14  income           30162 non-null  category
dtypes: category(9), int64(6)
memory usage: 1.9 MB


In [27]:
for i in categorical_columns:
    data[i]=data[i].cat.codes

In [28]:
data.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0
5,37,2,284582,12,14,2,3,5,4,0,0,0,40,38,0
6,49,2,160187,6,5,3,7,1,2,0,0,0,16,22,0
7,52,4,209642,11,9,2,3,0,4,1,0,0,45,38,1
8,31,2,45781,12,14,4,9,1,4,0,14084,0,50,38,1
9,42,2,159449,9,13,2,3,0,4,1,5178,0,40,38,1


###### dropping fnlwgt, capital gain and capital-loss to simplify decision tree prediction

In [29]:
data=data.drop(['fnlwgt','capital-gain','capital-loss'],axis = 1)

In [30]:
data.head()

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,hours-per-week,native-country,income
0,39,5,9,13,4,0,1,4,1,40,38,0
1,50,4,9,13,2,3,0,4,1,13,38,0
2,38,2,11,9,0,5,1,4,1,40,38,0
3,53,2,1,7,2,5,0,2,1,40,38,0
4,28,2,9,13,2,9,5,2,0,40,4,0


## ID3 Algorithm

In [31]:
# function to find entropy of feature
def find_entropy(feature):
    uniq_elements, elements_count = np.unique(feature, return_counts = True)
    total = np.sum(elements_count)
    entropy = 0.0
    for i in range(len(elements_count)):
        log_value = 0 if uniq_elements[i]==0 else np.log2(uniq_elements[i]/total)
        entropy = entropy + (-((uniq_elements[i]/total)*(log_value)))
    return entropy

In [32]:
# funtion to calculate information gain
# informatino gain = total entropy - weighted entropy
def find_info_gain(data, split_attribute_name, target="income"):
    total_entropy = find_entropy(data[target])
    uniq_elements, elements_count = np.unique(data[split_attribute_name], return_counts = True)
    total = np.sum(elements_count)
    weighted_entropy = 0.0
    for i in range(len(elements_count)):
        weighted_entropy += (uniq_elements[i]/total)*find_entropy(data[data[split_attribute_name] == uniq_elements[i]][target])
    information_gain = total_entropy - weighted_entropy
    return information_gain

In [33]:
find_info_gain(data,'age')

-0.009760536073550487

In [34]:
def ID3_Algo(data, original_data, features, target="income",parent_node_class=None):
#     return mode of target column if we get fully pure node
    if len(np.unique(data[target]))<=1:
        return np.unique(data[target])[0]
    
#     return mode of database if there is no more data
    elif data.empty:
        return original_data[target].mode()[0]
    
#     return class of parent node if there are no more features left
    if len(features)==0:
        return parent_node_class
    
    else:
        parent_node_class = original_data[target].mode()[0]
#         finding info gain of all features
        features_info_gain = [find_info_gain(data,i,target) for i in features]
#     finding feature with max info gain
        best_feature_index = np.argmax(features_info_gain)
        best_feature = features[best_feature_index]
        
#         initialising the tree
        tree= {best_feature:{}}
        
#         removing used features from avaialable features
        features = np.delete(features, best_feature_index)
        
        uniq_values = np.unique(data[best_feature])
        
#         going deep to find the conclusion or prediction
        for value in uniq_values:
#         finding data where best feature value is equal to current value
            sub_data = data[data[best_feature]==value]
#     recurring for each subdataa
            subtree = ID3_Algo(sub_data,original_data,features,target,parent_node_class)
#     assigning subtree as a child of parent node
            tree[best_feature][value]=subtree
        
    return tree
        

In [35]:
train=data.sample(frac=0.8,random_state=100) #random state is a seed value
test=data.drop(train.index).sample(frac=1.0)
train_cols = data.columns.drop('income')
ID3_Algo(train,train,train_cols,'income')

{'gender': {0: {'relationship': {0: 0,
    1: {'marital-status': {0: {'race': {0: {'education': {0: 0,
          8: 0,
          11: 0,
          12: 1,
          15: 0}},
        1: {'workclass': {2: {'occupation': {0: 0,
            2: 0,
            3: {'age': {30: 0, 37: 1}},
            6: 0,
            11: 0}},
          4: 1,
          5: 0}},
        2: {'workclass': {0: {'age': {40: 0,
            44: 0,
            51: 0,
            56: 0,
            58: 1,
            64: 0}},
          1: {'age': {32: 0, 41: 0, 47: 0, 50: 0, 53: 1, 57: 0, 62: 0, 67: 0}},
          2: {'occupation': {0: {'age': {29: 0,
              33: 0,
              37: 0,
              40: 0,
              42: 0,
              47: 0,
              53: 0,
              58: 1,
              59: 0}},
            2: 0,
            3: {'age': {41: 0, 43: 0, 49: 1, 53: 0, 54: 1}},
            5: 0,
            6: 0,
            7: 0,
            8: 0,
            9: 0,
            10: 0,
            12: 0,