In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.model_selection import train_test_split

In [2]:
import sys
sys.path.append('./inc/')
import ID3

In [3]:
from C45 import C45,draw

## Import the training data set

In [4]:
Columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']

In [5]:
training_data = pd.read_csv('./census data/adult.data', names=Columns)
training_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


### To check if there is any data is missing

In [64]:
training_data.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income-level         0
dtype: int64

## Import test data set

In [7]:
Columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']

In [8]:
test_data = pd.read_csv('./census data/adult.test', names=Columns)

In [66]:
test_data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income-level      0
dtype: int64

## Preprocessing and functionalized training + testing

In [9]:
def id3_preprocess(raw_data, features, drop_feature, num_data):
    
    
    drop_feature += ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'income-level']
    
    for feature in drop_feature:
        features.remove(feature)
    
    X = raw_data.drop(drop_feature,1)[:num_data]
    y = raw_data['income-level'][:num_data]
    
    return X.as_matrix(), y.as_matrix(), features

In [10]:
def c45_preprocess(raw_data, features, drop_feature, num_data):
        
    drop_feature += ['fnlwgt', 'income-level']
    
    X = raw_data.drop(drop_feature,1)[:num_data]
    y = raw_data['income-level'][:num_data]
    
    #print X.head()
    #print y
    
    for feature in drop_feature:
        features.remove(feature)
    
    con_set = set()
    con_list = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    for i in con_list:
        if i in features:
            #print i
            con_set.add(features.index(i))
            #print features.index(i)
    
    #print con_set
    return X.as_matrix(),y.as_matrix(),features, con_set

In [11]:
def train_drawing(raw_data, features, drop_feature, depth, num_data,version,filename):
    
    if version == 'c45':
        X_train, y_train, feature_list, con_set = c45_preprocess(raw_data, features, drop_feature, num_data)
                
        root = C45(continuous=con_set, max_depth=depth)
        root.fit(X_train, y_train)
        
        result_list = ['<=50K', '>50K']
        
        draw(root,feature_list, result_list, filename)
        
        return root
        
    elif version == 'id3':
        
        X_train, y_train, feature_list = id3_preprocess(raw_data, features, drop_feature, num_data)
        
        root = ID3.ID3(max_depth=depth)
        root.fit(X_train, y_train)
        
        result_list = ['<=50K', '>50K']
        
        ID3.draw(root,feature_list, result_list, filename)
        
        return root
        
    else:
        print "version can only be either c45 or id3"

In [12]:
def test(model, test_data, features, drop_features, version, num_data):
    
    if version == 'c45':
        
        X_test, y_test, features, con_set = c45_preprocess(test_data, features, drop_features, num_data)
        
        return root.score(X_test, y_test)
    
    if version == 'id3':
        
        X_test, y_test, features = id3_preprocess(test_data, features, drop_features, num_data)

        return root.score(X_test, y_test)

# Result from sklearn

In [61]:
from sklearn import tree

In [63]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = 10000

clf = tree.DecisionTreeClassifier()

X_test, y_test, feature_list = id3_preprocess(test_data, features, drop_feature, num_data)

print type(X_test),type(y_test)

clf.fit(X_test, y_test)

<type 'numpy.ndarray'> <type 'numpy.ndarray'>


ValueError: could not convert string to float: United-States

In [None]:
clf.score(X_test, y_test)

## Implementing ID3

### obtain the model with depth equals 2

In [13]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 2
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './id3_census_training.png')

In [14]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8227381610466188

### depth equals 3

In [18]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './id3_census_training.png')

In [19]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8232909526441865

### depth equals 4

In [22]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 4
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './id3_census_training.png')

In [23]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8200356243473989

## Implementing C4.5

### model with depth equals 2

In [15]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 2
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './c45_census_training.png')

In [16]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8270376512499232

### depth equals 3

In [20]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './c45_census_training.png')

In [21]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8328726736686936

### depth equals 4

In [24]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 4
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './c45_census_training.png')

In [25]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8313985627418463

### depth equals 5

In [26]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 5
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './c45_census_training.png')

In [27]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8322584607825072

# Using subfeatures for training

## education and occupation are stronly related, so we are going to use only one of them.

### implementing ID3 with depth 3

##### without education

In [30]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['education']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './id3_census_training.png')

In [31]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['education']
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8208955223880597

##### Without occupation

In [32]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['occupation']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './id3_census_training.png')

In [33]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['occupation']
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8201584669246361

### impleenting C4.5 with depth 3

##### Without education

In [35]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['education']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './c45_census_training.png')

In [36]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['education']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8340396781524476

##### Without occupation

In [37]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['occupation']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './c45_census_training.png')

In [38]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['occupation']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8387691173760825

## We guess the feature "native-country" is not so important for income-level, so we remove this featrue

### implenting ID3 with depth 3

In [50]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './id3_census_training.png')

In [51]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.7637737239727289

### implenting C45 with depth 3

In [52]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './c45_census_training.png')

In [53]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8345310484613967

### Using merged features for training

#### under high school
11th, 9th, 7th-8th, 12th, 1st-4th, 10th, 5th-6th are merged together.
#### working for gov
Federal-gov, Local-gov, State-gov are merged together.
