In [199]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, LabelEncoder

In [217]:
from id3 import ID3
import id3

In [218]:
from C45 import C45,draw,mean

## Import the training data set

In [250]:
Columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']

In [251]:
training_data = pd.read_csv('./census/adult.data', names=Columns)
training_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


## Import test data set

The first line of adult.test file is irrelevant information so we manually delete it.

In [256]:
Columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']

In [257]:
test_data = pd.read_csv('./census/adult.test', names=Columns)

## Preprocessing and functionalized training + testing

In [258]:
def id3_preprocess(raw_data, features, drop_feature, num_data):
    
    
    drop_feature += ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'income-level']
    
    for feature in drop_feature:
        features.remove(feature)
    
    X = raw_data.drop(drop_feature,1)[:num_data]
    y = raw_data['income-level'][:num_data]
        
    return X.as_matrix(), y.as_matrix(), features

In [272]:
def c45_preprocess(raw_data, features, drop_feature, num_data):
        
    drop_feature += ['fnlwgt', 'income-level']
    
    X = raw_data.drop(drop_feature,1)[:num_data]
    y = raw_data['income-level'][:num_data]
    
    for feature in drop_feature:
        features.remove(feature)
    
    con_set = set()
    con_list = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    for i in con_list:
        if i in features:
            con_set.add(features.index(i))
    
    return X.as_matrix(),y.as_matrix(),features, con_set

In [273]:
def train_drawing(raw_data, features, drop_feature, depth, num_data,version,filename):
    
    if version == 'c45':
        X_train, y_train, feature_list, con_set = c45_preprocess(raw_data, features, drop_feature, num_data)
                
        root = C45(continuous=con_set, max_depth=depth)
        root.fit(X_train, y_train)
        
        result_list = ['<=50K', '>50K']
        
        draw(root,feature_list, result_list, filename)
        
        return root
        
    elif version == 'id3':
        
        X_train, y_train, feature_list = id3_preprocess(raw_data, features, drop_feature, num_data)
        
        root = id3.ID3(max_depth=depth)
        root.fit(X_train, y_train)
        
        result_list = ['<=50K', '>50K']
        
        id3.draw(root,feature_list, result_list, filename)
        
        return root
        
    else:
        print "version can only be either c45 or id3"

In [274]:
def test(model, test_data, features, drop_features, version, num_data):
    
    if version == 'c45':
        
        X_test, y_test, features, con_set = c45_preprocess(test_data, features, drop_features, num_data)
        
        return mean([model.score(X_test, y_test) for _ in range(100)])

    
    if version == 'id3':
        
        X_test, y_test, features = id3_preprocess(test_data, features, drop_features, num_data)

        return mean([model.score(X_test, y_test) for _ in range(100)])

### Using pruning

In [321]:
drop_feature = ['fnlwgt', 'income-level']
X_train = training_data.drop(drop_feature,1).apply(LabelEncoder().fit_transform).as_matrix()
y_train = training_data.apply(LabelEncoder().fit_transform).as_matrix()[:,14]

X_test = test_data.drop(drop_feature,1).apply(LabelEncoder().fit_transform).as_matrix()
y_test = test_data.apply(LabelEncoder().fit_transform).as_matrix()[:,14]

In [322]:
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=3)

In [323]:
clf = C45(continuous={0, 9, 10, 3, 11})
clf.fit(X_train_sub, y_train_sub)
clf.prune(X_val, y_val)

(2128, 4465)

In [324]:
print "train accuracy = %.5f" % clf.score(X_train, y_train)
print "test accuracy = %.5f" % clf.score(X_test, y_test)

train accuracy = 0.88566
test accuracy = 0.84675


In [325]:
mean([clf.score(X_test, y_test) for _ in range(100)])

0.8467538848965048

### Using random forest

In [304]:
import random_forest

In [305]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 2
num_data = 10000

X_train, y_train, feature_list, con_set = c45_preprocess(training_data, features, drop_feature, num_data)

In [306]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 2
num_data = 10000

X_test, y_test, feature_list, con_set = c45_preprocess(test_data, features, drop_feature, num_data)

In [307]:
for k in [0.02, 0.05, 0.1]:
    clf = random_forest.RandomForest(num_trees=10, continuous=con_set)
    clf.fit(X_train, y_train, k=k)

    acc = mean([clf.score(X_test, y_test) for _ in range(100)])
    print "k=%.2f got %.5f accuracy" % (k, acc)

k=0.02 got 0.81800 accuracy
k=0.05 got 0.82390 accuracy
k=0.10 got 0.82380 accuracy


In [308]:
clf.score(X_test, y_test)

0.8238

# Result from sklearn

In [194]:
from sklearn import tree

In [178]:
drop_feature = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt', 'income-level']

In [179]:
X_train = training_data.drop(drop_feature,1).apply(LabelEncoder().fit_transform).as_matrix()
y_train = training_data.apply(LabelEncoder().fit_transform).as_matrix()[:,14]

In [180]:
X_test = test_data.drop(drop_feature,1).apply(LabelEncoder().fit_transform).as_matrix()
y_test = test_data.apply(LabelEncoder().fit_transform).as_matrix()[:,14]

In [181]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [182]:
print clf.score(X_train, y_train)
print mean([clf.score(X_test, y_test) for _ in range(100)])

0.865636804766
0.815429027701


In [251]:
training_data = pd.read_csv('./census/adult.data', names=Columns)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [257]:
test_data = pd.read_csv('./census/adult.test', names=Columns)

## Implementing ID3

### obtain the model with depth equals 2

In [277]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 2
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_depth_2.png')

In [278]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8227381610466168

### depth equals 3

In [279]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_depth_3.png')

In [280]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8231066887783297

### depth equals 4

In [281]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 4
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_depth_4.png')

In [282]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8181315644002197

### depth equals 5

In [283]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 5
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_depth_5.png')

In [284]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8170874024937034

## Implementing C4.5

### model with depth equals 2

In [285]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 2
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_depth_2.png')

In [286]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

print test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.829494502795


### depth equals 3

In [287]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_depth_3.png')

In [288]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8377863767581845

### depth equals 4

In [289]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 4
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_depth_4.png')

In [290]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8356980529451491

### depth equals 5

In [291]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
depth = 5
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_depth_5.png')

In [292]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = []
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8312757201646107

# Using subfeatures for training

## marital-status and relationship seems to be redundant, so we are going to use only one of them for training

### implementing ID3 with depth 3

##### without marital-status

In [311]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_no_ma.png')

In [312]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status']
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8230452674897127

##### Without relationship

In [313]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['relationship']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_no_re.png')

In [314]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['relationship']
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.8242122719734654

##### Without both

In [309]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status','relationship']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_neither.png')

In [310]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status','relationship']
num_data = len(test_data)

test(root, test_data, features, drop_feature,version='id3', num_data = num_data)

0.7864996007616243

### impleenting C4.5 with depth 3

##### Without marital-status

In [315]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_no_ma.png')

In [316]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8380320619126576

##### Without relationship

In [319]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['relationship']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_no_re.png')

In [320]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['relationship']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8360051593882448

##### Without both

In [296]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status', 'relationship']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_neither.png')

In [297]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['marital-status','relationship']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.7974940114243612

## We guess the feature "native-country" is not so important for income-level, so we remove this featrue

### implenting ID3 with depth 3

In [298]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version='id3', filename = './img/id3_census_no_nc.png')

In [299]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.7637737239727284

### implenting C45 with depth 3

In [300]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
depth = 3
num_data = 10000

root = train_drawing(training_data, features, drop_feature, depth, num_data, version = 'c45', filename = './img/c45_census_no_nc.png')

In [301]:
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income-level']
drop_feature = ['native-country']
num_data = len(test_data)

test(root, test_data, features, drop_feature, version='c45', num_data = num_data)

0.8369879000061411