In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import pickle

from collections import Counter
import itertools
from dbfread import DBF
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.tree import _tree
from sklearn.externals import joblib
from sklearn.tree import DecisionTreeClassifier as DTC

from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn import tree
from imblearn.over_sampling import SMOTE
from tabulate import tabulate
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas SettingWithCopyWarning and library
# deprecation notices raised by later cells - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Let pandas display up to 50 columns before truncating a frame.
pd.set_option('display.max_columns', 50)

In [2]:
# Import data
# Step 1: open the four DBF tables.
idap_dbf = DBF('R:/Projects/20713A_Ohio3C/HIS/IDAP.dbf')
hhtype_dbf = DBF('R:/Projects/20713A_Ohio3C/HIS/hhtype.dbf')
pertype_dbf = DBF('R:/Projects/20713A_Ohio3C/HIS/PERTYPE.dbf')
accessibility_dbf = DBF('R:/Projects/20713A_Ohio3C/Accessibility/ACCESSIBILITY_MEASURES.dbf')

# Step 2: read every table's records into a DataFrame.
idap, hhtype, pertype, accessibility = (
    pd.DataFrame(iter(table))
    for table in (idap_dbf, hhtype_dbf, pertype_dbf, accessibility_dbf)
)

In [3]:
# Process IDAP Data
# Code -> label lookups for the categorical survey fields.
genders = {1:'Male', 2:'Female'}
pertypes = {1:'FT_Worker', 2:'PT_Worker', 3:'Univ_Stud', 4:'Non_Worker', 5:'Retiree', 6:'DA_Stud', 7:'PDA_Stud', 8:'PreSch_Child'}
idaps = {1:'M', 2:'NM', 3:'H'}

# Keep MPO == 1 records. .copy() yields an independent frame so the column
# assignments below do not write into a slice of the raw table.
idap = idap[idap['MPO'] == 1].copy()

# Replace numeric codes by labels; codes missing from a lookup become NaN.
idap['GENDER'] = idap['GENDER'].map(genders)
idap['PERTYPE'] = idap['PERTYPE'].map(pertypes)
idap['IDAP'] = idap['IDAP'].map(idaps)

# Keep the modelling columns only and drop rows with any missing/unmapped value.
idap = idap[['SAMPN', 'PERNO', 'GENDER', 'PERTYPE', 'AGE', 'IDAP']].dropna()

# Cast AGE to integer. The assignment must go through ['AGE']: attribute-style
# `idap.age = ...` would only set a Python attribute and leave the column as is.
# The cast runs after dropna() because astype(int) raises on missing values.
idap['AGE'] = idap['AGE'].astype(int)


In [4]:
# Process household type data
# Income category code -> label (codes 1-2 and 4-5 are pooled).
incomes = {1: '$49,999 or less', 2: '$49,999 or less', 3: '$50,000 to $74,999', 4: '$75,000 or more',
           5: '$75,000 or more', 9: 'DK/RF'}

# Keep MPO == 1 households; .copy() so the assignments below act on an
# independent frame rather than a slice of the raw table.
hhtype = hhtype[hhtype['MPO'] == 1].copy()

hhtype['HINCCAT1'] = hhtype['HINCCAT1'].map(incomes)
# Missing worker counts are treated as zero workers.
hhtype['HWORK_F'] = hhtype['HWORK_F'].fillna(0)
hhtype['HWORK_P'] = hhtype['HWORK_P'].fillna(0)
hhtype['WORKERS'] = hhtype['HWORK_F'] + hhtype['HWORK_P']

# Calculate car sufficiency
vehicles = hhtype['HHVEH']
workers = hhtype['WORKERS']

# A missing vehicle count matches none of the categories below, so fail with a
# clear message instead of producing a partially filled column.
if vehicles.isnull().any():
    raise ValueError('HHVEH is missing for {} household(s); cannot derive CAR_SUFF'
                     .format(int(vehicles.isnull().sum())))

car_sufficiency = pd.Series(np.nan, index=hhtype.index, dtype=object)
car_sufficiency[vehicles > workers] = 'More Cars than Workers'
car_sufficiency[vehicles == workers] = 'Cars Equals to Workers'
car_sufficiency[vehicles < workers] = 'Fewer Cars than Workers'
# Assigned last so that zero-vehicle households are always 'Zero Cars',
# including those with zero workers.
car_sufficiency[vehicles == 0] = 'Zero Cars'

hhtype['CAR_SUFF'] = car_sufficiency


hh_features = hhtype[['SAMPN', 'CAR_SUFF', 'HINCCAT1', 'HHTAZ']]

In [5]:
# Process person type data
# Education code -> two-level label (plus 'DK/RF' for code 99).
educations = {1: 'Less than Bachelors', 2: 'Less than Bachelors', 3: 'Less than Bachelors',
              4: 'Less than Bachelors', 5: 'Bachelors or Higher Degree', 6: 'Bachelors or Higher Degree',
              7: 'Less than Bachelors', 99: 'DK/RF'}

# NOTE(review): code 1 -> 'More than One Job' and code 2 -> 'One Job' look
# swapped - confirm against the survey codebook.
numjobs = {0: 'No Job', 1: 'More than One Job', 2: 'One Job', 99: 'DK/RF'}

# NOTE(review): there is no entry for code 4; such rows map to NaN and are then
# labelled 'No Job Type' below - confirm this is intended.
jobtypes = {1: 'Agriculture and Mining', 2: 'Transportation, Utilities and Warehousing',
            3: 'Manufacturing and Wholesale Trade', 5: 'Information, Professor/Scientist, Management, Admin',
            6: 'Education', 7: 'Finance & Real Estate', 8: 'Arts/Entertainment', 9: 'Public Admin',
            10: 'Health', 11: 'Other', 12: 'MORPC INDUS=80', 13: 'Retail'}

# Keep MPO == 1 persons; .copy() so the column assignments below act on an
# independent frame rather than a slice of the raw table.
pertype = pertype[pertype['MPO'] == 1].copy()

pertype['EDUCA'] = pertype['EDUCA'].map(educations)
pertype['JOBS'] = pertype['JOBS'].map(numjobs)
# Job types with no label (missing or unmapped code) become 'No Job Type'.
pertype['JOBTYPE'] = pertype['JOBTYPE'].map(jobtypes).fillna('No Job Type')

# Independent copy: a later cell adds a PERID column to this frame.
per_features = pertype[['SAMPN', 'PERNO', 'EDUCA', 'JOBS', 'JOBTYPE', 'WU_DIST', 'SU_DIST']].copy()

In [6]:
# Process accessibility data
# Keep MPO == 1 rows, then the TAZ key together with the three access measures.
mpo_rows = accessibility['MPO'] == 1
accessibility = accessibility.loc[mpo_rows]
access_features = accessibility.loc[:, ['TAZ', 'ACCESS7', 'ACCESS8', 'ACCESS9']]

In [7]:
# Build a person key (PERID) shared by the IDAP table and the person features.
# reset_index keeps the old index as an 'index' column; the merge cell drops it.
idap.reset_index(inplace=True)

# Household id and person number are joined with '_' so that different
# (SAMPN, PERNO) pairs can never produce the same key
# (without a separator 11 + 1 and 1 + 11 would both give '111').
idap['PERID'] = idap['SAMPN'].astype(str) + '_' + idap['PERNO'].astype(str)

# assign() returns a new frame, so nothing is written into a slice of pertype.
per_features = per_features.assign(
    PERID=per_features['SAMPN'].astype(str) + '_' + per_features['PERNO'].astype(str)
)

In [8]:
# Merge household data
# Chain the three joins: household attributes by SAMPN, person attributes by
# PERID (inner joins), then the accessibility measures of the home TAZ (left join).
features = (
    idap
    .merge(hh_features, on='SAMPN', how='inner')
    .merge(per_features.drop(['SAMPN', 'PERNO'], axis=1), on='PERID', how='inner')
    .merge(access_features, left_on='HHTAZ', right_on='TAZ', how='left')
)
# The join keys and the leftover 'index' column are no longer needed.
features = features.drop(['TAZ', 'HHTAZ', 'index'], axis=1)

In [9]:
# Assign accessibilities
# Pick the access measure that matches each person's household car sufficiency:
#   'Zero Cars'               -> ACCESS7
#   'Fewer Cars than Workers' -> ACCESS8
#   anything else             -> ACCESS9
# The labels must match the CAR_SUFF strings built in the household cell
# exactly (note the lower-case "than"); a differently-cased label would never
# match and every such household would silently fall through to ACCESS9.
is_zero_cars = features['CAR_SUFF'] == 'Zero Cars'
is_fewer_cars = features['CAR_SUFF'] == 'Fewer Cars than Workers'

# A separate name is used so the `accessibility` DataFrame is not overwritten.
raw_access = np.where(is_zero_cars, features['ACCESS7'],
                      np.where(is_fewer_cars, features['ACCESS8'], features['ACCESS9']))

# log(access + 1); rows without a matching TAZ stay NaN.
features['ACCESSIBILITY'] = np.log(np.asarray(raw_access, dtype=float) + 1)
features.drop(['ACCESS7', 'ACCESS8', 'ACCESS9'], axis=1, inplace=True)

In [10]:
# Assign work or school distance based on person type
person_type = features['PERTYPE']
work_dist = features['WU_DIST']
school_dist = features['SU_DIST']

# (condition, distance) pairs, checked in order - the first condition that
# holds decides. Workers take the work distance, students the school distance,
# everyone else whichever distance is positive (work first); 999 otherwise.
distance_rules = [
    (person_type.isin(['FT_Worker', 'PT_Worker']), work_dist),
    (person_type.isin(['DA_Stud', 'PDA_Stud', 'Univ_Stud']), school_dist),
    (work_dist > 0, work_dist),
    (school_dist > 0, school_dist),
]
distances = np.select(
    [condition for condition, _ in distance_rules],
    [distance for _, distance in distance_rules],
    default=999,
)

features['DIST'] = distances
# Drop the source columns and the merge key, then index people by household id.
features = features.drop(['WU_DIST', 'SU_DIST', 'PERID'], axis=1).set_index('SAMPN')

In [11]:
# Household interaction counts: for every person, the number of OTHER members
# of the same household in each (person type, activity pattern) combination.
# Column names follow '<PERTYPE>_<IDAP>'.
hh_interactions_columns = ['Retiree_NM', 'Retiree_M', 'Retiree_H', 'FT_Worker_NM', 'FT_Worker_M', 'FT_Worker_H',
                           'PT_Worker_NM', 'PT_Worker_M', 'PT_Worker_H', 'DA_Stud_NM', 'DA_Stud_M', 'DA_Stud_H',
                           'PreSch_Child_NM', 'PreSch_Child_M', 'PreSch_Child_H', 'PDA_Stud_NM', 'PDA_Stud_M', 'PDA_Stud_H',
                           'Non_Worker_NM', 'Non_Worker_M', 'Non_Worker_H', 'Univ_Stud_NM', 'Univ_Stud_M', 'Univ_Stud_H']

# `features` is indexed by SAMPN at this point (set in the distance cell), so
# its index identifies each person's household.
household_ids = features.index.values

# Each person's own combination, e.g. 'FT_Worker_M', on a positional index.
person_combo = (features['PERTYPE'] + '_' + features['IDAP']).reset_index(drop=True)

unexpected = sorted(set(person_combo.dropna()) - set(hh_interactions_columns))
if unexpected:
    raise ValueError('Unexpected PERTYPE/IDAP combinations: {}'.format(unexpected))

# One indicator column per combination, named from the data itself, so the
# labels are right whatever order the values appear in. reindex() fixes the
# column order and adds all-zero columns for combinations absent from the data.
own_indicator = (
    pd.get_dummies(person_combo)
    .reindex(columns=hh_interactions_columns, fill_value=0)
    .astype(float)
)

# Household totals per combination minus the person's own indicator gives the
# count of other members; single-person households are therefore all zeros.
household_totals = own_indicator.groupby(household_ids).transform('sum')
hh_interactions_final = household_totals - own_indicator

# Both frames now share the same 0..n-1 index, so the column-wise concat lines
# the counts up with the right person.
features.reset_index(inplace=True)
features = pd.concat([features, hh_interactions_final], axis=1)

In [441]:
# Fix age for pre-driving age students
# (person type, age above which the value is replaced, replacement age)
age_fixes = [('PDA_Stud', 18, 12), ('PreSch_Child', 10, 5)]
for pertype_label, age_limit, replacement_age in age_fixes:
    needs_fix = (features['PERTYPE'] == pertype_label) & (features['AGE'] > age_limit)
    features.loc[needs_fix, 'AGE'] = replacement_age

In [12]:
# Define x, y split function
def create_x_y(left_branch, right_branch):
    """Separate predictors from the 'IDAP' target for a pair of branches.

    Parameters
    ----------
    left_branch, right_branch : pandas.DataFrame
        Frames that each contain an 'IDAP' column.

    Returns
    -------
    tuple
        ``(X_left, Y_left, X_right, Y_right)`` where each ``X_*`` is the branch
        without the 'IDAP' column and each ``Y_*`` is that column.
    """
    target = 'IDAP'
    pieces = []
    for branch in (left_branch, right_branch):
        pieces.append(branch.drop(target, axis=1))
        pieces.append(branch[target])
    return tuple(pieces)

In [13]:
# Define custom splitter function
def specify_split(X, Y, feature):
    """Force a split on one chosen feature and partition the data around it.

    A depth-1 decision tree is fitted on the single column ``feature``; the
    threshold stored for its root node is then used to cut the joined
    target-plus-predictors table in two.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; must contain ``feature``.
    Y : pandas.Series
        Target, joined to ``X`` on the index.
    feature : str
        Name of the column to split on.

    Returns
    -------
    tuple
        ``(left_branch, right_branch, dt_model)``: the rows with
        ``feature <= threshold``, the rows with ``feature > threshold`` (both
        with the target as first column), and the fitted depth-1 tree.
    """
    stump = DTC(random_state=123, max_depth=1).fit(X[[feature]], Y)
    cut_point = stump.tree_.threshold[0]

    combined = pd.DataFrame(Y).join(X)
    at_or_below = combined[feature] <= cut_point
    above = combined[feature] > cut_point
    return combined[at_or_below], combined[above], stump

In [226]:
def tree_to_webgraph_top(tree, feature_name, outfile_name):
    """Start a Graphviz DOT file and write the split node(s) of ``tree`` to it.

    The file is created (or truncated) and left without a closing brace so
    that later calls can append the rest of the graph.

    Parameters
    ----------
    tree : fitted decision tree estimator
        Must expose ``tree_``; ``classes_`` is used for the class labels when
        available, otherwise the order H, M, NM is assumed.
    feature_name : str
        Name of the feature the tree splits on, used in the node label.
    outfile_name : str
        Path of the DOT file to create.

    Returns
    -------
    tuple
        ``(left_node, right_node, max_node)``: ids of the children of the last
        split node written, and the highest node id of ``tree``.

    Raises
    ------
    ValueError
        If the tree contains no split, so there are no child ids to return.
    """
    tree_ = tree.tree_

    split_nodes = [node for node in range(len(tree_.feature))
                   if tree_.feature[node] != _tree.TREE_UNDEFINED]
    if not split_nodes:
        raise ValueError('tree fitted on {!r} contains no split'.format(feature_name))

    # Labels in the column order of tree_.value; a tree fitted on a subset that
    # lacks a class has fewer columns, so a fixed H/M/NM position would mislabel.
    class_names = [str(name) for name in getattr(tree, 'classes_', ('H', 'M', 'NM'))]
    class_colors = {'H': '#e5813938', 'M': '#39e581c6', 'NM': '#8139e579'}

    with open(outfile_name, "w+") as f:
        f.write('digraph Tree {\n')
        f.write('node [shape=box, style="filled, rounded", color="black", fontname=helvetica] ;\n')
        f.write('edge [fontname=helvetica] ;\n')

        # Only split nodes are written here; leaves are emitted by later calls.
        for node in split_nodes:
            threshold = round(tree_.threshold[node], 2)
            impurity = round(tree_.impurity[node], 2)
            values = tree_.value[node][0]

            majority = min(int(np.argmax(values)), len(class_names) - 1)
            label = class_names[majority]
            color = class_colors.get(label, '#8139e579')

            left_node = tree_.children_left[node]
            right_node = tree_.children_right[node]

            f.write('{} [label="{} <= {}\\ngini = {}\\nvalue = {}\\nclass = {}", fillcolor="{}"] ;\n'.format(node, feature_name, threshold, impurity, values, label, color))
            f.write('{} -> {} [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n'.format(node, left_node))
            f.write('{} -> {} [labeldistance=2.5, labelangle=-45, headlabel="False"] ;\n'.format(node, right_node))

    max_node = len(tree_.feature) - 1
    return left_node, right_node, max_node

In [329]:
def tree_to_webgraph_second(tree, feature_name, node, max_node, outfile_name, end=False):
    """Append the split node(s) of an intermediate sub-tree to the DOT file.

    The sub-tree's root is written under the id ``node`` (the id its parent
    already points to); its other nodes are renumbered to
    ``max_node + 1 + local_index`` so they cannot clash with ids in use.

    Parameters
    ----------
    tree : fitted decision tree estimator
        Must expose ``tree_``; ``classes_`` is used for the class labels when
        available, otherwise the order H, M, NM is assumed.
    feature_name : str
        Name of the feature the sub-tree splits on, used in the node label.
    node : int
        Graph id under which the sub-tree's root is written.
    max_node : int
        Highest graph id used so far.
    outfile_name : str
        Path of the DOT file to append to.
    end : bool, optional
        When true, the closing brace of the digraph is written.

    Returns
    -------
    tuple
        ``(left_node, right_node, max_node)``: graph ids of the children of
        the last split node written, and the new highest graph id.

    Raises
    ------
    ValueError
        If the sub-tree contains no split, so there are no child ids to return.
    """
    tree_ = tree.tree_
    n_nodes = len(tree_.feature)
    offset = max_node + 1

    node_list = [local_index + offset for local_index in range(n_nodes)]
    node_list[0] = node

    split_indices = [local_index for local_index in range(n_nodes)
                     if tree_.feature[local_index] != _tree.TREE_UNDEFINED]
    if not split_indices:
        raise ValueError('sub-tree fitted on {!r} for node {} contains no split'.format(feature_name, node))

    # Labels in the column order of tree_.value; a sub-tree fitted on data that
    # lacks a class has fewer columns, so a fixed H/M/NM position would mislabel.
    class_names = [str(name) for name in getattr(tree, 'classes_', ('H', 'M', 'NM'))]
    class_colors = {'H': '#e5813938', 'M': '#39e581c6', 'NM': '#8139e579'}

    with open(outfile_name, "a+") as f:
        # Only split nodes are written here; leaves are emitted by later calls.
        for node_index in split_indices:
            node_name = node_list[node_index]
            threshold = round(tree_.threshold[node_index], 2)
            impurity = round(tree_.impurity[node_index], 2)
            values = tree_.value[node_index][0]

            majority = min(int(np.argmax(values)), len(class_names) - 1)
            label = class_names[majority]
            color = class_colors.get(label, '#8139e579')

            left_node = tree_.children_left[node_index] + offset
            right_node = tree_.children_right[node_index] + offset

            f.write('{} [label="{} <= {}\\ngini = {}\\nvalue = {}\\nclass = {}", fillcolor="{}"] ;\n'.format(node_name, feature_name, threshold, impurity, values, label, color))
            f.write('{} -> {} ;\n'.format(node_name, left_node))
            f.write('{} -> {} ;\n'.format(node_name, right_node))

        if end:
            f.write('}')

    max_node = max(node_list)
    return left_node, right_node, max_node

In [330]:
def tree_to_webgraph_bottom(tree, feature_names, node, max_node, outfile_name, end=False):
    """Append a complete bottom sub-tree (splits and leaves) to the DOT file.

    The sub-tree's root is written under the id ``node`` (the id its parent
    already points to); its other nodes are renumbered to
    ``max_node + 1 + local_index`` so they cannot clash with ids in use.

    Parameters
    ----------
    tree : fitted decision tree estimator
        Must expose ``tree_``; ``classes_`` is used for the class labels when
        available, otherwise the order H, M, NM is assumed.
    feature_names : sequence of str
        Names of the columns the tree was fitted on, indexed by the feature
        ids stored in ``tree_.feature``.
    node : int
        Graph id under which the sub-tree's root is written.
    max_node : int
        Highest graph id used so far.
    outfile_name : str
        Path of the DOT file to append to.
    end : bool, optional
        When true, the closing brace of the digraph is written.

    Returns
    -------
    int
        The new highest graph id.
    """
    tree_ = tree.tree_
    n_nodes = len(tree_.feature)
    offset = max_node + 1

    node_list = [local_index + offset for local_index in range(n_nodes)]
    node_list[0] = node

    # Labels in the column order of tree_.value; a sub-tree fitted on data that
    # lacks a class has fewer columns, so a fixed H/M/NM position would mislabel.
    class_names = [str(name) for name in getattr(tree, 'classes_', ('H', 'M', 'NM'))]
    class_colors = {'H': '#e5813938', 'M': '#39e581c6', 'NM': '#8139e579'}

    with open(outfile_name, "a+") as f:
        for node_index, node_name in enumerate(node_list):
            impurity = round(tree_.impurity[node_index], 2)
            values = tree_.value[node_index][0]

            majority = min(int(np.argmax(values)), len(class_names) - 1)
            label = class_names[majority]
            color = class_colors.get(label, '#8139e579')

            feature_id = tree_.feature[node_index]
            if feature_id != _tree.TREE_UNDEFINED:
                # Split node: label with the test, then link to both children.
                name = feature_names[feature_id]
                threshold = round(tree_.threshold[node_index], 2)
                f.write('{} [label="{} <= {}\\ngini = {}\\nvalue = {}\\nclass = {}", fillcolor="{}"] ;\n'.format(node_name, name, threshold, impurity, values, label, color))
                f.write('{} -> {} ;\n'.format(node_name, tree_.children_left[node_index] + offset))
                f.write('{} -> {} ;\n'.format(node_name, tree_.children_right[node_index] + offset))
            else:
                # Leaf node: no test and no outgoing edges.
                f.write('{} [label="gini = {}\\nvalue = {}\\nclass = {}", fillcolor="{}"] ;\n'.format(node_name, impurity, values, label, color))

        if end:
            f.write('}')

    # A single-node tree only reuses the parent's id; the counter is still
    # advanced by one, as before.
    if n_nodes == 1:
        max_node = max_node + 1
    else:
        max_node = max(node_list)
    return max_node

In [394]:
def specify_dtc(X, Y, split_hierarchy, max_leaf_nodes, outfile_name, num_splits):
    """Grow a decision tree whose top levels split on chosen features and export it as DOT.

    The first ``num_splits`` levels are forced: every branch is split on the
    feature listed for it in ``split_hierarchy`` (through ``specify_split``).
    Below the forced levels an ordinary decision tree limited to
    ``max_leaf_nodes`` leaves is fitted on each remaining branch. All pieces
    are written to ``outfile_name`` as one Graphviz digraph.

    Branches are processed level by level, left to right, which is the order
    the node ids are handed out in.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors.
    Y : pandas.Series
        Target. It must be named 'IDAP', because ``create_x_y`` looks that
        column up after ``specify_split`` has joined ``Y`` to ``X``.
    split_hierarchy : pandas.DataFrame
        Indexed by branch name; the first column holds the feature to split
        on. Branch names are 'first' for the root, 'left' / 'right' for its
        children, and the parent's name plus '_left' / '_right' below that
        (e.g. 'left_right_left').
    max_leaf_nodes : int
        Leaf limit for each bottom sub-tree.
    outfile_name : str
        Path of the DOT file to write.
    num_splits : int
        Number of forced levels, at least 1. Values 2, 3 and 4 issue the same
        calls in the same order as the earlier hand-unrolled implementation,
        which silently did nothing for any other value.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``num_splits`` is not a positive whole number.
    """
    depth = int(num_splits)
    if depth != num_splits or depth < 1:
        raise ValueError('num_splits must be a positive integer, got {!r}'.format(num_splits))

    ######################
    #### First split #####
    ######################
    root_feature = split_hierarchy.loc['first'].iloc[0]
    left, right, dt_first = specify_split(X, Y, root_feature)
    X_left, Y_left, X_right, Y_right = create_x_y(left, right)
    left_node, right_node, max_node = tree_to_webgraph_top(dt_first, root_feature, outfile_name)

    # Branches still to be processed: (branch name, predictors, target, graph id).
    frontier = [('left', X_left, Y_left, left_node),
                ('right', X_right, Y_right, right_node)]

    ##################################
    #### Remaining forced splits #####
    ##################################
    for _ in range(depth - 1):
        next_frontier = []
        for branch, X_branch, Y_branch, branch_node in frontier:
            feature = split_hierarchy.loc[branch].iloc[0]
            lower, upper, dt_branch = specify_split(X_branch, Y_branch, feature)
            X_lower, Y_lower, X_upper, Y_upper = create_x_y(lower, upper)
            lower_node, upper_node, max_node = tree_to_webgraph_second(
                dt_branch, feature, branch_node, max_node, outfile_name)
            next_frontier.append((branch + '_left', X_lower, Y_lower, lower_node))
            next_frontier.append((branch + '_right', X_upper, Y_upper, upper_node))
        frontier = next_frontier

    ##########################################
    #### Train subsequent bottom of trees ####
    ##########################################
    # One estimator is refitted for every branch; each fitted tree is exported
    # before the next fit replaces it.
    dtc = DTC(random_state=123, max_leaf_nodes=max_leaf_nodes)
    last_position = len(frontier) - 1
    for position, (branch, X_branch, Y_branch, branch_node) in enumerate(frontier):
        dt_bottom = dtc.fit(X_branch, Y_branch)
        # The final sub-tree also writes the closing brace of the digraph.
        max_node = tree_to_webgraph_bottom(dt_bottom, X.columns, branch_node, max_node,
                                           outfile_name, end=(position == last_position))

In [515]:
# Select only specific person types
selected_pertypes = ['PDA_Stud', 'DA_Stud']
features_pertype = features[features['PERTYPE'].isin(selected_pertypes)]

In [557]:
# Split into X and Y
predictor_columns = ['AGE', 'JOBS', 'DIST', 'ACCESSIBILITY', 'CAR_SUFF', 'HINCCAT1']

# One-hot encode the categorical predictors.
X = pd.get_dummies(features_pertype[predictor_columns])
Y = features_pertype['IDAP']

# Fit an unconstrained tree (at most 6 leaves) and export it as Graphviz DOT.
dtc = DTC(random_state=234, max_leaf_nodes=6)
dt_model = dtc.fit(X, Y)
with open("dt_test_6_7.txt", "w") as f:
    # Class names come from the fitted model so each label matches its column
    # even when a class is absent from the selected person types.
    tree.export_graphviz(dt_model, out_file=f, feature_names=list(X.columns),
                         class_names=[str(name) for name in dt_model.classes_],
                         rounded=True,
                         filled=True)

In [493]:
# Distinct person types present in the merged feature table
features['PERTYPE'].unique()

array(['Retiree', 'FT_Worker', 'PT_Worker', 'DA_Stud', 'PreSch_Child',
       'PDA_Stud', 'Non_Worker', 'Univ_Stud'], dtype=object)

In [317]:
# Show the column labels of the merged feature table
features.columns

Index(['SAMPN', 'PERNO', 'GENDER', 'PERTYPE', 'AGE', 'IDAP', 'CAR_SUFF',
       'HINCCAT1', 'EDUCA', 'JOBS', 'JOBTYPE', 'ACCESSIBILITY', 'DIST',
       'Retiree_NM', 'Retiree_M', 'Retiree_H', 'FT_Worker_NM', 'FT_Worker_M',
       'FT_Worker_H', 'PT_Worker_NM', 'PT_Worker_M', 'PT_Worker_H',
       'DA_Stud_NM', 'DA_Stud_M', 'DA_Stud_H', 'PreSch_Child_NM',
       'PreSch_Child_M', 'PreSch_Child_H', 'PDA_Stud_NM', 'PDA_Stud_M',
       'PDA_Stud_H', 'Non_Worker_NM', 'Non_Worker_M', 'Non_Worker_H',
       'Univ_Stud_NM', 'Univ_Stud_M', 'Univ_Stud_H'],
      dtype='object')

In [537]:
# Select only specific person types
pertypes_to_model = ['PDA_Stud', 'DA_Stud']
features_pertype = features[features['PERTYPE'].isin(pertypes_to_model)]

In [538]:
# Split into X and Y
model_inputs = ['AGE', 'JOBS', 'DIST', 'ACCESSIBILITY', 'CAR_SUFF', 'HINCCAT1']

# Target first, then the one-hot encoded predictors.
Y = features_pertype['IDAP']
X = pd.get_dummies(features_pertype[model_inputs])

In [549]:
# Read split hierarchy
# Leaf limit handed to specify_dtc for the bottom sub-trees.
max_leaf_nodes = 2
# First CSV column becomes the index (the branch names looked up by specify_dtc).
split_hierarchy = pd.read_csv('split_hierarchy.csv', index_col=0)

In [550]:
# Run dtc with specified constraints
specify_dtc(
    X,
    Y,
    split_hierarchy,
    max_leaf_nodes,
    outfile_name='dt_pertype_6_7.txt',
    num_splits=3,
)