In [79]:
# Import Necessary Packages
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import roc_auc_score as rs
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
% matplotlib inline

In [80]:
# Load Dataset
df = pd.read_csv('../Data/trip_survey.csv', index_col=0)
df.head()

Unnamed: 0,SAMPN,PERNO,PERTYPE,ORIG_HOME,DEST_HOME,DOW_x,OTAZ,DTAZ,OTPURP_AGG,DTPURP_AGG,...,O_WMODE,TTTWS,CDRIV,WSTRT,WEND,STRVR,ENDVR,STUDE,work_flex_start,work_flex_end
0,3000056,1,2,1,0,3.0,2626,2627,0,9,...,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No,0,1
1,3000056,1,2,1,0,3.0,2626,2627,0,1,...,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No,0,1
2,3000056,1,2,1,0,3.0,2626,2628,0,9,...,,8,No,100,300,Start Time Cannot Vary,16 to 30 Minutes,No,0,1
3,3000056,2,1,1,0,3.0,2626,2676,0,1,...,,90,No,730,1900,Within 15 Minutes or Less,More than 1 Hour,No,0,1
4,3000056,3,3,1,0,3.0,2626,2627,0,8,...,,14,No,1800,2330,More than 1 Hour,More than 1 Hour,Yes - Full Time,1,1


In [81]:
df.DTRACT = df.DTRACT.astype(int)
df.DTRACT = df.DTRACT.astype(str)
df.DTRACT = df.DTRACT.apply(lambda x: x.zfill(11))

In [82]:
census_tracts = gpd.read_file('../Data/shapefiles/NYMTC_BPM_TAZ2010_CT2010/BPM_CT2010.shp')

In [83]:
census_tracts.crs = {'init' :'epsg:4326'}
census_tracts.crs

{'init': 'epsg:4326'}

In [84]:
df_ct = pd.merge(df, census_tracts, left_on='DTRACT', right_on='GEOID10')

In [85]:
len(df_ct)

103617

In [86]:
df_ct = gpd.GeoDataFrame(df_ct)
df_ct.crs = {'init' :'epsg:4326'}
df_ct.crs

{'init': 'epsg:4326'}

In [87]:
nyc = gpd.read_file('../Data/shapefiles/nynta.shp')
nyc = nyc.to_crs({'init' :'epsg:4326'})

In [88]:
nyc.head()

Unnamed: 0,BoroCode,BoroName,CountyFIPS,NTACode,NTAName,Shape_Area,Shape_Leng,geometry
0,3,Brooklyn,47,BK88,Borough Park,54005020.0,39247.228028,POLYGON ((-73.97604935657381 40.63127590568011...
1,4,Queens,81,QN52,East Flushing,29454380.0,25843.364934,POLYGON ((-73.79493246233986 40.75779803010788...
2,4,Queens,81,QN48,Auburndale,34164220.0,32446.878764,POLYGON ((-73.77573836927087 40.74332564719731...
3,4,Queens,81,QN51,Murray Hill,52488280.0,33266.904732,POLYGON ((-73.80379022888238 40.77561011182583...
4,4,Queens,81,QN27,East Elmhurst,19726850.0,19816.712116,POLYGON ((-73.86109724335753 40.76366447712022...


In [89]:
df_nyc = gpd.sjoin(nyc, df_ct, op='intersects')
len(df_nyc)

54182

In [90]:
df_nyc.head(3)

Unnamed: 0,BoroCode,BoroName,CountyFIPS,NTACode,NTAName,Shape_Area,Shape_Leng,geometry,index_right,SAMPN,...,ALAND10,AWATER10,COUNTYFP10,FUNCSTAT10,GEOID10,MTFCC10,NAME10,NAMELSAD10,STATEFP10,TRACTCE10
0,3,Brooklyn,47,BK88,Borough Park,54005020.0,39247.228028,POLYGON ((-73.97604935657381 40.63127590568011...,49275,3588150,...,169085,0,47,S,36047046400,G5020,464,Census Tract 464,36,46400
86,3,Brooklyn,47,BK46,Ocean Parkway South,17782100.0,21975.997112,"POLYGON ((-73.9707469139593 40.62562949549583,...",49275,3588150,...,169085,0,47,S,36047046400,G5020,464,Census Tract 464,36,46400
93,3,Brooklyn,47,BK42,Flatbush,45254810.0,38737.853254,POLYGON ((-73.95870789822966 40.65038727381591...,49275,3588150,...,169085,0,47,S,36047046400,G5020,464,Census Tract 464,36,46400


In [94]:
df_nyc.columns

Index([       u'BoroCode',        u'BoroName',      u'CountyFIPS',
               u'NTACode',         u'NTAName',      u'Shape_Area',
            u'Shape_Leng',        u'geometry',     u'index_right',
                 u'SAMPN',           u'PERNO',         u'PERTYPE',
             u'ORIG_HOME',       u'DEST_HOME',           u'DOW_x',
                  u'OTAZ',            u'DTAZ',      u'OTPURP_AGG',
            u'DTPURP_AGG',      u'LTMODE_AGG',      u'TRP_DEP_HR',
           u'TRP_DEP_MIN',      u'TRP_ARR_HR',     u'TRP_ARR_MIN',
              u'TRPDUR_R',          u'ACTDUR',          u'OTRACT',
                u'DTRACT',          u'PMODE1',         u'PMODE_R',
              u'PMODE_R2',        u'PMODE_R3',       u'WORK_PURP',
              u'DTPURP_R',         u'ODTPURP',        u'ODTPURP1',
              u'ODTPURP2',      u'ODTPURP2_R',            u'PLOC',
              u'WORKTRIP',           u'TOD_R',        u'TOD_PEAK',
           u'TRIPDIST_R1',     u'TRIPDIST_R2',           u'STY

In [4]:
# Select target variable and feature space
X = pd.get_dummies(df_nyc[['ODTPURP2_R', 'GENDER', 'AGE_R', 'INCOM_R', 'HHVEH', 'LIC', 'WORKS', 'TRIPDIST_R1',
                'TRPDUR_R', 'OTRACT', 'TOD_R']])
Y = df_nyc['NTAName']


X.dropna(inplace=True)
Y.dropna(inplace=True)
X_train,X_test,y_train,y_test=train_test_split(X, Y, test_size=0.3, random_state=999)

In [92]:
# Find best C parameter
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': np.linspace(5,40,20)}
dtc = DTC()
C_param = GridSearchCV(dtc, param_grid, cv=5)
C_param.fit(X_train, y_train)
C_param.best_params_

{'max_depth': 10.526315789473685}

In [93]:
# Cross validate using best C parameter
avg_acc = []
for i in range(10):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, random_state=i) #Use random_state to fix samples
    X_train_dummies = pd.get_dummies(X_train)
    X_test_dummies = pd.get_dummies(X_test)

    clf = DTC(max_depth=C_param.best_params_.values()[0]) 
    clf.fit(X_train_dummies, Y_train)

    avg_acc.append(1.0*(clf.predict(X_test_dummies)==np.asarray(Y_test)).sum()/len(Y_test))

print ("Successfully (OS) predict {}% of the modes".format(np.mean(avg_acc)*100))

Successfully (OS) predict 11.551926626% of the modes


In [8]:
Feature_importance=pd.DataFrame([list(X_train.columns),list(clf.feature_importances_)]).T
Feature_importance.columns=["variables","importance"]
Feature_importance.sort_values(by="importance",ascending=False).iloc[:15,:]

Unnamed: 0,variables,importance
5,TOD_R,0.139709
0,ODTPURP2_R,0.128062
1,INCOM_R,0.12105
3,TRPDUR_R,0.107884
4,OTRACT,0.083293
2,TRIPDIST_R1,0.0691071
6,GENDER_Female,0.0479876
7,GENDER_Male,0.0477922
9,AGE_R_55+,0.0460994
8,AGE_R_16-55 years,0.034354
