# Tree-based classifier

Include all available features and train a tree-based classifier.

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
from sklearn import utils
from sklearn.tree import DecisionTreeClassifier

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import export_graphviz
import graphviz


## Load Data

In [5]:
# Trip records (note: this is a machine-specific absolute path).
df = pd.read_csv('/Users/leima/data_store/rideindego_all.csv')
df['date'] = pd.to_datetime(df['date'])

# Hourly weather observations, restricted to the columns used in this notebook.
df_weather = pd.read_csv('data/philadelphia_weather.csv')[
    ['date', 'hour', 'avg_temp', 'avg_humid', 'avg_wind_speed', 'avg_pressure']
]
df_weather['date'] = pd.to_datetime(df_weather['date'])

# Left join: every trip is kept and gets the weather of its date and hour.
df = df.merge(df_weather, how='left', on=['date', 'hour'])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Repeats the datetime conversion already applied when the data was loaded.
df['date'] = pd.to_datetime(df['date'])

In [6]:
# Preview the first rows of the merged trips + weather frame.
df.head()

Unnamed: 0,bike_id,bike_type,duration,end_lat,end_lon,end_station_id,end_time,passholder_type,plan_duration,start_lat,...,trip_id,trip_route_category,date,hour,weekday,month,avg_temp,avg_humid,avg_wind_speed,avg_pressure
0,3640,standard,4.0,39.9384,-75.173271,3064,7/1/2016 0:11,Indego30,30,39.946331,...,25183529,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6
1,3349,standard,8.0,39.94138,-75.145638,3026,7/1/2016 0:18,Indego30,30,39.95295,...,25183528,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6
2,5279,standard,17.0,39.933151,-75.162483,3034,7/1/2016 0:28,Indego30,30,39.93082,...,25183527,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6
3,5198,standard,25.0,39.94138,-75.145638,3026,7/1/2016 0:38,Walk-up,0,39.94138,...,25183526,Round Trip,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6
4,3382,standard,45.0,39.947109,-75.166183,3010,7/1/2016 0:58,Walk-up,0,39.954239,...,25183525,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6


In [7]:
# Keep an untouched copy of the duration before it is binned in the next cell.
df['duration_original'] = df.duration

In [8]:
# Bin the duration into multiples of 5, labelled by the lower edge
# (4.0 -> 0, 8.0 -> 5, 17.0 -> 15). Vectorised instead of a per-row lambda:
# the integer cast truncates toward zero, the same rounding int() applies.
df['duration'] = (df.duration / 5).astype(int) * 5

In [9]:
# Check the binned `duration` against the preserved `duration_original`.
df.head()

Unnamed: 0,bike_id,bike_type,duration,end_lat,end_lon,end_station_id,end_time,passholder_type,plan_duration,start_lat,...,trip_route_category,date,hour,weekday,month,avg_temp,avg_humid,avg_wind_speed,avg_pressure,duration_original
0,3640,standard,0,39.9384,-75.173271,3064,7/1/2016 0:11,Indego30,30,39.946331,...,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6,4.0
1,3349,standard,5,39.94138,-75.145638,3026,7/1/2016 0:18,Indego30,30,39.95295,...,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6,8.0
2,5279,standard,15,39.933151,-75.162483,3034,7/1/2016 0:28,Indego30,30,39.93082,...,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6,17.0
3,5198,standard,25,39.94138,-75.145638,3026,7/1/2016 0:38,Walk-up,0,39.94138,...,Round Trip,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6,25.0
4,3382,standard,45,39.947109,-75.166183,3010,7/1/2016 0:58,Walk-up,0,39.954239,...,One Way,2016-07-01,0,4,7,78.1,58.0,9.2,1016.6,45.0


In [10]:
# Feature columns ("columns of interest") used as classifier inputs.
coi = [
    'plan_duration',
    'trip_route_category', 'hour', 'weekday', 'month', 'avg_temp', 'avg_wind_speed', 'avg_pressure'
]
# avg_humid is left out because it is correlated with the hour of the day.

# Target column(s): the binned trip duration.
cot = ['duration']

# Keep only standard bikes and the modelling columns, then drop incomplete rows.
# Written as one non-mutating chain so no in-place operation is applied to a
# frame that was itself derived by selection.
df = df.loc[df.bike_type == 'standard', coi + cot].dropna()

In [11]:
# Preview the frame after restricting it to the feature and target columns.
df.head()

Unnamed: 0,plan_duration,trip_route_category,hour,weekday,month,avg_temp,avg_wind_speed,avg_pressure,duration
0,30,One Way,0,4,7,78.1,9.2,1016.6,0
1,30,One Way,0,4,7,78.1,9.2,1016.6,5
2,30,One Way,0,4,7,78.1,9.2,1016.6,15
3,0,Round Trip,0,4,7,78.1,9.2,1016.6,25
4,0,One Way,0,4,7,78.1,9.2,1016.6,45


In [12]:
# Report, column by column, whether any missing value is still present.
for column_name, has_missing in df.isnull().any().items():
    print(column_name, ': ', has_missing)

plan_duration :  False
trip_route_category :  False
hour :  False
weekday :  False
month :  False
avg_temp :  False
avg_wind_speed :  False
avg_pressure :  False
duration :  False


In [13]:
# NOTE(review): no `fit` call on this scaler is visible in the notebook and it
# is not one of the pipeline steps; confirm whether it is still needed.
scaler = StandardScaler()

In [15]:
# Feature matrix and target frame (the target stays a one-column DataFrame).
X, y = df[coi], df[cot]

# 70/30 hold-out split with a fixed seed; stratified splitting is not enabled.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [16]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, keeping one encoder per column.

    Encoders are created lazily: ``transform`` reuses the encoder stored in
    ``self.encoders`` for a column when there is one, and otherwise fits a new
    ``LabelEncoder`` on the data it is given and stores it. ``fit`` itself does
    not learn anything.

    Parameters
    ----------
    encoders : dict, optional
        Mapping of column name -> already fitted encoder (any object with a
        ``transform`` method). A non-empty mapping is used as-is and is
        extended in place when new encoders are fitted; otherwise a new empty
        dict is created.
    columns : list of str, optional
        Names of the columns to encode. ``None`` means every column of the
        frame passed to ``transform``.
    """

    def __init__(self, encoders = None, columns = None):
        self.columns = columns  # column names to encode, or None for all columns
        self.encoders = encoders if encoders else {}
        # Log of encoders fitted since the last fit() call, as {column: encoder}
        # dicts. Created here so transform() also works without a prior fit().
        self.check_encoders = []

    def fit(self, X, y=None):
        """Reset the log of newly fitted encoders and return ``self``.

        No encoder is fitted here; fitting happens on demand in ``transform``.
        """
        self.check_encoders = []
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Columns listed in ``self.columns`` are encoded; when it is ``None``
        every column of ``X`` is encoded. A column with a stored encoder is
        transformed with it. A column without one gets a new ``LabelEncoder``
        fitted on this data, which is stored under the column name and
        appended to ``self.check_encoders``.
        """
        output = X.copy()

        # The numeric prefix in the log line tells which selection mode was used.
        if self.columns is not None:
            targets, tag = self.columns, '1'
        else:
            targets, tag = list(output.columns), '2'

        for col in targets:
            # Encoders are always looked up and stored by column *name*.
            encoder = self.encoders.get(col)
            if encoder is not None:
                output[col] = encoder.transform(output[col])
            else:
                encoder = LabelEncoder()
                output[col] = encoder.fit_transform(output[col])
                print(f'{tag}. preparing encoder for {col}')
                self.encoders[col] = encoder
                self.check_encoders.append({col: encoder})
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` and then ``transform`` on the same data."""
        return self.fit(X, y).transform(X)

In [17]:
# Fit one encoder per feature column plus the target on the whole frame, so
# that every value occurring in either split is known to its encoder.
encoders_obj = MultiColumnLabelEncoder(columns=[*coi, 'duration'])
encoders_obj.fit_transform(df)  # only the fitted encoders are kept, not the encoded frame
dt_encoders = encoders_obj.encoders

1. preparing encoder for plan_duration
1. preparing encoder for trip_route_category
1. preparing encoder for hour
1. preparing encoder for weekday
1. preparing encoder for month
1. preparing encoder for avg_temp
1. preparing encoder for avg_wind_speed
1. preparing encoder for avg_pressure
1. preparing encoder for duration


In [18]:
# Display the list of feature columns.
coi

['plan_duration',
 'trip_route_category',
 'hour',
 'weekday',
 'month',
 'avg_temp',
 'avg_wind_speed',
 'avg_pressure']

In [19]:
# 3-fold cross-validated search over tree depths 1 through 8.
dt_classifier = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid={"max_depth": list(range(1, 9))},
    cv=3,
)

# Pipeline stages: encode the feature columns with the encoders prepared
# earlier, then run the grid-searched decision tree.
pipeline_steps = [
    ('encoding', MultiColumnLabelEncoder(columns=coi, encoders=dt_encoders)),
    ('dt_classifier', dt_classifier),
    # add more pipeline steps as needed
]

In [20]:
# Assemble the encoding + grid-search steps into a single estimator.
model = Pipeline(pipeline_steps)

In [31]:
# Fit the whole pipeline (encoding, then depth grid search) on the training split.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('encoding',
                 <__main__.MultiColumnLabelEncoder object at 0x1a27621550>),
                ('dt_classifier',
                 GridSearchCV(cv=3, error_score='raise-deprecating',
                              estimator=DecisionTreeClassifier(class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features=None,
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                                               min_samples_leaf=1,
                                                               min_samples_split=2,
                

In [32]:
# Score the fitted pipeline on the first 20000 rows of the test split only.
model.score(X_test[:20000], y_test[:20000])

0.3242

In [33]:
# Score on the training split, for comparison with the test score.
model.score(X_train, y_train)

0.32158081103011454

In [34]:
# Encode each training feature with its stored encoder, collecting the encoded
# arrays first and building the frame in one step.
encoded_columns = {}
for feature in X_train.columns:
    encoded_columns[feature] = dt_encoders.get(feature).transform(X_train[feature].values)
    print(feature)
X_train_transformed = pd.DataFrame(encoded_columns)

plan_duration
trip_route_category
hour
weekday
month
avg_temp
avg_wind_speed
avg_pressure


In [35]:
# Look up the pipeline's steps by name.
model.named_steps

duration


In [36]:
# Per-class precision / recall / F1 of the pipeline on the full test split.
print(classification_report(y_test, model.predict(X_test)))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.30      0.26      0.27     62092
           5       0.35      0.94      0.51    232066
          10       0.21      0.06      0.09    191682
          15       0.16      0.07      0.09    104679
          20       0.10      0.00      0.00     57605
          25       0.12      0.02      0.03     33666
          30       0.10      0.00      0.01     20255
          35       0.00      0.00      0.00     13516
          40       0.00      0.00      0.00      9882
          45       0.00      0.00      0.00      8183
          50       0.00      0.00      0.00      7466
          55       0.00      0.00      0.00      6034
          60       0.00      0.00      0.00      4112
          65       0.00      0.00      0.00      3023
          70       0.00      0.00      0.00      2477
          75       0.00      0.00      0.00      2233
          80       0.00      0.00      0.00      1953
          85       0.00    

In [37]:
# Hyper-parameters selected by the grid search step.
model.named_steps['dt_classifier'].best_params_

{'max_depth': 7}

In [43]:
# Standalone tree at the depth reported by the grid search (max_depth=7), kept
# outside the pipeline so it can be inspected and exported directly.
dt_classifier_n = DecisionTreeClassifier(random_state=42, max_depth=7)

# Encode the training target with the stored 'duration' encoder;
# `y_train_transformed` is not defined anywhere else in the notebook.
y_train_transformed = dt_encoders['duration'].transform(y_train.values.ravel())

# Fit on the first 100000 training rows only.
dt_classifier_n.fit(X_train_transformed[:100000], y_train_transformed[:100000])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [44]:
# Feature importances of the standalone tree. Uses `dt_classifier_n`, the tree
# fitted in the previous cell; no `dt_classifier_6` is defined in this notebook.
for name, importance in zip(X_train.columns, dt_classifier_n.feature_importances_):
    print(name, f'{importance:.2}')

plan_duration 0.16
trip_route_category 0.12
hour 0.034
weekday 0.13
month 0.1
avg_temp 0.21
avg_wind_speed 0.14
avg_pressure 0.1


In [140]:
# Write the standalone tree to a DOT file. Uses `dt_classifier_n` (the fitted
# tree; no `dt_classifier_6` is defined in this notebook) and passes the
# feature names as a plain list of strings.
export_graphviz(
    dt_classifier_n,
    feature_names=list(X_train.columns),
    out_file='dt_classifier_6.dot'
)

In [154]:
# Build an in-memory Graphviz graph of the standalone tree: with out_file=None
# export_graphviz returns the DOT source as a string. Uses `dt_classifier_n`
# (the fitted tree; no `dt_classifier_6` is defined in this notebook).
graph = graphviz.Source(
    export_graphviz(
        dt_classifier_n,
        feature_names=list(X_train.columns),
        out_file=None,
    )
)

In [155]:
# Request PNG output for the next render call.
graph.format = 'png'

In [156]:
# Render the tree to an image; the recorded output names the file
# 'dt_classifier_6.dot.png'.
graph.render('dt_classifier_6.dot')

'dt_classifier_6.dot.png'

### Tests

In [None]:
# Two test rows for a quick prediction smoke test; seeded so that a re-run
# picks the same rows (same seed as the split and the classifiers).
df_test = X_test.sample(n=2, random_state=42)

In [None]:
# Column names and dtypes of the sampled rows.
df_test.info()

In [None]:
# Predict the duration class for the sampled rows.
model.predict(df_test)

In [None]:
# String form of the prediction array.
f'{model.predict(df_test)}'

In [None]:
# NOTE(review): no `fit` call on `scaler` is visible in this notebook, and the
# pipeline is trained on the unscaled duration classes, so this call looks
# like a leftover. Confirm whether it should be removed.
scaler.inverse_transform(model.predict(df_test))

In [None]:
# Single test row used as a JSON example below; seeded so that a re-run
# picks the same row.
df_test = X_test.sample(n=1, random_state=42)

In [None]:
# Serialise the sampled row as a JSON list of records.
df_test.to_json(orient='records')

In [None]:
# `json` is not imported in any earlier cell, so import it where it is used.
import json

# Hand-written example payload.
# NOTE(review): its keys (e.g. passholder_type, no weather fields) differ from
# the feature columns the model was trained on; confirm before predicting on it.
record = json.loads('{"passholder_type":"Indego30","trip_route_category":"One Way","hour":15,"weekday":3,"month":6}')

In [None]:
# Rebuild a one-row DataFrame from the parsed JSON record.
df_test_recon = pd.DataFrame.from_records([record])

In [None]:
# Display the reconstructed frame.
df_test_recon