# Decision tree classifier

In this notebook, we will train a Decision tree classifier on a subset of the data. We will use grid-search to optimize the parameters.

In [2]:
import sys
sys.path.append('../scripts')

import numpy as np
import pandas as pd
import helpers_models as hm
from transforms import *
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

First, load a sample of the data.

In [3]:
data = pd.read_hdf('../data/pivot_numbers_only.h5', 'data').sample(2000000)

In [4]:
first_week, last_week = data['tweets'].columns.min(), data['tweets'].columns.max()

In [5]:
data = hm.make_target(data, last_week)

In [6]:
data.shape

(2000000, 53)

In [7]:
first_week

23

In [8]:
last_week

36

Split the data into train and test.

In [9]:
train, test = train_test_split(data)

Now make a pipeline for transforming and predicting.

In [10]:
pipeline = Pipeline([
    ('limiter', WeeksLimiter(first_week=25, target_week=last_week)),
    ('normal', Normalizer()),
    ('decay', TimeDecayApplier(target_week=last_week)),
    ('tree', DecisionTreeClassifier())
])

Now let's check the params of the pipeline for grid-search to optimize.

In [11]:
pipeline.get_params()

{'decay': TimeDecayApplier(skip=False, target_week=36),
 'decay__skip': False,
 'decay__target_week': 36,
 'limiter': WeeksLimiter(first_week=25, target_week=36),
 'limiter__first_week': 25,
 'limiter__target_week': 36,
 'normal': Normalizer(skip=False),
 'normal__skip': False,
 'steps': [('limiter', WeeksLimiter(first_week=25, target_week=36)),
  ('normal', Normalizer(skip=False)),
  ('decay', TimeDecayApplier(skip=False, target_week=36)),
  ('tree',
   DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
               max_features=None, max_leaf_nodes=None,
               min_impurity_split=1e-07, min_samples_leaf=1,
               min_samples_split=2, min_weight_fraction_leaf=0.0,
               presort=False, random_state=None, splitter='best'))],
 'tree': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_

In [12]:
params = {
    'limiter__first_week': [28, 30, 32],
    'normal__skip': [False, True],
    'decay__skip': [False, True],
    'tree__max_depth': [3, 6]
}

In [13]:
model = GridSearchCV(pipeline, params, n_jobs=-1, pre_dispatch='2*n_jobs', verbose=2, error_score=0)

The next step is to train the model and optimize the params.

In [14]:
%%time
model.fit(train.drop('target', axis=1, level=0), train['target'])

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=3 
[CV] decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=3 
[CV] decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=3 
[CV] decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=6 
[CV] decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=6 
[CV] decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=6 
[CV]  decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=3, total=  11.3s
[CV]  decay__skip=False, limiter__first_week=28, normal__skip=False, tree__max_depth=3, total=  12.6s
[CV] decay__skip=False, limiter__first_week=28, normal__skip=True, tree__max_depth=3 
[CV] decay__skip=False, limiter__first_week=28, normal__skip=True, tree__max_depth=3 
[CV]  decay__skip=False, limiter__first_w

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.7min


[CV]  decay__skip=False, limiter__first_week=30, normal__skip=True, tree__max_depth=6, total=  52.2s
[CV] decay__skip=False, limiter__first_week=32, normal__skip=True, tree__max_depth=3 
[CV]  decay__skip=False, limiter__first_week=32, normal__skip=False, tree__max_depth=3, total=  27.5s
[CV] decay__skip=False, limiter__first_week=32, normal__skip=True, tree__max_depth=3 
[CV]  decay__skip=False, limiter__first_week=32, normal__skip=False, tree__max_depth=6, total=  21.1s
[CV] decay__skip=False, limiter__first_week=32, normal__skip=True, tree__max_depth=3 
[CV]  decay__skip=False, limiter__first_week=32, normal__skip=False, tree__max_depth=6, total=  18.5s
[CV] decay__skip=False, limiter__first_week=32, normal__skip=True, tree__max_depth=6 
[CV]  decay__skip=False, limiter__first_week=32, normal__skip=False, tree__max_depth=6, total=  17.1s
[CV] decay__skip=False, limiter__first_week=32, normal__skip=True, tree__max_depth=6 
[CV] decay__skip=False, limiter__first_week=32, normal__skip=

[CV]  decay__skip=True, limiter__first_week=32, normal__skip=True, tree__max_depth=6, total=  11.5s


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  7.3min finished


CPU times: user 4min 2s, sys: 34.5 s, total: 4min 37s
Wall time: 7min 37s


GridSearchCV(cv=None, error_score=0,
       estimator=Pipeline(steps=[('limiter', WeeksLimiter(first_week=25, target_week=36)), ('normal', Normalizer(skip=False)), ('decay', TimeDecayApplier(skip=False, target_week=36)), ('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'limiter__first_week': [28, 30, 32], 'normal__skip': [False, True], 'decay__skip': [False, True], 'tree__max_depth': [3, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

Finally, use the model to predict test values and scores. We will use the test data for predictions.

In [15]:
%%time
predicted = model.predict(test.drop('target', axis=1, level=0))
report = classification_report(test['target'], predicted)

CPU times: user 2.71 s, sys: 1.83 s, total: 4.54 s
Wall time: 4.51 s


In [16]:
print(report)

             precision    recall  f1-score   support

      False       0.92      0.98      0.95    441776
       True       0.72      0.32      0.44     58224

avg / total       0.89      0.91      0.89    500000



Also check the best parameters grid-search found.

In [17]:
model.best_params_

{'decay__skip': False,
 'limiter__first_week': 30,
 'normal__skip': True,
 'tree__max_depth': 6}