In [1]:
# Install graphviz binaries
# Refresh the apt package index only when `dot` is not on PATH and apt-cache
# has no record of the graphviz package.
!which dot >/dev/null || apt-cache show graphviz >/dev/null || apt-get update
# Install the graphviz system package only when `dot` is not already on PATH.
!which dot >/dev/null || apt-get -y install graphviz

# Install graphviz python wrapper
# NOTE(review): `!pip` runs whichever pip the shell resolves first, which may
# not belong to the kernel's Python environment -- confirm they match.
!pip install graphviz

You are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
%run Training_Helpers.ipynb

import sklearn.tree
import pandas as pd
import copy
import graphviz
import IPython.display

In [3]:
# See https://github.com/scikit-learn/scikit-learn/issues/10810

def prune_decision_tree_inplace(tree):
    """Collapse redundant splits of a fitted tree, modifying it in place.

    An internal node is turned into a leaf when both of its children are
    leaves and both children predict the same class for every output.  The
    collapse is done by overwriting the node's entries in
    ``tree.tree_.children_left`` / ``children_right`` with -1; the former
    children are left in the arrays and ``node_count`` is not changed.

    Nodes are visited from the highest index down to 0, so a node collapsed
    in this pass can in turn allow a lower-indexed node to collapse.
    # NOTE(review): this cascade assumes children always have larger indices
    # than their parent -- confirm for the tree implementation in use.

    Args:
        tree: object whose ``tree_`` attribute exposes ``node_count``,
            ``children_left``, ``children_right`` and ``value``.

    Returns:
        None.
    """
    NO_CHILD = -1

    inner = tree.tree_
    left = inner.children_left
    right = inner.children_right

    def winning_class(counts):
        # Index of the first maximal entry, i.e. the predicted class.
        counts = list(counts)
        return counts.index(max(counts))

    # One list per node holding the predicted class of each output.
    predicted = [
        [winning_class(per_output) for per_output in node_value]
        for node_value in inner.value
    ]

    # A node whose two child pointers are equal is treated as a leaf.
    is_leaf = [left[n] == right[n] for n in range(inner.node_count)]

    for n in range(inner.node_count - 1, -1, -1):
        if is_leaf[n]:
            continue

        lo, hi = left[n], right[n]
        if not (is_leaf[lo] and is_leaf[hi]):
            continue

        if predicted[lo] == predicted[hi]:
            left[n] = NO_CHILD
            right[n] = NO_CHILD
            is_leaf[n] = True

def prune_decision_tree(tree):
    """Return a pruned deep copy of ``tree``; the argument is left untouched.

    Args:
        tree: model accepted by ``prune_decision_tree_inplace``.

    Returns:
        A ``copy.deepcopy`` of ``tree`` after ``prune_decision_tree_inplace``
        has been applied to the copy.
    """
    pruned = copy.deepcopy(tree)
    prune_decision_tree_inplace(pruned)
    return pruned

In [4]:
def show_tree(model, fullSize = False):
    """Render a fitted decision tree with graphviz and display it.

    Args:
        model: fitted tree model accepted by ``sklearn.tree.export_graphviz``.
            Its optional ``feature_names`` and ``class_names`` attributes are
            used to label the graph; when an attribute is missing, ``None``
            is passed so export_graphviz falls back to its default labels.
        fullSize: when true, display the ``graphviz.Source`` object itself;
            otherwise render it to PNG and display that image.

    Returns:
        None.
    """
    feature_names = getattr(model, 'feature_names', None)
    class_names = getattr(model, 'class_names', None)

    # An empty class_names sequence means "no names supplied".  Passing it
    # through would make export_graphviz index into an empty sequence for
    # classification trees, so treat it like None.  Booleans are forwarded
    # unchanged (they have no len()).
    if (
        class_names is not None
        and not isinstance(class_names, bool)
        and len(class_names) == 0
    ):
        class_names = None

    dot = sklearn.tree.export_graphviz(
        model,
        out_file           = None,
        feature_names      = feature_names,
        class_names        = class_names,
        filled             = True,
        rounded            = True,
        special_characters = True,
    )
    s = graphviz.Source(dot)

    # Use the explicit IPython.display.display rather than relying on the
    # bare `display` name being injected into builtins by the frontend.
    if fullSize:
        IPython.display.display(s)
    else:
        s.format = 'png'
        IPython.display.display(IPython.display.Image(data = s.pipe()))

In [5]:
def train_decision_tree(
    make_model_fn,
    train_data,
    test_data,
    input_fn,
    output_fn,

    prune = False,
    class_names = [],
):
    """Fit one decision tree, optionally prune it, and report its scores.

    Args:
        make_model_fn: callable invoked as ``make_model_fn(train_in, train_out)``
            that returns an unfitted model exposing ``fit`` and ``score``.
        train_data: data set handed to ``input_fn`` / ``output_fn`` for training.
        test_data: data set handed to ``input_fn`` / ``output_fn`` for testing.
        input_fn: maps a data set to the model inputs.  The result for
            ``train_data`` must expose ``.columns.values`` (e.g. a pandas
            DataFrame); those values become ``model.feature_names``.
        output_fn: maps a data set to the target values.
        prune: when true, ``prune_decision_tree_inplace`` is applied to the
            fitted model before it is scored.
        class_names: class labels stored on the model as ``model.class_names``.

    Returns:
        ``TrainingResult(model, train_accuracy, test_accuracy)`` where the two
        accuracies are the values returned by ``model.score`` (computed after
        pruning, when enabled).
        # NOTE(review): TrainingResult is not defined in this notebook;
        # presumably it comes from Training_Helpers.ipynb -- confirm.
    """
    train_in = input_fn(train_data)
    train_out = output_fn(train_data)
    test_in = input_fn(test_data)
    test_out = output_fn(test_data)

    model = make_model_fn(train_in, train_out)
    model.fit(train_in, train_out)
    if prune:
        prune_decision_tree_inplace(model)

    train_accuracy = model.score(train_in, train_out)
    test_accuracy = model.score(test_in, test_out)

    print('  train: accuracy = {:.4}    test: accuracy = {:.4}'.format(
        train_accuracy,
        test_accuracy,
    ))

    model.feature_names = list(train_in.columns.values)
    # Store a copy: the default `[]` is a single object shared by every call,
    # so assigning it directly would make all models trained with the default
    # (and the default itself) alias the same list.
    model.class_names = copy.copy(class_names)
    return TrainingResult(
        model,
        train_accuracy,
        test_accuracy,
    )

In [6]:
def train_decision_trees(
    make_model_fn,
    train_data,
    test_data,
    input_fns,
    output_fn,
    **trainArgs
):
    """Train decision trees by delegating to ``train_models``.

    ``train_decision_tree`` is passed as the first positional argument,
    followed by the five positional parameters of this function in order;
    any extra keyword arguments are forwarded unchanged.

    # NOTE(review): train_models is defined outside this notebook; judging by
    # the `input_fns` name it presumably trains one model per input function
    # -- confirm against its definition.

    Returns:
        Whatever ``train_models`` returns.
    """
    forwarded = (make_model_fn, train_data, test_data, input_fns, output_fn)
    return train_models(train_decision_tree, *forwarded, **trainArgs)