# Python Machine Learning In Biology:
# Decision Trees

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

### Build a decision tree classifier for evaluating balance tip’s direction.

#### Import the balance data 

In [2]:
balance = pd.read_csv('data/balance.csv')

In [3]:
balance.head()

Unnamed: 0,class,left_weight,left_distance,right_weight,right_distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [4]:
balance.shape

(625, 5)

#### Create variables for X and y

In [5]:
X = balance.iloc[:, 1:5]
X.head()

Unnamed: 0,left_weight,left_distance,right_weight,right_distance
0,1,1,1,1
1,1,1,1,2
2,1,1,1,3
3,1,1,1,4
4,1,1,1,5


In [7]:
y = balance['class']

#### Split the dataset into training and testing datasets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Create a decision tree using gini criterion, max_depth of 3, and min_samples_leaf of 5.

In [9]:
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth = 3, 
                                  min_samples_leaf=5)

In [10]:
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

#### Prediction

In [11]:
y_pred = clf_gini.predict(X_test)

#### Get accuracy score

In [12]:
print(accuracy_score(y_test, y_pred))

0.7133757961783439


#### Create a decision tree using entropy criterion, max_depth of 3, and min_samples_leaf of 5.

In [17]:
clf_entropy = DecisionTreeClassifier(criterion="entropy")

In [18]:
clf_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

#### Prediction

In [19]:
y_pred_en = clf_entropy.predict(X_test)

#### Get accuracy score

In [20]:
print(accuracy_score(y_test, y_pred_en))

0.7388535031847133


### Visualize decision trees

Graphviz is an open source visualization library that can help us visualize decision trees. Graphviz is used a lot for visualizing networks (graphs). 

#### Convert trained decision trees into `.txt` files

In [21]:
with open("clf_gini.txt", "w") as f:
    f = tree.export_graphviz(clf_gini, out_file=f)

In [22]:
with open("clf_entropy.txt", "w") as f:
    f = tree.export_graphviz(clf_entropy, out_file=f)

To visualize the decision tree, we will open the .txt files and copy the contents of the file into the graphiviz web portal. http://webgraphviz.com

#### Convert the classifier into graphviz object and store the contents into .dot file.

In [23]:
with open("clf_gini.dot", "w") as f:
    f = tree.export_graphviz(clf_gini, out_file=f)

In [24]:
with open("clf_entropy.dot", "w") as f:
    f = tree.export_graphviz(clf_entropy, out_file=f)

#### Convert the dot file into a pdf file
These are bash commands

In [25]:
!dot -Tpdf clf_gini.dot -o clf_gini.pdf

In [26]:
!dot -Tpdf clf_entropy.dot -o clf_entropy.pdf

####  Preview the pdf file
Also a bash command

In [27]:
!open -a preview clf_gini.pdf

In [28]:
!open -a preview clf_entropy.pdf

### Independent Work
Build a decision tree for the diabetes dataset.