In [None]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff 

from sklearn import tree, model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

## Load Dataset and pre-process data

In [None]:
# Print the raw ARFF file so its contents can be inspected before parsing.
# NOTE(review): `!cat` needs a Unix-like shell and the path is relative to the
# notebook's working directory.
!cat datasets-UCI/UCI/diabetes.arff

In [None]:
# loadarff returns a (data, meta) pair; only the data records (index 0) are
# turned into a DataFrame here, the metadata is left unused in `raw_data[1]`.
# NOTE(review): scipy appears to load nominal attributes as bytes, which is
# presumably why the label mapping below goes through str() -- confirm.
raw_data = loadarff('datasets-UCI/UCI/diabetes.arff')
df = pd.DataFrame(raw_data[0])

In [None]:
def map_to_label(class_in):
    """Encode a raw class value as a float label.

    Returns 1.0 when the text form of ``class_in`` contains the substring
    'positive', and 0.0 for any other value.  The check is done on
    ``str(class_in)``, so it works the same way for ``bytes`` and ``str``
    inputs.
    """
    is_positive = 'positive' in str(class_in)
    return 1.0 if is_positive else 0.0

# Add a numeric 'label' column (1.0 / 0.0) derived from 'class'; the original
# 'class' column is kept alongside it.
df['label'] = df['class'].apply(map_to_label)
# Preview the first rows to check the new column.
df.head()

## Data visualization

In [None]:
# Compute the correlation matrix over the numeric columns only.  The frame
# still carries the raw 'class' column, which is not numeric; pandas >= 2.0
# raises on such columns instead of silently dropping them, so they are
# filtered out explicitly.
corr = df.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle (diagonal included) so that each
# pair of columns is drawn only once.  The builtin `bool` replaces the
# `np.bool` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.  The colour scale
# is clipped to [-0.5, 0.5]; the annotations still show the exact values.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-0.5, vmax=0.5, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            annot=True, fmt=".2f")

# Write the figure to disk, then display it
plt.savefig('q3_a_corr.eps', format='eps')

plt.show()

In [None]:
# Plot pairwise relationships between the columns of df, coloured by 'label'.
sns.pairplot(df, hue="label")
# Write the current figure to an EPS file, then display it.
plt.savefig('q3_pair.eps', format='eps')
plt.show()

## Data splitting

In [None]:
# Features: every column except the raw class and its numeric encoding
x = df.drop(['class', 'label'], axis=1)
# Labels
y = df['label']
# Split dataset into training and test set: 30% of the rows are held out,
# and the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.3,
                                                    random_state = 7)
# 10-fold cross validation splitter for the training/val set.
# No random_state is passed: without shuffle=True the folds are contiguous
# slices and a seed has no effect -- scikit-learn >= 0.24 raises a ValueError
# for that combination.  The training rows were already shuffled by
# train_test_split above.
kfold = model_selection.KFold(n_splits=10)

### scikit-learn Decision Tree (CART model)

In [None]:
# Perform grid search for criterion, max_depth,
# min_samples_leaf and min_samples_split.
# Every candidate is scored on the training set with the cross-validation
# splitter `kfold`; the mean of the fold scores is the selection metric and
# the best mean, its standard deviation and its parameters are kept.
top_mean = -999
for crit in ['gini', 'entropy']:
    for md in range(1,12):
        for msl in range(1,12):
            for mss in range(2, 5):
                # Fixed seed: the tree builder permutes the features at random
                # while searching for splits, so without it the scores (and
                # therefore the selected parameters) can change between runs.
                clf = tree.DecisionTreeClassifier(criterion = crit,
                                                  max_depth=md,
                                                  min_samples_leaf=msl,
                                                  min_samples_split=mss,
                                                  random_state=7)
                results = model_selection.cross_val_score(clf,
                                                          x_train,
                                                          y_train,
                                                          cv=kfold)
                mean_score = results.mean()
                # `>=` (not `>`): among tied candidates the one visited last
                # is the one that is kept.
                if mean_score >= top_mean:
                    top_mean = mean_score
                    t_std = results.std()
                    best_params = [crit, md, msl, mss]

# Print best results (mean and standard deviation of the fold scores)
print(top_mean, t_std)
# Print chosen parameters: [criterion, max_depth, min_samples_leaf, min_samples_split]
print(best_params)

In [None]:
# Apply chosen hyper parameters and run on test set.
# best_params is [criterion, max_depth, min_samples_leaf, min_samples_split].
# Fixed seed: the tree builder permutes the features at random while searching
# for splits, so without it the fitted tree (and the accuracy printed below)
# can change between runs.
clf = tree.DecisionTreeClassifier(criterion = best_params[0],
                                  max_depth=best_params[1],
                                  min_samples_leaf=best_params[2],
                                  min_samples_split=best_params[3],
                                  random_state=7)
# Fit on training data
clf.fit(x_train, y_train)
# Predict on test data
y_pred = clf.predict(x_test)
# Calculate accuracy
print('Acc:', accuracy_score(y_test, y_pred))

In [None]:
# Pair every importance with its column name; the importance comes first in
# each tuple so that sorting the tuples orders them by importance
feature_importance = list(zip(clf.feature_importances_, x_train.columns))
# Sorted copy, most important feature first
s_feature_importance = list(feature_importance)
s_feature_importance.sort(reverse=True)
for importance, feature in s_feature_importance:
    print('{0:.2f} :: {1}'.format(importance, feature))

In [None]:
def plot_confusion(y_test, y_pred, norm=True, filename='q3_a_conf.eps'):
    """Plot, save and show the confusion matrix of a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.
    norm : bool, default True
        If True, every row is divided by its sum, so each cell is the
        fraction of that true class assigned to the predicted class and is
        annotated with two decimals.  If False, raw counts are shown.
    filename : str, default 'q3_a_conf.eps'
        Path the figure is written to.  The output format is inferred from
        the extension, so the default still produces an EPS file.
    """
    cnf_matrix = confusion_matrix(y_test, y_pred)
    if norm:
        row_totals = cnf_matrix.sum(axis=1)[:, np.newaxis]
        # A class that is predicted but never occurs in y_test yields an
        # all-zero row; dividing by 1 there keeps the row at 0 instead of
        # producing NaN cells and a divide-by-zero warning.
        row_totals[row_totals == 0] = 1
        cnf_matrix = cnf_matrix.astype('float') / row_totals
        fmt = ".2f"
    else:
        fmt = "d"
    sns.heatmap(cnf_matrix, annot=True, fmt=fmt, cmap="YlGnBu")
    # Axis labels: rows are the true class, columns the predicted class
    plt.ylabel('Classe real')
    plt.xlabel('Classe predita')
    # Write the figure to disk, then display it
    plt.savefig(filename)
    plt.show()

In [None]:
# Row-normalised confusion matrix of the test-set predictions
plot_confusion(y_test, y_pred, norm=True)

In [None]:
# Print features: no feature names are passed to export_graphviz, so the
# nodes of the exported tree refer to features by position and this legend
# maps each index back to its column name
for i, feature in enumerate(x_train.columns):
    print('X{} :: {}'.format(i, feature))
# Export the fitted tree as DOT source into an in-memory buffer.
# NOTE(review): StringIO comes from sklearn.externals.six, which was removed
# in scikit-learn 0.23; io.StringIO is a drop-in replacement.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Render the graph once, straight to disk
graph.write_png("q3_a_tree.png")
# The Image must be the last expression of the cell to be displayed; as an
# intermediate statement its value is discarded and nothing is shown.
Image(filename="q3_a_tree.png")