# Exercise I: Decision Tree Classifier

## Import libraries 

In [2]:
import pickle
from time import time
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import tree
from scipy.stats import uniform, norm, randint as sp_randint
from sklearn.preprocessing import OneHotEncoder



## Load balance-scale dataset

In [3]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data', sep= ',', header= None)
dataset = 'balance-scale'

## Load and prepare Stomach Cancer dataset

In [4]:
data = pd.read_csv('cancer.csv', sep=',')
data.drop('Subject', axis=1, inplace=True)
data.drop('HISTOPATOLÓGICO', axis=1, inplace=True)
dataset = 'cancer'
list(data[0:0])

['Grupo',
 'Edad',
 'Sexo',
 'rs6983267_8q24',
 'rs1447295_8q24',
 'rs4733616_8q24',
 'rs7903146_TCF7L2',
 'rs12255372_TCF7L2',
 'rs2910164_miR146a',
 ' rs2292832_miR149']

## Verify dataset dimension

In [5]:
print("Dataset length: ", len(data))
print("Dataset shape: ", data.shape)

Dataset length:  199
Dataset shape:  (199, 10)


## Split train and test data

In [6]:
X = data.iloc[:, range(1, data.shape[1])]
Y = data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

## Train decision tree with criterion Gini index

In [7]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

## Train decision tree with criterion information gain

In [8]:
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
clf_entropy.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

## Prediction

### Single instance sample dataset

In [9]:
if dataset == 'balance-data':
    clf_gini.predict([[4, 4, 3, 3]])

### Gini index

In [11]:
y_pred = clf_gini.predict(X_test)
X_test

Unnamed: 0,Edad,Sexo,rs6983267_8q24,rs1447295_8q24,rs4733616_8q24,rs7903146_TCF7L2,rs12255372_TCF7L2,rs2910164_miR146a,rs2292832_miR149
126,83,1,0,2,1,0,0,2,2
104,59,0,0,2,0,1,1,1,2
99,65,0,0,2,0,1,2,0,2
92,79,1,0,2,0,1,1,2,2
111,53,0,0,2,0,0,0,2,2
166,78,1,1,2,0,0,0,1,2
116,55,0,1,2,2,0,0,2,0
96,71,0,0,2,0,0,0,0,0
52,75,1,0,1,1,0,1,1,1
69,48,1,1,2,1,1,1,2,1


### Information gain

In [16]:
y_pred_en = clf_entropy.predict(X_test)
y_pred_en

array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

## Evaluation

### Gini index

In [17]:
print("Accuracy: ", accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

Accuracy:  68.33333333333333
             precision    recall  f1-score   support

          0       0.74      0.76      0.75        37
          1       0.59      0.57      0.58        23

avg / total       0.68      0.68      0.68        60



### Information gain

In [18]:
print("Accuracy: ", accuracy_score(y_test, y_pred_en)*100)
print(classification_report(y_test, y_pred))

Accuracy:  68.33333333333333
             precision    recall  f1-score   support

          0       0.74      0.76      0.75        37
          1       0.59      0.57      0.58        23

avg / total       0.68      0.68      0.68        60



# Exercise II: KNN

## Train KNN

In [8]:
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## Predict

In [9]:
y_pred_knn = knn_clf.predict(X_test)

## Evaluate

In [10]:
labels = sorted(list(set(y_pred_knn) | set(y_test)))
cm = confusion_matrix(y_test, y_pred_knn, labels=labels)

In [11]:
print(classification_report(y_test, y_pred_knn))

total = len(y_test)
n_cls = len(labels)

print("Accuracy: ", accuracy_score(y_test, y_pred_knn)*100, "\n")
for i in range(n_cls):
    tp = cm[i, i]
    fp = sum(cm[j, i] for j in range(n_cls) if i != j)
    tn = sum(cm[j, j] for j in range(n_cls) if i != j)
    fn = sum(cm[i, j] for j in range(n_cls) if i != j)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    print("Class ", labels[i])
    print("------------")
    print("Sensitivity: ", sensitivity*100)
    print("Specificity: ", specificity*100)
    print()


             precision    recall  f1-score   support

          0       0.68      0.46      0.55        37
          1       0.43      0.65      0.52        23

avg / total       0.58      0.53      0.54        60

Accuracy:  53.333333333333336 

Class  0
------------
Sensitivity:  45.94594594594595
Specificity:  65.21739130434783

Class  1
------------
Sensitivity:  65.21739130434783
Specificity:  45.94594594594595



## Random Forest

In [12]:
rf_clf= RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

             precision    recall  f1-score   support

          0       0.71      0.54      0.62        37
          1       0.47      0.65      0.55        23

avg / total       0.62      0.58      0.59        60



### RandomSearch and GridSearch

In [13]:
clf = RandomForestClassifier()

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 9),
              "min_samples_split": sp_randint(2, 9),
              "min_samples_leaf": sp_randint(1, 9),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 9],
              "min_samples_split": [2, 3, 9],
              "min_samples_leaf": [1, 3, 9],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)


RandomizedSearchCV took 2.93 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.633 (std: 0.027)
Parameters: {'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 6, 'criterion': 'entropy', 'bootstrap': False, 'max_features': 2}

Model with rank: 2
Mean validation score: 0.626 (std: 0.044)
Parameters: {'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 3, 'criterion': 'gini', 'bootstrap': True, 'max_features': 4}

Model with rank: 2
Mean validation score: 0.626 (std: 0.085)
Parameters: {'max_depth': None, 'min_samples_split': 7, 'min_samples_leaf': 3, 'criterion': 'entropy', 'bootstrap': True, 'max_features': 8}

GridSearchCV took 30.50 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.669 (std: 0.091)
Parameters: {'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 1, 'criterion': 'gini', 'bootstrap': True, 'max_features': 3}

Model with rank: 1
Mean validation score: 0.669 

In [14]:
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

y_pred = random_search.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.74      0.78      0.76        37
          1       0.62      0.57      0.59        23

avg / total       0.70      0.70      0.70        60

             precision    recall  f1-score   support

          0       0.64      0.38      0.47        37
          1       0.39      0.65      0.49        23

avg / total       0.54      0.48      0.48        60



### Pickle

In [15]:
pickle.dump(clf_gini, open('model', 'wb'))

In [27]:
data = np.array([11, 11, 1, 1, 1, 11, 1, 1, 1]).reshape(1, -1)
rf_clf.predict(data)

array([1])