# Exercise I: Decision Tree Classifier

## Import libraries 

In [30]:
import pickle
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import tree

## Load balance-scale dataset

In [2]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data', sep= ',', header= None)
dataset = 'balance-scale'

## Load and prepare Stomach Cancer dataset

In [3]:
data = pd.read_csv('cancer.csv', sep=',')
data.drop('Subject', axis=1, inplace=True)
data.drop('HISTOPATOLÓGICO', axis=1, inplace=True)
dataset = 'cancer'

## Verify dataset dimension

In [4]:
print("Dataset length: ", len(data))
print("Dataset shape: ", data.shape)

Dataset length:  199
Dataset shape:  (199, 10)


## Split train and test data

In [5]:
X = data.iloc[:, range(1, data.shape[1])]
Y = data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

## Train decision tree with criterion Gini index

In [47]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5)
clf_gini_gscv = GridSearchCV(estimator=clf_gini, param_grid={
    'random_state': list(range(99, 101, 1)),
    'max_depth': list(range(1, 3, 1)),
    'min_samples_leaf': list(range(1, 4, 1)),
})
clf_gini.fit(X_train, y_train)
clf_gini_gscv.fit(X_train, y_train)

KeyboardInterrupt: 

## Train decision tree with criterion information gain

In [None]:
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
clf_entropy_gscv = GridSearchCV(estimator=clf_entropy, param_grid={
    'random_state': list(range(0, 201, 1)),
    'max_depth': list(range(1, 10, 1)),
    'min_samples_leaf': list(range(1, 10, 1)),
})
clf_entropy.fit(X_train, y_train)
clf_entropy_gscv.fit(X_train, y_train)

## Prediction

### Single instance sample dataset

In [37]:
if dataset == 'balance-data':
    clf_gini.predict([[4, 4, 3, 3]])

### Gini index

In [39]:
y_pred = clf_gini.predict(X_test)
y_pred_gscv = clf_gini_gscv.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

### Information gain

In [40]:
y_pred_en = clf_entropy.predict(X_test)
y_pred_en_gscv = clf_entropy_gscv.predict(X_test)
y_pred_en

array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

## Evaluation

### Gini index

In [41]:
print("Accuracy: ", accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))
print("Accuracy (Grid search CV) ", accuracy_score(y_test, y_pred_gscv)*100)
print(classification_report(y_test, y_pred_gscv))

Accuracy:  68.33333333333333
             precision    recall  f1-score   support

          0       0.74      0.76      0.75        37
          1       0.59      0.57      0.58        23

avg / total       0.68      0.68      0.68        60



### Information gain

In [42]:
print("Accuracy: ", accuracy_score(y_test, y_pred_en)*100)
print(classification_report(y_test, y_pred))
print("Accuracy (Grid search CV) ", accuracy_score(y_test, y_pred_en_gscv)*100)
print(classification_report(y_test, y_pred_en_gscv))

Accuracy:  68.33333333333333
             precision    recall  f1-score   support

          0       0.74      0.76      0.75        37
          1       0.59      0.57      0.58        23

avg / total       0.68      0.68      0.68        60



# Exercise II: KNN

## Train KNN

In [23]:
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## Predict

In [24]:
y_pred_knn = knn_clf.predict(X_test)

## Evaluate

In [25]:
labels = sorted(list(set(y_pred_knn) | set(y_test)))
cm = confusion_matrix(y_test, y_pred_knn, labels=labels)

In [26]:
print(classification_report(y_test, y_pred))

total = len(y_test)
n_cls = len(labels)

print("Accuracy: ", accuracy_score(y_test, y_pred_knn)*100, "\n")
for i in range(n_cls):
    tp = cm[i, i]
    fp = sum(cm[j, i] for j in range(n_cls) if i != j)
    tn = sum(cm[j, j] for j in range(n_cls) if i != j)
    fn = sum(cm[i, j] for j in range(n_cls) if i != j)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    print("Class ", labels[i])
    print("------------")
    print("Sensitivity: ", sensitivity*100)
    print("Specificity: ", specificity*100)
    print()


             precision    recall  f1-score   support

          0       0.74      0.76      0.75        37
          1       0.59      0.57      0.58        23

avg / total       0.68      0.68      0.68        60

Accuracy:  53.333333333333336 

Class  0
------------
Sensitivity:  45.94594594594595
Specificity:  65.21739130434783

Class  1
------------
Sensitivity:  65.21739130434783
Specificity:  45.94594594594595



In [None]:
## Random Forest

In [28]:
rf_clf= RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

             precision    recall  f1-score   support

          0       0.71      0.68      0.69        37
          1       0.52      0.57      0.54        23

avg / total       0.64      0.63      0.64        60

