In [None]:
import seaborn
import pandas
import matplotlib.pyplot as plt
import numpy as np

## Loading and visualizing the data

In [None]:
def load_data(split_frac=0.7, random_state=0):
    '''Load the iris dataset and split it into a training and a test frame.

    Parameters
    ----------
    split_frac : float
        Fraction of the rows that is sampled into the training frame.
    random_state : int
        Seed handed to ``DataFrame.sample``. The default of 0 reproduces
        the split that was previously hard-coded.

    Returns
    -------
    training, test : tuple of dataframes
        ``training`` holds the sampled rows, ``test`` holds every row of
        the dataset that was not sampled.
    '''
    data = seaborn.load_dataset('iris')

    # Sample the training rows, then drop exactly those rows (by index)
    # so that the two frames do not share any sample.
    training = data.sample(frac=split_frac, random_state=random_state)
    test = data.drop(training.index)
    return training, test


In [None]:
# NOTE(review): load_data defaults to split_frac=0.7; 0.2 puts only a fifth
# of the rows into the training frame -- confirm this is intended.
training_data, test_data = load_data(split_frac=0.2)

n_training_samples = len(training_data)
print('Training samples: {}'.format(n_training_samples))


In [None]:
# Show the first ten rows of the training frame as a table.
first_training_rows = training_data.head(10)
display(first_training_rows)

# Pairwise plots of the training frame, coloured by species.
seaborn.pairplot(data=training_data, hue='species')
plt.show()

We first have to prepare the dataset for training.
Scikit-learn estimators expect an array with the input features and, for supervised methods, an array with the corresponding targets.
In addition, the class labels are currently strings, which we'll encode as integers.

In [None]:
from sklearn.preprocessing import LabelEncoder

def prepare_for_training(data):
    '''Split a dataframe into a feature frame and integer class labels.

    The 'species' column is the target: it is removed from the returned
    features and its values are encoded with integers from [0, .., n-1].
    A new ``LabelEncoder`` is fitted on ``data`` in every call, so the
    encoding is derived only from the classes present in ``data``.

    Returns
    -------
    features, labels : tuple
        The dataframe without the target column, and the encoded targets.
    '''
    label_column = 'species'

    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(data[label_column])

    feature_frame = data.drop(columns=label_column)
    return feature_frame, encoded_labels


train_features, train_labels = prepare_for_training(training_data)
print('Feature shape:', train_features.shape)
print('Label shape: ', train_labels.shape)

# Rich display renders the frame as a table instead of plain text.
display(train_features.head())
print(train_labels[:5])

# Class balance of the training split. Taking the classes and their counts
# from a single np.unique call keeps both arrays the same length.
classes, class_counts = np.unique(train_labels, return_counts=True)
plt.bar(classes, class_counts)
plt.xticks(classes)
plt.xlabel('Encoded class label')
plt.ylabel('Number of training samples')
plt.title('Class distribution in the training set')
plt.show()

## Fitting a classifier and predicting with it

We'll start with a simple k-nearest neighbours classifier ```KNeighborsClassifier```. In scikit-learn, all estimators use

```
clf.fit(...)
```

for fitting the model to the given training data and

```
clf.predict(...)
```

or

```
clf.predict_proba(...)
```

for predicting with a fitted (i.e. trained) estimator. The algorithm is typically parametrized in the constructor or through ```set_params```.

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 6, fitted on the training split.
classifier = KNeighborsClassifier(n_neighbors=6)  # TODO Replace
classifier.fit(train_features, train_labels)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import itertools


def plot_confusion_matrix(labels_true, labels_pred):
    '''Plot the confusion matrix of true vs. predicted labels and print a report.

    A 2x2 figure is created: the confusion matrix in the top-left panel,
    a horizontal bar chart of the true label counts to its right, and a
    bar chart of the predicted label counts below it. The bottom-right
    panel is switched off. The function also prints the
    ``classification_report`` and the accuracy, computed as the sum of
    the matrix diagonal divided by the number of true labels.

    The figure is only drawn, not shown; nothing is returned.
    '''
    matrix = confusion_matrix(labels_true, labels_pred)
    n_rows, n_cols = matrix.shape

    layout = dict(width_ratios=[3, 1], height_ratios=[3, 1], hspace=0.25)
    _, panels = plt.subplots(2, 2,
                             gridspec_kw=layout,
                             figsize=(8, 8),
                             sharey='row',
                             sharex='col')
    ax_matrix = panels[0, 0]
    ax_true = panels[0, 1]
    ax_pred = panels[1, 0]
    ax_unused = panels[1, 1]

    ax_matrix.matshow(matrix)
    ax_matrix.set_xlabel('Predicted')
    ax_matrix.set_ylabel('True')

    # Annotate every cell with its count. Cells below the midpoint of the
    # value range get white text, all others black.
    lowest = matrix.min()
    midpoint = 0.5 * (matrix.max() - lowest) + lowest
    for x_pos in range(n_rows):
        for y_pos in range(n_cols):
            # text() takes (x, y) while the matrix is indexed [row, column],
            # hence the swapped order of the two indices.
            count = matrix[y_pos, x_pos]
            if count < midpoint:
                text_color = 'white'
            else:
                text_color = 'black'
            ax_matrix.text(x_pos, y_pos, '{}'.format(count),
                           horizontalalignment='center',
                           color=text_color)

    true_counts = np.bincount(labels_true)
    ax_true.barh(np.arange(true_counts.size), true_counts)
    ax_true.set_title('True labels')

    pred_counts = np.bincount(labels_pred)
    ax_pred.bar(np.arange(pred_counts.size), pred_counts)
    ax_pred.set_title('Predicted labels')

    ax_unused.axis('off')

    # Put the same limits on the matrix panel and on the bar chart that
    # shares the respective axis with it.
    half_cell = 0.5
    x_limits = (-half_cell, n_rows - half_cell)
    y_limits = (-half_cell, n_cols - half_cell)
    ax_matrix.set_xlim(*x_limits)
    ax_pred.set_xlim(*x_limits)
    ax_matrix.set_ylim(*y_limits)
    ax_true.set_ylim(*y_limits)

    print(classification_report(labels_true, labels_pred))
    accuracy = np.diag(matrix).sum() / len(labels_true)
    print('Accuracy: {:1.2f}'.format(accuracy))

How good is the performance on the training set?

In [None]:
# Predict on the very same samples the classifier was fitted on.
train_predictions = classifier.predict(train_features)
plot_confusion_matrix(train_labels, train_predictions)
plt.show()

Measuring performance on the training set is going to be overly optimistic.

We should estimate the performance with a validation split. ```scikit-learn```
already provides us with utility functions for running a cross-validation.

In [None]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation on the training split; the scores on the
# training folds are requested as well.
cv_scores = cross_validate(
    classifier,
    train_features,
    train_labels,
    cv=10,
    return_train_score=True,
)

# Report the mean and the per-split values for both score types.
report_template = '{}:\n\tmean={:1.3f}\n\tsplits=[{}]'
for score_name in ('train_score', 'test_score'):
    split_scores = cv_scores[score_name]
    formatted_splits = ', '.join(['{:1.3f}'.format(score) for score in split_scores])
    print(report_template.format(score_name, np.mean(split_scores), formatted_splits))


## Exercises

* Choose another classifier from ```scikit-learn``` ([here](https://scikit-learn.org/stable/modules/classes.html)) and replace the ```KNeighborsClassifier``` in the above code with it. 
    * What are its main hyperparameters?
    * How do they affect the outcome on the validation split?
    * What does it mean in terms of model complexity?

* Cross-validation is a great way to optimize hyperparameters. Again, ```scikit-learn``` provides utility functions that are tailored to this. Have a look at the documentation for [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV) and use it to tune the main parameter(s) of your classifier.

In [None]:
from sklearn.model_selection import GridSearchCV

# TODO ...

## Final test

Finally, let's measure the performance on the test set. How does it compare to your training and validation performance?

In [None]:
raise RuntimeError("Are you sure you already want to test your classifier?")  # TODO comment out

n_test_samples = len(test_data)
print('Test samples: {}'.format(n_test_samples))

# NOTE(review): prepare_for_training fits a new LabelEncoder on the frame it
# is given, so the integer codes here match the training codes only if the
# test split contains the same set of species -- verify.
test_features, test_labels = prepare_for_training(test_data)
print(test_features.shape, test_labels.shape)

test_predictions = classifier.predict(test_features)
plot_confusion_matrix(test_labels, test_predictions)
plt.show()
