# ___

# [ Machine Learning in Geosciences ]

**Department of Applied Geoinformatics and Cartography, Charles University** 

*Lukas Brodsky lukas.brodsky@natur.cuni.cz*

    
___



# Classification Performance Measures

Goal: run classification algorithms, evaluate the models with a set of performance measures, and interpret their quality.

Content: 

    * training binary classifier; 
    * choosing appropriate metric for given task; 
    * evaluating classifiers using cross-validation (and StratifiedKFold); 
    * selection of precision / recall tradeoff that fits the needs; 
    * models comparison using ROC curve (and AUC scores); 



### Lab setup

In [None]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "./"
if os.path.isdir(PROJECT_ROOT_DIR):
    print('Ok continue.')
else:
    print('Nok, set correct path to your project directory!')


### MNIST

Use `fetch_openml()` to read MNIST data. MNIST (Modified National Institute of Standards and Technology database) is a large database of handwritten digits that is commonly used for training various image processing systems. 

Warning: downloading the data takes a while! 

In [None]:
# WARNING: this step takes a long time!
# Read data 
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8) 
# mnist["data"], mnist["target"]

In [None]:
# function sorting the records
def sort_by_target(mnist, n_train=60000):
    """Sort the training part and the test part of ``mnist`` by label.

    The first ``n_train`` records (training part) and the remaining records
    (test part) are each reordered by ascending target value. Records that
    share a target keep their original relative order, which is the same
    ordering as sorting ``(target, index)`` pairs.

    The reordered data and targets are assigned back to ``mnist.data`` and
    ``mnist.target``. pandas containers stay pandas containers (with a fresh
    0..n-1 index); anything else is handled as a numpy array.

    Parameters
    ----------
    mnist : object with ``data`` and ``target`` attributes
        Dataset container whose ``data`` rows correspond to ``target``
        entries.
    n_train : int, default 60000
        Number of leading records that form the training part.

    Returns
    -------
    None
        The dataset container is updated in place.
    """
    targets = np.asarray(mnist.target)

    # A stable argsort by target is equivalent to sorting (target, index) pairs.
    train_order = np.argsort(targets[:n_train], kind="stable")
    test_order = np.argsort(targets[n_train:], kind="stable") + n_train
    order = np.concatenate([train_order, test_order])

    # Assign the reordered containers back instead of writing into the result
    # of ``to_numpy()``: that result is not guaranteed to be a writable view
    # of the underlying frame, so writes to it may be lost or rejected.
    if hasattr(mnist.data, "iloc"):
        mnist.data = mnist.data.iloc[order].reset_index(drop=True)
    else:
        mnist.data = np.asarray(mnist.data)[order]

    if hasattr(mnist.target, "iloc"):
        mnist.target = mnist.target.iloc[order].reset_index(drop=True)
    else:
        mnist.target = targets[order]

In [None]:
# fetch_openml() returns an unsorted dataset
sort_by_target(mnist) 

In [None]:
mnist.data.shape 

In [None]:
type(mnist.data)

In [None]:
# prepare features data and labels 
# X, y = mnist.data, mnist.target
X, y = mnist.data.to_numpy(), mnist.target.to_numpy()
X.shape, y.shape

In [None]:
# MNIST images dimension
28*28

In [None]:
X[36000].shape

In [None]:
# X[36000].reshape(28, 28)

In [None]:
# Check one record
some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap = mpl.cm.binary,
           interpolation="nearest")
plt.axis("off")

In [None]:
# Which number is it? 

In [None]:
# check the corresponding label
y[36000]

In [None]:
# plot some digits to see the data set visually
def plot_digit(data):
    """Draw a single digit record as a 28x28 image with the axes hidden.

    ``data`` is reshaped to (28, 28) and rendered with the binary colormap
    and nearest-neighbour interpolation on the current matplotlib axes.
    """
    plt.imshow(
        data.reshape(28, 28),
        cmap=mpl.cm.binary,
        interpolation="nearest",
    )
    plt.axis("off")

In [None]:
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    """Draw several digit records as one mosaic image with the axes hidden.

    Each instance is reshaped to 28x28 and the tiles are laid out row by row,
    ``images_per_row`` tiles per row (fewer if there are fewer instances).
    The last row is padded with zeros so that all rows have equal width.
    Extra keyword arguments are forwarded to ``plt.imshow``.
    """
    side = 28
    n_instances = len(instances)
    per_row = min(n_instances, images_per_row)
    tiles = [instance.reshape(side, side) for instance in instances]
    n_rows = (n_instances - 1) // per_row + 1

    # One extra all-zero tile fills the unused slots of the last row; it has
    # zero width (and falls outside every row slice) when the grid is full.
    missing = n_rows * per_row - n_instances
    tiles.append(np.zeros((side, side * missing)))

    strips = [
        np.concatenate(tiles[start:start + per_row], axis=1)
        for start in range(0, n_rows * per_row, per_row)
    ]
    mosaic = np.concatenate(strips, axis=0)

    plt.imshow(mosaic, cmap=mpl.cm.binary, **options)
    plt.axis("off")

In [None]:
# Plot all figures 
plt.figure(figsize=(9,9))
example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]
plot_digits(example_images, images_per_row=10)
# save_fig("more_digits_plot")
# plt.show()

In [None]:
# Split train and test data sets of X and y
import numpy as np

# thr = 60000
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

shuffle_index = np.random.permutation(60000)
print(shuffle_index)

X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Binary classifier

We start the classification exercise with a binary classifier. 
Task: develop a model that detects only images of the number 5. 

In [None]:
# Create a mask for number 5 
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [None]:
y_test_5

In [None]:
# Use simple Linear model classifier 
# Train it with Stochastic Gradient Descent (SGD) 
from sklearn.linear_model import SGDClassifier

# tol=None disables the tolerance-based stopping criterion, so training runs
# for exactly max_iter epochs. It replaces tol=-np.infty: the np.infty alias
# was removed in NumPy 2.0, and recent scikit-learn releases reject a
# negative tol during parameter validation.
sgd_clf = SGDClassifier(max_iter=6, tol=None, random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
# Check prediction on sample 
sgd_clf.predict([some_digit])

In [None]:
# correct? 

In [None]:
# Cross-validate the model with accuracy scoring, e.g. cv=3 
# sklearn.model_selection.cross_val_score(estimator, X, y=None, scoring=None, cv=None, 
# n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs')

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

# score(X, y[, sample_weight]) .. return the mean accuracy on the given data and labels. 

# Update example! 
# For int/None inputs, if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. 
# In all other cases, KFold is used.

How good is the model? 

In [None]:
# Run stratified sampling at each run 
# Write the process explicitly 

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# set number of splits (folds), e.g. 3
n_splits = 3
skfolds = StratifiedKFold(n_splits=n_splits)

# tol=None disables the tolerance-based stopping criterion, so every fold
# trains for exactly max_iter epochs. It replaces tol=-np.infty: the np.infty
# alias was removed in NumPy 2.0, and recent scikit-learn releases reject a
# negative tol during parameter validation.
sgd_clf_ = SGDClassifier(max_iter=6, tol=None, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Fit a fresh, unfitted copy of the model on every fold.
    clone_clf = clone(sgd_clf_)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # accuracy assessment: share of correctly predicted samples in this fold
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

In [None]:
# What is the model performance? 

#### Think of the classification problem set up! 

In [None]:
# Create a dummy classifier that returns always zero! 
# Why? 

from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Dummy baseline that predicts False ("not a 5") for every sample."""

    def fit(self, X, y=None):
        """Do nothing: this baseline has no parameters to learn."""
        return None

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        n_samples = len(X)
        return np.full((n_samples, 1), False)

In [None]:
Never5Classifier()

In [None]:
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

## Why?

In [None]:
# ... 

In [None]:
# Create a dummy classifier that returns always one! 
# Why? 
from sklearn.base import BaseEstimator
class Always5Classifier(BaseEstimator):
    """Dummy baseline that predicts True ("is a 5") for every sample."""

    def fit(self, X, y=None):
        """Do nothing: this baseline has no parameters to learn."""
        return None

    def predict(self, X):
        """Return an all-True boolean array of shape (len(X), 1)."""
        n_samples = len(X)
        return np.full((n_samples, 1), True)

In [None]:
always_5_clf = Always5Classifier()
cross_val_score(always_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

### What is the accuracy? How good is the model? 

### Confusion Matrix

In [None]:
# Run model prediction with cross-validation (e.g. cv=3)
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

In [None]:
# print confusion matrix 

from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred)

In [None]:
# How good is the model? 

### Precision & Recall 

In [None]:
# Use sklearn precision and recall methods 
from sklearn.metrics import precision_score, recall_score

precision_score(y_train_5, y_train_pred)

In [None]:
# Check 
# Precision = TP / (TP + FP)

In [None]:
recall_score(y_train_5, y_train_pred)

In [None]:
# Check 
# Recall = TP / (TP + FN) 

### F1 score

In [None]:
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

In [None]:
# Check 
# F1 = TP / (TP + (FN + FP) / 2)

### Precision & Recall Tradeoff
Plot the precision - recall function for selected 'some_digit'

In [None]:
# Run cross validation with method="decision_function"  to get scores 
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method="decision_function")
# Predict confidence scores for samples.

In [None]:
y_scores[36000]


In [None]:
# Prepare precision / recall scores for the plot 
# Compute precision-recall pairs for different probability thresholds.
# Note: this implementation is restricted to the binary classification task.

# sklearn.metrics.precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight=None)

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
# thresholds .. increasing thresholds on the decision function used to compute precision and recall. 

In [None]:
print(np.min(thresholds))
print(np.max(thresholds))

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last entry of ``precisions`` and ``recalls`` is dropped before
    plotting so that both curves are drawn against ``thresholds``.
    The y-axis is fixed to the range [0, 1].
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, fmt, name in curves:
        plt.plot(thresholds, values[:-1], fmt, label=name, linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-700000, 700000])

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw the precision and recall curves over the decision thresholds.

    Both input curves are trimmed by their last element before being plotted
    against ``thresholds``; the y-axis is limited to [0, 1].
    """
    line_kwargs = {"linewidth": 2}
    trimmed_precisions = precisions[:-1]
    trimmed_recalls = recalls[:-1]

    plt.plot(thresholds, trimmed_precisions, "b--", label="Precision", **line_kwargs)
    plt.plot(thresholds, trimmed_recalls, "g-", label="Recall", **line_kwargs)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-150000, 150000])

In [None]:
# Goal: model with 90% precision 
# thr ~ 150 000
y_train_pred_90 = (y_scores > 150000)

In [None]:
precision_score(y_train_5, y_train_pred_90)

In [None]:
recall_score(y_train_5, y_train_pred_90)

In [None]:
# Find a point where precision equals the recall and set the Threshold 
# and re-compute performance measures 

# NOTE: `thresholds` must be the array returned by precision_recall_curve;
# the ROC cell further down reassigns the same name, so re-run the
# precision_recall_curve cell first if the ROC cell was executed before.
#
# The last precision/recall pair has no matching threshold, so it is dropped
# to keep the index aligned with `thresholds`.
pr_gap = np.abs(precisions[:-1] - recalls[:-1])
eq = np.argmin(pr_gap)          # index where precision and recall are closest
eq_threshold = thresholds[eq]   # decision-function value at that index

y_train_pr = (y_scores >= eq_threshold)
print(precision_score(y_train_5, y_train_pr))
print(recall_score(y_train_5, y_train_pr))

# ROC curves

In [None]:
# Plot ROC and compare Linear - SGD model with Random Forest 

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (TPR against FPR) together with the dashed diagonal.

    ``label`` is passed to the curve so it can appear in a legend. Both axes
    are fixed to the range [0, 1].
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot(diagonal, diagonal, 'k--')
    plt.axis([0, 1, 0, 1])

    axis_titles = (
        (plt.xlabel, 'False Positive Rate'),
        (plt.ylabel, 'True Positive Rate'),
    )
    for set_title, text in axis_titles:
        set_title(text, fontsize=16)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
# save_fig("roc_curve_plot")
# plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_scores)

### RandomForestClassifier

**Note**: we set `n_estimators=10` to avoid a warning about the fact that its default value will be set to 100.

In [None]:
# takes a while 
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")

In [None]:
# score = probability of positive class
y_scores_forest = y_probas_forest[:, 1]  
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)


In [None]:
# Compare Linear (SGD)  model with RF 
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="Linear-SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)


In [None]:
roc_auc_score(y_train_5, y_scores_forest)

In [None]:
# RandomForest: F1, Precision & Recall 
# Task: 
pass

-------------------------------------------