# Applied Machine Learning: Module 3 (evaluation)

## Evaluation for classification

### Preamble

In [3]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

# Load the digits dataset that ships with scikit-learn.
dataset = load_digits()

# Feature matrix and the original multi-class digit labels.
X = dataset.data
y = dataset.target

# Report how many samples belong to each digit class.
samples_per_class = np.bincount(dataset.target)
for class_name, class_count in zip(dataset.target_names, samples_per_class):
    print(class_name, class_count)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [6]:
# Build an imbalanced binary target from the multi-class digit labels:
#   negative class (0) -> 'not digit 1'
#   positive class (1) -> 'digit 1'

# The boolean mask cast back to the label dtype yields 1 where the digit is 1
# and 0 everywhere else; `y` itself is left untouched.
y_binary_imbalanced = (y == 1).astype(y.dtype)

print('Original labels :\t', y[1:30])
print('New binary labels:\t', y_binary_imbalanced[1:30])

Original labels :	 [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
New binary labels:	 [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [7]:
# Class counts after binarization: index 0 is the negative class ('not digit 1'),
# index 1 is the positive class ('digit 1').
np.bincount(y_binary_imbalanced)

array([1615,  182])

In [14]:
from sklearn.svm import SVC

# Split the imbalanced binary problem into train and test parts;
# random_state is pinned so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary_imbalanced,
    random_state=0,
)

# Fit an RBF-kernel support vector classifier and report its test-set accuracy.
clf = SVC(kernel='rbf', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9088888888888889

### Dummy classifiers 

DummyClassifier is a classifier that makes predictions using simple rules. It is useful as a baseline for comparison against actual classifiers, especially with imbalanced classes.

In [17]:
from sklearn.dummy import DummyClassifier

# Negative class (0) is the most frequent, so the 'most_frequent' strategy
# predicts 0 for every sample regardless of its features.
dummy_majority = DummyClassifier(strategy='most_frequent')
dummy_majority.fit(X_train, y_train)

# Predict on the held-out test split rather than the training data, so the
# baseline predictions line up element-for-element with y_test, which is what
# the classifiers in this notebook are scored against.
y_dummy_prediction = dummy_majority.predict(X_test)
y_dummy_prediction

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
# Test-set score of the majority-class baseline; real classifiers should be
# judged against this number rather than against 0.
dummy_majority.score(X_test, y_test)

0.9044444444444445

In [20]:
# Same train/test split as before, this time with a linear-kernel SVC;
# the cell displays its accuracy on the test set.
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.9777777777777777