# Chapter 3 Classification (p.83)
## Preparation: figure settings

In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Font sizes for axis labels and tick labels, applied in a single update.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to PROJECT_ROOT_DIR/images/CHAPTER_ID/<fig_id>.png.
    The target directory is created first if it does not exist, so the call
    also works on a fresh checkout.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without the ".png" extension).
    tight_layout : bool, optional
        When True (default), call plt.tight_layout() before saving.
    """
    out_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # isdir + makedirs instead of makedirs(exist_ok=True): the notebook keeps
    # Python 2 compatibility via the __future__ import.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    path = os.path.join(out_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

## Preparation : Load MNIST dataset (p.83)

In [3]:
### pip install keras (install this beforehand)
### pip install tensorflow (this may be needed as well?)

In [4]:
from keras.datasets import mnist

Using TensorFlow backend.


In [5]:
# Echo the imported module object to confirm the import worked.
mnist

<module 'keras.datasets.mnist' from 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\datasets\\mnist.py'>

In [6]:
# Load the MNIST train/test split through keras.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Flatten every image to one row and divide by 255 to rescale the pixel values.
# The row count comes from the data itself and -1 infers the pixel count, so
# nothing here depends on the hard-coded 60000 / 10000 / 784 sizes.
X_train = X_train.reshape(len(X_train), -1) / 255.0
X_test = X_test.reshape(len(X_test), -1) / 255.0

In [7]:
# The book uses `some_digit = X[36000]`; here the sample is taken from X_train.
idx = 36000
size = 28
# Coordinate grids covering the size x size image.
a, b = np.meshgrid(np.arange(size), np.arange(size))
# Reshape the flat row into a 2-D image and reverse the row order
# (presumably so the digit appears upright when plotted against the grids -- TODO confirm).
some_digit_image = X_train[idx].reshape(size, size)[::-1, :]
#print('number:{}'.format(y_train[idx]))



In [8]:
# Boolean targets for the "5 vs. not-5" detector: True where the label is 5.
y_train_5 = np.equal(y_train, 5)
y_test_5 = np.equal(y_test, 5)

In [9]:
from sklearn.linear_model import SGDClassifier

# SGD-trained classifier; random_state is fixed so runs are repeatable.
sgd_clf = SGDClassifier(random_state=42)
# Fit the binary "is it a 5?" detector on the training images.
sgd_clf.fit(X_train, y_train_5)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [10]:
# Predict for the sample selected by `idx`, rather than repeating the literal
# 36000, so this cell always scores the same digit that was inspected earlier.
sgd_clf.predict([X_train[idx]])

array([False], dtype=bool)

In [11]:
from sklearn.model_selection import cross_val_predict

# 3-fold cross-validated predictions for the training set; used by the
# precision / recall / F1 cells below.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

# Performance measure(p.88)
## Precision and Recall  (p.91)

機械学習でよく使われる評価尺度の種類(分類タスク)

1. AUC
2. Accuracy
3. F-measure
4. Specificity
5. Recall
6. Precision

http://tkdmah.hatenablog.com/entry/2014/02/22/193008

- モデル評価精度は、AUCがよく使われる
- Accuracy=正例・負例を平等に着目した指標
- Precision, Recall, F-measure=正例に着目した指標

$$
\begin{array}{|c||c|c|}
N=60,000 & Predicted:NO & Predicted:YES \\\hline 
Actual:NO & 53943[True~Negatives(TN)] & 636[False~Positives(FP)] \\\
Actual:YES & 1137[False~Negatives(FN)] & 4284[True~Positives(TP)]
\end{array}
$$

$$
\begin{eqnarray}
&& \text{precision} = \frac{TP}{TP + FP} \\
&& \text{recall (sensitivity, True Positive Rate)} = \frac{TP}{TP + FN} \\
&& \text{F-measure} = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}
\end{eqnarray}
$$

In [18]:
from sklearn.metrics import precision_score,recall_score

In [21]:
# Precision = TP / (TP + FP) of the cross-validated predictions.
precision_score(y_train_5, y_train_pred)

0.88860875544350215

In [22]:
# Recall = TP / (TP + FN) of the cross-validated predictions.
recall_score(y_train_5, y_train_pred)

0.71518170079321164

F-measure(F値)は、PrecisionとRecallの調和平均。

3評価尺度の事例
https://siguniang.wordpress.com/2016/01/10/notes-on-information-retrieval-precision-recall-f1-score/

In [23]:
from sklearn.metrics import f1_score

In [25]:
# F1 = harmonic mean of precision and recall for the cross-validated predictions.
f1_score(y_train_5, y_train_pred)

0.79251839738348329

# Precision/Recall tradeoff

![Figure3-3](./F3-3.png) 

In [65]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Hand-rolled 3-fold stratified cross-validation of the SGD classifier.
# random_state is intentionally not passed: it only has an effect together
# with shuffle=True, and recent scikit-learn releases raise a ValueError when
# it is set while shuffle is left at False.
skfolds = StratifiedKFold(n_splits=3)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Fresh, unfitted copy with the same hyper-parameters for every fold.
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorised count of correct predictions (instead of the builtin sum).
    n_correct = np.sum(y_pred == y_test_fold)
    # Per-fold accuracy.
    print(n_correct / len(y_pred))

0.9701
0.97
0.97125


In [18]:
from sklearn.metrics import f1_score
# y_pred holds the predictions for the last cross-validation fold only, so it
# is scored against that fold's labels (y_test_fold); y_train_5 covers the
# whole training set and does not match y_pred in length.
f1_score(y_test_fold, y_pred)

NameError: name 'y_pred' is not defined

5でないクラス(0-4,6-9)の精度を求める。

In [68]:
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts "not 5" for every sample."""

    def fit(self, X, y=None):
        """No-op training step; nothing is learned from X or y.

        Returns self, following the scikit-learn estimator convention, so the
        fitted object can be used directly or in a chained call.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        return np.zeros((len(X), 1), dtype=bool)

In [67]:
# cross_val_score is imported here because no earlier cell brings it into
# scope; without this the cell fails with a NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# 3-fold accuracy of the always-"not 5" baseline.
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

array([ 0.909  ,  0.90745,  0.9125 ])

(deep copy=全部作る, shallow copy=一部作る)

In [78]:
# Show the documentation of sklearn.base.clone (used in the manual CV loop).
help(clone)

Help on function clone in module sklearn.base:

clone(estimator, safe=True)
    Constructs a new estimator with the same parameters.
    
    Clone does a deep copy of the model in an estimator
    without actually copying attached data. It yields a new estimator
    with the same parameters that has not been fit on any data.
    
    Parameters
    ----------
    estimator: estimator object, or list, tuple or set of objects
        The estimator or group of estimators to be cloned
    
    safe: boolean, optional
        If safe is false, clone will fall back to a deepcopy on objects
        that are not estimators.



In [4]:
from sklearn.model_selection import cross_val_predict

# Same computation as the earlier cross_val_predict cell: 3-fold
# cross-validated predictions for the training set.
# Needs sgd_clf, X_train and y_train_5 from the cells above.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

NameError: name 'sgd_clf' is not defined

In [5]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the cross-validated predictions
# (rows: actual class, columns: predicted class).
confusion_matrix(y_train_5, y_train_pred)

NameError: name 'y_train_5' is not defined

In [71]:
# Pretend the classifier is perfect: use the true labels as the predictions.
y_train_perfect_predictions = y_train_5

In [72]:
# With perfect predictions only the main diagonal of the matrix is non-zero.
confusion_matrix(y_train_5, y_train_perfect_predictions)

array([[54579,     0],
       [    0,  5421]])

confusion matrixの参考サイト
https://rasbt.github.io/mlxtend/user_guide/evaluate/confusion_matrix/
http://www.baru-san.net/archives/141


http://tkdmah.hatenablog.com/entry/2014/02/22/193008