# 第7回課題
2段階の train, valid, test にわける交差検証の自前の実装
（パッケージにしたり関数にしたりするのは自由です．解答はしていません）

https://scikit-learn.org/stable/modules/cross_validation.html　で様々なパターンについて述べられています

# 注意事項
train, valid, test の各集合で取り出すサンプル（id）が互いに重複しないことを必ず確認してください

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

In [2]:
# Load the wine dataset via scikit-learn's load_wine; the cells below read its
# "data", "feature_names" and "target" entries.
data = load_wine()

In [3]:
# Build one frame holding the min-max scaled features followed by a "target"
# column. NOTE: the scaler is fit on every row here, i.e. before any
# train/valid/test split is made.
wine_df = pd.DataFrame(
    min_max_scaler.fit_transform(data["data"]),
    columns=data["feature_names"],
).assign(target=data["target"])
print(wine_df.shape)
wine_df.head()

(178, 14)


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,0.842105,0.1917,0.572193,0.257732,0.619565,0.627586,0.57384,0.283019,0.59306,0.372014,0.455285,0.970696,0.561341,0
1,0.571053,0.205534,0.417112,0.030928,0.326087,0.575862,0.510549,0.245283,0.274448,0.264505,0.463415,0.78022,0.550642,0
2,0.560526,0.320158,0.700535,0.412371,0.336957,0.627586,0.611814,0.320755,0.757098,0.375427,0.447154,0.695971,0.646933,0
3,0.878947,0.23913,0.609626,0.319588,0.467391,0.989655,0.664557,0.207547,0.55836,0.556314,0.308943,0.798535,0.857347,0
4,0.581579,0.365613,0.807487,0.536082,0.521739,0.627586,0.495781,0.490566,0.444795,0.259386,0.455285,0.608059,0.325963,0


# モデル（今回は決定木を利用）

In [4]:
from sklearn import tree
# Module-level decision tree classifier with a fixed random_state; cv_valid and
# cv_test refit this same object for every fold.
clf = tree.DecisionTreeClassifier(random_state=0)

# seed の設定

In [5]:
# Seed both Python's `random` module and NumPy's global RNG; cv_valid shuffles
# with np.random, so this fixes the shuffle order from this point on.
seed = 0
random.seed(seed)
np.random.seed(seed)

# k-fold の k は変数で扱うこと

In [6]:
# Number of folds; the same value is used for the outer (test) split and the
# inner (validation) split.
k = 5

# train (valid 含む), test の分割

In [7]:
def split_idx(data_len, k):
    """Partition the positions ``0 .. data_len - 1`` into ``k`` contiguous folds.

    Fold sizes differ by at most one: the first ``data_len % k`` folds receive
    one extra element. When ``data_len < k`` the trailing folds are empty.

    Args:
        data_len: Number of samples to partition.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` NumPy integer arrays, each holding the consecutive
        positions that belong to one fold. The folds are disjoint and together
        cover every position exactly once.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {0!r}".format(k))

    base_size, rest_size = divmod(data_len, k)
    # Boundary i is where fold i starts; each of the first `rest_size` folds is
    # one element longer, which shifts boundary i by min(i, rest_size).
    bounds = [base_size * i + min(i, rest_size) for i in range(k + 1)]
    return [np.arange(start, stop) for start, stop in zip(bounds, bounds[1:])]

In [8]:
# Show the outer (test) folds: k contiguous index ranges over all rows, taken
# in the frame's row order.
split_idx(wine_df.shape[0], k)

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35]),
 array([36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
        53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
        70, 71]),
 array([ 72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107]),
 array([108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
        134, 135, 136, 137, 138, 139, 140, 141, 142]),
 array([143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177])]

# 交差検証
train, valid に分割しつつ，valid で評価

In [9]:
from sklearn.metrics import accuracy_score # The reference solution reports accuracy only; the metric is presumably not prescribed by the assignment (original note is ambiguous)

In [10]:
# Train k times and score each held-out validation fold
def cv_valid(x, y, k):
    """Run k-fold cross-validation on ``(x, y)`` and return per-fold accuracy.

    The rows are shuffled once with NumPy's global RNG and cut into ``k``
    contiguous folds by ``split_idx``. Each fold is held out in turn while the
    module-level ``clf`` is refit on the remaining rows.

    Args:
        x: Feature array, indexed along axis 0 with integer index arrays.
        y: Label array aligned with ``x``.
        k: Number of folds.

    Returns:
        A list with one accuracy score per validation fold, in fold order.

    Side effects:
        Advances NumPy's global RNG and leaves ``clf`` fitted on the training
        rows of the last fold.
    """
    n_samples = x.shape[0]
    folds = split_idx(n_samples, k)

    # Shuffle once; every position below refers to the shuffled arrays.
    order = np.arange(n_samples)
    np.random.shuffle(order)
    x_shuffled, y_shuffled = x[order], y[order]

    scores = []
    for valid_pos in folds:
        # `order` is a permutation of all positions, so the set difference is
        # simply every position that is not in the validation fold.
        train_pos = np.setdiff1d(order, valid_pos)
        clf.fit(x_shuffled[train_pos], y_shuffled[train_pos])
        predicted = clf.predict(x_shuffled[valid_pos])
        scores.append(accuracy_score(y_shuffled[valid_pos], predicted))
    return scores

In [11]:
# Train on the full training part of each outer fold and score its test fold
def cv_test(x, y, k):
    """Run the outer k-fold loop with an inner validation pass per fold.

    The outer folds come from ``split_idx`` and are contiguous ranges in the
    given row order; no shuffling is applied at this level. For every outer
    fold the remaining rows are first cross-validated with ``cv_valid``, then
    the module-level ``clf`` is refit on all of them and scored on the
    held-out test fold.

    Args:
        x: Feature array, indexed along axis 0 with integer index arrays.
        y: Label array aligned with ``x``.
        k: Number of folds, used for both the outer and the inner split.

    Returns:
        A list with one ``(test_accuracy, validation_accuracies)`` tuple per
        outer fold, where ``validation_accuracies`` is the list returned by
        ``cv_valid``.
    """
    n_samples = x.shape[0]
    outer_folds = split_idx(n_samples, k)
    all_rows = np.arange(n_samples)

    results = []
    for test_rows in outer_folds:
        train_rows = np.setdiff1d(all_rows, test_rows)
        x_train, y_train = x[train_rows], y[train_rows]
        # Inner validation only ever sees the training rows of this fold.
        inner_scores = cv_valid(x_train, y_train, k)
        clf.fit(x_train, y_train)
        test_score = accuracy_score(y[test_rows], clf.predict(x[test_rows]))
        results.append((test_score, inner_scores))
    return results
# Columns 0-12 are passed as features and column 13 as the label; `acc` holds
# one (test accuracy, validation accuracies) pair per outer fold.
acc = cv_test(wine_df.iloc[:, :13].values, wine_df.iloc[:, 13].values, k)

# 結果を表示

In [12]:
from tabulate import tabulate

In [13]:
# Rows of `valid_score` are inner validation folds, columns are outer folds.
valid_score = np.array([acc[fold][1] for fold in range(k)]).T
test_score = np.array([acc[fold][0] for fold in range(k)])
# Stack the validation rows, their column-wise mean and the test row ...
score = np.vstack([valid_score, valid_score.mean(axis=0), test_score])
# ... then append the mean of every row as a final column.
score = np.hstack([score, score.mean(axis=1).reshape(-1, 1)])

In [14]:
# Column labels: one per outer fold plus the row-average column.
headers = ['Name\\Score'] + ['CV_{0}'.format(i + 1) for i in range(k)] + ['CV_Ave']
# Row labels: one per inner validation fold, then the validation mean and test rows.
showindex = ['Valid_{0}'.format(i + 1) for i in range(k)] + ['Valid_Ave', 'Test']
print(tabulate(score, headers=headers, showindex=showindex))

Name\Score        CV_1      CV_2      CV_3      CV_4      CV_5    CV_Ave
------------  --------  --------  --------  --------  --------  --------
Valid_1       0.896552  0.862069  0.965517  1         0.896552  0.924138
Valid_2       0.896552  0.965517  0.965517  0.896552  0.862069  0.917241
Valid_3       0.964286  1         0.928571  0.862069  0.965517  0.944089
Valid_4       0.928571  0.892857  1         0.964286  1         0.957143
Valid_5       0.857143  0.928571  0.964286  0.892857  0.892857  0.907143
Valid_Ave     0.908621  0.929803  0.964778  0.923153  0.923399  0.929951
Test          0.916667  0.833333  0.805556  0.714286  0.914286  0.836825
