## データビリティコンソーシアム eラーニング講座：AIを体系的に学ぶ
## 知識情報学（第4回）識別(3) サポートベクトルマシン
- MIT License

## SVMによるBreast Cancerデータの識別とハイパーパラメータ自動最適化
- Optunaで最適パラメータを探索
 - https://optuna.org
 - 大まかな使い方は上記のCode Exampleを参照
 - （注）Google colabにはデフォルトで入っていないため，別途インストールが必要

In [1]:
# optunaのインストール（コメントアウト#を外して実行）
! pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [2]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import scale
import optuna

### Breast Cancerデータのロード

In [3]:
# Load the Breast Cancer dataset bundled with scikit-learn
bc = load_breast_cancer()

# Class labels, and features standardised with scale()
# NOTE(review): scale() is applied to all samples before the train/test split
# in a later cell, so the test rows contribute to the scaling statistics.
y = bc.target
X_std = scale(bc.data)

# Show the dataset description
print(bc.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

### 学習データとテストデータに分割

In [4]:
# Hold out 20% of the samples as test data; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_std,
    y,
    test_size=0.2,
    random_state=1,
)

### Optunaでハイパーパラメータを最適化
- Optunaの使い方は，下記Code Exampleのscikit-learnを参照
    - https://optuna.org/#code_examples
- Optuna サンプリング関数
    - https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html
- SVCのパラメータリストは下記を参照
    - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
    
### 【オンライン演習】 最適化する目的関数の定義
- rbfカーネルを使用し，学習データ上のクロスバリデーションで得られる各foldの検証データに対する平均Accuracyを返すように設計する（テストデータ `X_test` は最適化には使用しない）

In [5]:
def objective(trial):
    """Objective for Optuna: mean cross-validation score of an RBF-kernel SVM.

    Args:
        trial: Optuna trial used to sample the ``param_gamma`` hyperparameter.

    Returns:
        Mean of the 10 per-fold scores returned by ``cross_val_score`` on
        ``X_train`` / ``y_train``.
    """
    # Sample gamma log-uniformly from [1e-3, 1e3]. suggest_float(log=True)
    # replaces the deprecated suggest_loguniform, which triggered a warning
    # on every trial.
    param_gamma = trial.suggest_float('param_gamma', 1e-3, 1e3, log=True)

    # Build an RBF-kernel SVM with the sampled hyperparameter
    svm = SVC(kernel='rbf', gamma=param_gamma)

    # 10-fold cross-validation on the training split only
    score = cross_val_score(svm, X_train, y_train, cv=10)
    return score.mean()

In [6]:
# Create an Optuna study that maximises the objective, then run 30 trials
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=30)

[I 2024-10-28 07:17:26,608] A new study created in memory with name: no-name-fee07fa8-feb6-434b-bb4b-39fa62fcbb0f
  param_gamma = trial.suggest_loguniform('param_gamma', 1e-3, 1e3)
[I 2024-10-28 07:17:26,868] Trial 0 finished with value: 0.9560869565217391 and parameters: {'param_gamma': 0.002971854704700809}. Best is trial 0 with value: 0.9560869565217391.
  param_gamma = trial.suggest_loguniform('param_gamma', 1e-3, 1e3)
[I 2024-10-28 07:17:27,297] Trial 1 finished with value: 0.6263285024154589 and parameters: {'param_gamma': 1.7144195981317687}. Best is trial 0 with value: 0.9560869565217391.
  param_gamma = trial.suggest_loguniform('param_gamma', 1e-3, 1e3)
[I 2024-10-28 07:17:27,597] Trial 2 finished with value: 0.6263285024154589 and parameters: {'param_gamma': 606.7976558548384}. Best is trial 0 with value: 0.9560869565217391.
  param_gamma = trial.suggest_loguniform('param_gamma', 1e-3, 1e3)
[I 2024-10-28 07:17:27,686] Trial 3 finished with value: 0.9627053140096619 and parame

### 【提出課題】 Bestパラメータで再学習し，学習データおよびテストデータに対するAccuracyを算出

In [7]:
# Retrain with the best hyperparameter found by Optuna.
# Best values are available as study.best_params['<parameter name>'].
best_gamma = study.best_params['param_gamma']
svm_best = SVC(kernel='rbf', gamma=best_gamma)
svm_best.fit(X_train, y_train)

# Accuracy on the training split and on the held-out test split
trn_acc = svm_best.score(X_train, y_train)
tst_acc = svm_best.score(X_test, y_test)

print('Training data: %1.3f' % trn_acc)
print('Test data: %1.3f' % tst_acc)

Training data: 0.987
Test data: 0.965


### Best パラメータを表示

In [8]:
# Show the hyperparameters of the best trial in `study`
print(study.best_params)

{'param_gamma': 0.025050789400535643}


### 【提出課題2（発展）】 objective関数にrbfカーネル以外のカーネル関数も探索に追加し，カーネル関数も含めて最適化するように拡張しなさい．

In [9]:
def objective_new(trial):
    """Objective for Optuna that also searches over the SVM kernel.

    Samples the kernel ('rbf' or 'poly') and then the hyperparameters that
    belong to the chosen kernel.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the 10 per-fold scores returned by ``cross_val_score`` on
        ``X_train`` / ``y_train``.
    """
    param_kernel = trial.suggest_categorical('param_kernel', ['rbf', 'poly'])
    if param_kernel == 'rbf':
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        param_gamma_rbf = trial.suggest_float(
            'param_gamma_rbf', 1e-3, 1e3, log=True)
        svm = SVC(kernel='rbf', gamma=param_gamma_rbf)
    else:
        param_gamma_poly = trial.suggest_float(
            'param_gamma_poly', 1e-3, 1e3, log=True)
        param_degree = trial.suggest_int('param_degree', 1, 4)
        # suggest_int takes integer bounds; the original passed the float 1e3
        param_coef0 = trial.suggest_int('param_coef0', 0, 1000)
        svm = SVC(kernel='poly', gamma=param_gamma_poly,
                  degree=param_degree, coef0=param_coef0)

    # 10-fold cross-validation on the training split only
    score = cross_val_score(svm, X_train, y_train, cv=10)
    return score.mean()

In [12]:
# Separate study for the kernel-aware objective; run 100 trials
study_new = optuna.create_study(direction='maximize')
study_new.optimize(func=objective_new, n_trials=100)

[I 2024-10-28 07:27:10,089] A new study created in memory with name: no-name-a8cfef2c-2292-40a7-9673-7b1cc2432938
  param_gamma_rbf = trial.suggest_loguniform('param_gamma_rbf', 1e-3, 1e3)
[I 2024-10-28 07:27:10,379] Trial 0 finished with value: 0.8018840579710146 and parameters: {'param_kernel': 'rbf', 'param_gamma_rbf': 0.4943315354500663}. Best is trial 0 with value: 0.8018840579710146.
  param_gamma_poly = trial.suggest_loguniform('param_gamma_poly', 1e-3, 1e3)
[I 2024-10-28 07:27:10,697] Trial 1 finished with value: 0.9539613526570049 and parameters: {'param_kernel': 'poly', 'param_gamma_poly': 0.002497759400162177, 'param_degree': 4, 'param_coef0': 113}. Best is trial 1 with value: 0.9539613526570049.
  param_gamma_poly = trial.suggest_loguniform('param_gamma_poly', 1e-3, 1e3)
[I 2024-10-28 07:27:10,790] Trial 2 finished with value: 0.9627053140096619 and parameters: {'param_kernel': 'poly', 'param_gamma_poly': 0.42404018490938683, 'param_degree': 4, 'param_coef0': 177}. Best is 

In [19]:
# Rebuild the SVM from the best trial of study_new; which hyperparameters
# exist depends on the kernel that trial selected.
best_new = study_new.best_params
if best_new['param_kernel'] != 'rbf':
    svm_best = SVC(
        kernel='poly',
        gamma=best_new['param_gamma_poly'],
        degree=best_new['param_degree'],
        coef0=best_new['param_coef0'],
    )
else:
    svm_best = SVC(kernel='rbf', gamma=best_new['param_gamma_rbf'])

svm_best.fit(X_train, y_train)

# Accuracy on the training split and on the held-out test split
trn_acc = svm_best.score(X_train, y_train)
tst_acc = svm_best.score(X_test, y_test)

print('Training data: %1.3f' % trn_acc)
print('Test data: %1.3f' % tst_acc)
print('Best params:')
print(study_new.best_params)

Training data: 0.991
Test data: 0.965
Best params:
{'param_kernel': 'poly', 'param_gamma_poly': 0.001539779480434535, 'param_degree': 2, 'param_coef0': 560}
