# データサイエンティスト養成読本 演習ノート
演習をJupyter Notebook上で実施したノート。
※個人的な演習です。

事前に、[Bank Marketing](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing)データセットをダウンロードし、bankフォルダ内（`bank-full.csv`と同じ場所）にこのノートをコピーして実施。

### Pythonを用いた実行例

In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

In [18]:
# Load the full Bank Marketing data; the file is semicolon-delimited and is
# read via a relative path, i.e. from the notebook's working directory.
bank = pd.read_csv("bank-full.csv", sep=";")
# Preview the first 10 rows.
bank.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [6]:
def make_feature(x):
    """Build a model-ready feature matrix from the raw bank DataFrame.

    The numeric columns are standardized to zero mean and unit standard
    deviation (pandas' sample standard deviation), then the remaining
    non-numeric columns are expanded into dummy columns by
    ``pd.get_dummies``.

    Args:
        x: DataFrame containing at least the numeric columns listed in
            ``cn_num``. It is left unmodified.

    Returns:
        A new DataFrame with the scaled numeric columns and the
        dummy-encoded categorical columns.
    """
    # Numeric columns to standardize.
    cn_num = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
    # Work on a copy so the caller's frame keeps its raw values; assigning the
    # scaled columns directly into the argument would overwrite them in place.
    x = x.copy()
    x_num = x[cn_num]
    x[cn_num] = (x_num - x_num.mean()) / x_num.std()
    # Convert the categorical columns to dummy variables.
    return pd.get_dummies(x)

In [7]:
# Split the frame into model inputs and target: every column except 'y' goes
# through make_feature, and the 'y' column is kept as the label.
# axis is passed by keyword because the positional form drop('y', 1) is no
# longer accepted by pandas 2.0+; the keyword form works on older versions too.
features, label = make_feature(bank.drop('y', axis=1)), bank.y

In [8]:
# Create the training and test data (30% of the rows are held out for testing).
# NOTE(review): this RandomState instance is also handed to the random forest
# cells further down; because the generator is stateful, exact reproducibility
# presumably depends on running the cells in order — confirm.
random_state = np.random.RandomState(123)
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=.3, random_state=random_state)

In [9]:
# Inspect the features (first 3 rows).
features[:3]

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,1.606947,0.256416,-1.298462,0.011016,-0.569344,-0.411449,-0.251938,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.288526,-0.43789,-1.298462,-0.416122,-0.569344,-0.411449,-0.251938,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.747376,-0.446758,-1.298462,-0.707353,-0.569344,-0.411449,-0.251938,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Inspect the ground-truth labels (first 3 entries).
label[:3]

0    no
1    no
2    no
Name: y, dtype: object

#### RBFカーネルのSVMによる予測

In [11]:
from sklearn import svm
from sklearn import metrics
# NOTE(review): classification_report is imported by name here, but the cells
# shown in this notebook only call it as metrics.classification_report.
from sklearn.metrics import classification_report
# Build the RBF-kernel prediction model: SVC() is created with its default
# settings (the repr echoed after this cell reports kernel='rbf').
clf = svm.SVC()
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Predict class labels for the test set.
pred = clf.predict(X_test)
# Print the per-class precision, recall, F1 score and support.
print(metrics.classification_report(y_test, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.91      0.98      0.94     11998
        yes       0.65      0.28      0.39      1566

avg / total       0.88      0.90      0.88     13564



#### ランダムフォレストによる予測

In [14]:
from sklearn import ensemble
# Build the random forest prediction model with 500 trees, seeded with the
# same random_state object that was used for the train/test split.
clf2 = ensemble.RandomForestClassifier(n_estimators=500, random_state=random_state)
clf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False,
            random_state=<mtrand.RandomState object at 0x11a9723c0>,
            verbose=0, warm_start=False)

In [15]:
# Predict class labels for the test set (rebinds pred from the SVM cell).
pred = clf2.predict(X_test)
# Print the per-class precision, recall, F1 score and support.
print(metrics.classification_report(y_test, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.97      0.95     11998
        yes       0.67      0.39      0.49      1566

avg / total       0.89      0.91      0.90     13564



In [17]:
# NOTE(review): sklearn.cross_validation is the legacy module; newer
# scikit-learn releases provide this under sklearn.model_selection with a
# different StratifiedKFold signature — confirm before upgrading.
from sklearn import cross_validation as cv
from sklearn import preprocessing
# Stratified k-fold split of the training labels into 10 folds.
skf = cv.StratifiedKFold(y_train, 10)
# Print the row indices of the training and test portions of each fold.
for train, test in skf:
    print("%s %s" % (train, test))

[ 3084  3098  3120 ..., 31644 31645 31646] [   0    1    2 ..., 3177 3178 3180]
[    0     1     2 ..., 31644 31645 31646] [3084 3098 3120 ..., 6367 6368 6369]
[    0     1     2 ..., 31644 31645 31646] [6071 6075 6076 ..., 9539 9540 9541]
[    0     1     2 ..., 31644 31645 31646] [ 9168  9173  9187 ..., 12759 12760 12761]
[    0     1     2 ..., 31644 31645 31646] [11997 11998 12004 ..., 15895 15896 15897]
[    0     1     2 ..., 31644 31645 31646] [15222 15231 15238 ..., 19059 19062 19063]
[    0     1     2 ..., 31644 31645 31646] [18552 18585 18587 ..., 22204 22205 22206]
[    0     1     2 ..., 31644 31645 31646] [21659 21662 21671 ..., 25345 25346 25347]
[    0     1     2 ..., 31644 31645 31646] [25106 25138 25143 ..., 28502 28503 28504]
[    0     1     2 ..., 28502 28503 28504] [28269 28282 28288 ..., 31644 31645 31646]


In [19]:
# RBF-kernel SVM (default SVC) used for the cross-validation cells below.
clf3 = svm.SVC()
# Binarize the training labels; ravel() flattens the result to a 1-D array.
lb = preprocessing.LabelBinarizer()
y_train_bin = lb.fit_transform(y_train).ravel()
# Show the first 10 encoded labels.
y_train_bin[:10]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [20]:
# Evaluation metric via cross-validation: F1 score for each of the skf folds,
# computed against the binarized labels.
cv.cross_val_score(clf3, X_train, y_train_bin, cv=skf, scoring='f1')

array([ 0.36329588,  0.38420108,  0.36111111,  0.38185255,  0.40740741,
        0.38931298,  0.39344262,  0.41666667,  0.38888889,  0.38356164])

In [21]:
# (The above felt like it took close to 10x longer to train.)
# Predictions obtained via cross-validation over the same skf folds.
pred = cv.cross_val_predict(clf3, X_train, y_train, cv=skf)
# Print the per-class precision, recall, F1 score and support on the
# training set.
print(metrics.classification_report(y_train, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.91      0.98      0.94     27924
        yes       0.65      0.28      0.39      3723

avg / total       0.88      0.90      0.88     31647



#### Grid Searchでハイパーパラメーターを探索

In [23]:
# NOTE(review): sklearn.grid_search is the legacy module; newer scikit-learn
# releases provide GridSearchCV under sklearn.model_selection — confirm before
# upgrading.
from sklearn import grid_search as gs
# Search range: the C and gamma values to be combined.
param_grid = [
    {'C': [0.5, 1], 'gamma': [0.05, 0.1]}
]
# Run 10-fold cross-validation for each hyperparameter combination.
svc = svm.SVC()
clf4 = gs.GridSearchCV(svc, param_grid, cv=10)
clf4.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.5, 1], 'gamma': [0.05, 0.1]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [25]:
# (The fit above takes considerable time: about 30 minutes on a 4-core MBP.)
# Evaluate the predictions of the grid-searched SVM on the test set.
pred = clf4.predict(X_test)
# Print the per-class precision, recall, F1 score and support.
print(metrics.classification_report(y_test, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.98      0.95     11998
        yes       0.66      0.35      0.46      1566

avg / total       0.89      0.90      0.89     13564



#### オリジナルの演習を超えて: ランダムフォレストをグリッドサーチする場合

[GridSearchCVでランダムフォレストの予測器作成に関する質問](https://stackoverflow.com/questions/30102973/how-to-get-best-estimator-on-gridsearchcv-random-forest-classifier-scikit)を参考に実施。

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search range for the random forest (rebinds param_grid from the SVM search).
# NOTE(review): the best_params_ output recorded at the end of this notebook
# reports n_estimators=700, which is not in this grid, and this cell is marked
# In [None] — presumably the grid was edited after the last run; confirm.
param_grid = { 
    'n_estimators': [200, 1000],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Modeled on the forest built earlier in this notebook:
# clf2 = ensemble.RandomForestClassifier(n_estimators=500, random_state=random_state)
clf2_1 = ensemble.RandomForestClassifier(random_state=random_state)
# 5-fold cross-validated grid search over param_grid.
clf2_2 = GridSearchCV(estimator=clf2_1, param_grid=param_grid, cv=5)
clf2_2.fit(X_train, y_train)

In [29]:
# Evaluate the predictions of the grid-searched random forest on the test set.
pred = clf2_2.predict(X_test)
# Print the per-class precision, recall, F1 score and support.
print(metrics.classification_report(y_test, pred, target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.97      0.95     11998
        yes       0.66      0.38      0.49      1566

avg / total       0.89      0.91      0.89     13564



In [30]:
# Show the best parameters found by the grid search.
clf2_2.best_params_

{'max_features': 'auto', 'n_estimators': 700}