githubからデータを取得

In [None]:
# Clone the Apress repository, then symlink its Chapter_4_Code/Code/Data directory
# as ./Data so the cells below can read files through paths like "Data/Diabetes.csv".
!git clone https://github.com/Apress/mastering-ml-w-python-in-six-steps.git
!ln -s mastering-ml-w-python-in-six-steps/Chapter_4_Code/Code/Data Data

Warningの非表示

In [None]:
import warnings
# Ignore every warning raised from here on (this also hides deprecation notices
# coming from the libraries used below).
warnings.filterwarnings('ignore')

4-1.  データの読み込みとクラス分布の確認

In [None]:
import pandas as pd
import matplotlib.pyplot as plt  # pyplot interface; replaces the discouraged pylab namespace
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Load the data and print the relative frequency of each value in the 'class' column
df = pd.read_csv("Data/Diabetes.csv")
print (df['class'].value_counts(normalize=True))

 4-2. ロジスティック回帰モデルの構築と性能の評価

In [None]:
X = df.iloc[:,:8]     # independent variables (the first eight columns)
y = df['class']     # dependent variable

# Split into training and test sets (30% held out) to evaluate the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Instantiate the logistic regression model and fit it
model = LogisticRegression(max_iter=150)
model = model.fit(X_train, y_train)

# Predict class labels for the training set.
# predict() maps probabilities above 0.5 to class 1 and the rest to class 0.
y_pred = model.predict(X_train)

# Generate class probabilities
# Note that each row of probs has two elements:
# the first element is the probability of the negative class,
# the second element is the probability of the positive class
probs = model.predict_proba(X_train)
y_pred_prob = probs[:, 1]

# Evaluation metric: accuracy on the training set
print ("Accuracy: ", metrics.accuracy_score(y_train, y_pred))


 4-3.  最適なカットオフポイントを見つける

In [None]:
# 偽陽性、真陽性率の抽出
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_pred_prob)
roc_auc = metrics.auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)
i = np.arange(len(tpr)) # Dataframeのインデックス
roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series
(tpr, index = i),'1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series
(tpr - (1-fpr), index = i),'thresholds' : pd.Series(thresholds, 
index = i)})
roc.iloc[(roc.tf-0).abs().argsort()[:1]]

# tprと1-fprプロットして比較
fig, ax = plt.subplots()
plt.plot(roc['tpr'], label='tpr')
plt.plot(roc['1-fpr'], color = 'red', label='1-fpr')
plt.legend(loc='best')
plt.xlabel('1-False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()


 4-4. 最適な確率カットオフを見つけるための関数例


In [None]:
def Find_Optimal_Cutoff(target, predicted):
    """
    Find the probability cut-off at which the true positive rate and
    (1 - false positive rate) of a classification model are closest.

    Parameters
    ----------
    target : true labels, forwarded to metrics.roc_curve as the first argument
    predicted : predicted scores, forwarded to metrics.roc_curve as the second argument

    Returns
    -------
    A list holding the selected cut-off value.
    """
    false_pos, true_pos, cutoffs = metrics.roc_curve(target, predicted)
    # Signed gap between tpr and (1 - fpr) for every candidate cut-off
    gap = pd.Series(true_pos - (1 - false_pos))
    # Sort positions by absolute gap and keep only the first one
    nearest = gap.abs().argsort()[:1]
    return list(pd.Series(cutoffs).iloc[nearest])

# Find the optimal probability threshold
# Note: probs[:, 1] holds the probability of the positive label
threshold = Find_Optimal_Cutoff(y_train, probs[:, 1])
print ("Optimal Probability Threshold: ", threshold)

# Apply the threshold to the predicted probabilities
# (threshold is the list returned by Find_Optimal_Cutoff, compared element-wise by NumPy)
y_pred_optimal = np.where(y_pred_prob >= threshold, 1, 0)

# Compare the default predict() labels with the optimal-cut-off labels
print ("\nNormal - Accuracy: ", metrics.accuracy_score(y_train, y_pred))
print ("Optimal Cutoff - Accuracy: ", metrics.accuracy_score(y_train, y_pred_optimal))
print ("\nNormal - Confusion Matrix: \n", metrics.confusion_matrix(y_train, y_pred))
print ("Optimal - Cutoff Confusion Matrix: \n", metrics.confusion_matrix(y_train, y_pred_optimal))


4-5. レアケースと不均衡データセットの取り扱い

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

# Generate a data set with only two features to keep things simple;
# the class weights 0.9 / 0.1 make it imbalanced.
X, y = make_classification(
    n_samples=5000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    weights=[0.9, 0.1],
    random_state=2017,
)
for caption, class_value in (("Positive class: ", 1), ("Negative class: ", 0)):
    print (caption, y.tolist().count(class_value))


In [None]:
pip install imbalanced-learn

In [None]:
#  Imbalanced-learnをpip install しておく
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Random under-sampling (seeded so the resampled sets are reproducible)
rus = RandomUnderSampler(random_state=2017)
X_RUS, y_RUS = rus.fit_resample(X, y)

# Random over-sampling
ros = RandomOverSampler(random_state=2017)
X_ROS, y_ROS = ros.fit_resample(X, y)

# SMOTE
sm = SMOTE(random_state=2017)
X_SMOTE, y_SMOTE = sm.fit_resample(X, y)


def _plot_class_scatter(position, name, features, labels):
    """Draw panel `position` of a 2x2 grid: class 0 as blue circles, class 1 as red plus signs."""
    plt.subplot(2, 2, position)
    plt.scatter(features[labels==0,0], features[labels==0,1], marker='o', color='blue')
    plt.scatter(features[labels==1,0], features[labels==1,1], marker='+', color='red')
    plt.xlabel('x1')
    plt.ylabel('x2')  # every panel plots feature 0 against feature 1, so all share 'x2'
    counts = labels.tolist()
    plt.title('%s: 1=%s and 0=%s' % (name, counts.count(1), counts.count(0)))


# Compare the original data with each resampling result
plt.figure(figsize=(10, 6))
_plot_class_scatter(1, 'Original', X, y)
_plot_class_scatter(2, 'Random Under-sampling', X_RUS, y_RUS)
_plot_class_scatter(3, 'Random over-sampling', X_ROS, y_ROS)
_plot_class_scatter(4, 'SMOTE', X_SMOTE, y_SMOTE)
plt.tight_layout()
plt.show()


4-6. 様々なリサンプリング手法を用いたモデルの構築とパフォーマンスの評価

In [None]:
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
X_RUS_train, X_RUS_test, y_RUS_train, y_RUS_test = train_test_split(X_RUS, y_RUS, test_size=0.3, random_state=2017)
X_ROS_train, X_ROS_test, y_ROS_train, y_ROS_test = train_test_split(X_ROS, y_ROS, test_size=0.3, random_state=2017)
X_SMOTE_train, X_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE, y_SMOTE, test_size=0.3, random_state=2017)

# Build one decision tree PER resampled data set.
# fit() returns the estimator it was called on, so re-fitting a single shared
# instance three times would leave all three names bound to the same object,
# trained only on the last (SMOTE) data.
clf_rus = tree.DecisionTreeClassifier(random_state=2017).fit(X_RUS_train, y_RUS_train)
clf_ros = tree.DecisionTreeClassifier(random_state=2017).fit(X_ROS_train, y_ROS_train)
clf_smote = tree.DecisionTreeClassifier(random_state=2017).fit(X_SMOTE_train, y_SMOTE_train)
clf = clf_smote  # keep `clf` bound to the SMOTE-trained tree, as it was before

# Evaluate each model on the split that belongs to it
print ("\nRUS - Train AUC : ",metrics.roc_auc_score(y_RUS_train, clf_rus.predict(X_RUS_train)))
print ("RUS - Test AUC : ",metrics.roc_auc_score(y_RUS_test, clf_rus.predict(X_RUS_test)))
print ("ROS - Train AUC : ",metrics.roc_auc_score(y_ROS_train, clf_ros.predict(X_ROS_train)))
print ("ROS - Test AUC : ",metrics.roc_auc_score(y_ROS_test, clf_ros.predict(X_ROS_test)))
print ("\nSMOTE - Train AUC : ",metrics.roc_auc_score(y_SMOTE_train, clf_smote.predict(X_SMOTE_train)))
print ("SMOTE - Test AUC : ",metrics.roc_auc_score(y_SMOTE_test, clf_smote.predict(X_SMOTE_test)))


4-7 k分割交差検証

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Load the data
df = pd.read_csv("Data/Diabetes.csv")
X = df.iloc[:,:8].values     # independent variables
y = df['class'].values     # dependent variable

# Standardize the features
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

# Split into training and test sets to evaluate the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2017)

# Build a logistic regression model
# (tree.DecisionTreeClassifier(random_state=2017) can be swapped in instead)
clf = LogisticRegression(random_state=2017)
clf = clf.fit(X_train, y_train)

# Evaluate the model with 5-fold cross-validation.
# scoring='accuracy', so the printed labels say "Accuracy" rather than "AUC".
train_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
test_scores = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=5)
print ("Train Fold Accuracy Scores: ", train_scores)
print ("Train CV Accuracy Score: ", train_scores.mean())
print ("\nTest Fold Accuracy Scores: ", test_scores)
print ("Test CV Accuracy Score: ", test_scores.mean())


4-8.  層化K-分割交差検証

In [None]:
from sklearn import model_selection

kfold = model_selection.StratifiedKFold(n_splits=5)
train_scores, test_scores = [], []
# Refit the classifier on every stratified fold and score it on both the part
# it was fitted on and the part that was held out.
for k, (train, test) in enumerate(kfold.split(X_train, y_train), start=1):
    clf.fit(X_train[train], y_train[train])
    train_score = clf.score(X_train[train], y_train[train])
    # score on the held-out part of the fold
    test_score = clf.score(X_train[test], y_train[test])
    train_scores.append(train_score)
    test_scores.append(test_score)
    class_dist = np.bincount(y_train[train])
    print('Fold: %s, Class dist.: %s, Train Acc: %.3f, Test Acc: %.3f'% (k, class_dist, train_score, test_score))
print('\nTrain CV accuracy: %.3f' % (np.mean(train_scores)))
print('Test CV accuracy: %.3f' % (np.mean(test_scores)))


4-9 層化K-分割交差検証のためのROC曲線のプロット

In [None]:
from sklearn.metrics import roc_curve, auc
from itertools import cycle
from numpy import interp as np_interp

# Stratified 5-fold split; this cell iterates over the full X, y arrays
kfold = model_selection.StratifiedKFold(n_splits=5)

# Running sum of the per-fold TPR, evaluated on a fixed grid of 100 FPR values
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

# One colour per fold (cycled) and a shared line width
colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 2

# Fold counter used in the legend labels (starts at 0)
i = 0

for (train, test), color in zip(kfold.split(X, y), colors):
    probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
    # Compute the ROC curve for this fold
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    # Interpolate this fold's TPR onto the common FPR grid and accumulate it
    mean_tpr += np_interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    i += 1

# Diagonal reference line
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
# Turn the accumulated sum into a mean over the folds and pin the last point to 1.0
mean_tpr /= kfold.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate') 
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()


 4-10. 決定木とバギングの比較

In [None]:
# 決定木による分類
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the data
df = pd.read_csv("Data/Diabetes.csv")
X = df.iloc[:,:8].values     # independent variables
y = df['class'].values     # dependent variable

# Standardize
X = StandardScaler().fit_transform(X)

# Split into training and test sets to evaluate the models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019)
kfold = model_selection.StratifiedKFold(n_splits=5)
num_trees = 100

# Single decision tree, evaluated with 5-fold cross-validation.
# The "Train" line is the mean cross-validation accuracy on the training split.
clf_DT = DecisionTreeClassifier(random_state=2019).fit(X_train,y_train)
results = model_selection.cross_val_score(clf_DT, X_train,y_train, cv=kfold)
print ("Decision Tree (stand alone) - Train : ", results.mean())
print ("Decision Tree (stand alone) - Test : ", metrics.accuracy_score(clf_DT.predict(X_test), y_test))

# Bagging: build 100 decision tree models and predict by averaging / majority vote.
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the first
# positional slot works with both.
clf_DT_Bag = BaggingClassifier(clf_DT, n_estimators=num_trees, random_state=2019).fit(X_train,y_train)
results = model_selection.cross_val_score(clf_DT_Bag, X_train, y_train, cv=kfold)
print ("\nDecision Tree (Bagging) - Train : ", results.mean())
print ("Decision Tree (Bagging) - Test : ", metrics.accuracy_score(clf_DT_Bag.predict(X_test), y_test))


4-11.  決定木における特徴量の重要度

In [None]:
# Feature importances reported by the fitted clf_DT
feature_importance = clf_DT.feature_importances_

# Express each importance relative to the largest one (largest = 100)
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Ascending order, so the most important feature ends up at the top of the bar chart
sorted_idx = np.argsort(feature_importance)
# Bar positions: 0.5, 1.5, 2.5, ...
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Label the bars with the matching column names of df
plt.yticks(pos, df.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()


 4-12. ランダムフォレスト

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. No random_state is passed, so results vary between runs.
num_trees = 100
kfold = model_selection.StratifiedKFold(n_splits=5)
clf_RF = RandomForestClassifier(n_estimators=num_trees).fit(X_train, y_train)
# "Train" is the mean 5-fold cross-validation score on the training split;
# "Test" is the accuracy of the fitted model on the held-out test split.
results = model_selection.cross_val_score(clf_RF, X_train, y_train, cv=kfold)
print ("Random Forest (Bagging) - Train : ", results.mean())
print ("Random Forest (Bagging) - Test : ", metrics.accuracy_score(clf_RF.predict(X_test), y_test))


4-13. エクストラツリー

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees classifier with 100 trees. No random_state is passed, so results vary between runs.
num_trees = 100
kfold = model_selection.StratifiedKFold(n_splits=5)
clf_ET = ExtraTreesClassifier(n_estimators=num_trees).fit(X_train, y_train)
# "Train" is the mean 5-fold cross-validation score on the training split;
# "Test" is the accuracy of the fitted model on the held-out test split.
results = model_selection.cross_val_score(clf_ET, X_train, y_train, cv=kfold)
print ("ExtraTree - Train : ", results.mean())
print ("ExtraTree - Test : ", metrics.accuracy_score(clf_ET.predict(X_test), y_test))


 4-14. 決定境界のプロット

In [None]:
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap

# PCA: reduce the features to two components so decision regions can be drawn in 2-D
X = PCA(n_components=2).fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017)

kfold = model_selection.StratifiedKFold(n_splits=5)
num_trees = 100

seed = 2019 

# Decision tree
clf_DT = DecisionTreeClassifier(random_state=seed).fit(X_train,y_train)
results = model_selection.cross_val_score(clf_DT, X_train,y_train, cv=kfold)
print("Decision Tree (stand alone) - Train : ", results.mean())
print("Decision Tree (stand alone) - Test : ", metrics.accuracy_score(clf_DT.predict(X_test), y_test))

# Bagging. The base tree is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases.
clf_DT_Bag = BaggingClassifier(clf_DT, n_estimators=num_trees, random_state=seed).fit(X_train,y_train)
results = model_selection.cross_val_score(clf_DT_Bag, X_train, y_train, cv=kfold)
print("Decision Tree (Bagging) - Train : ", results.mean())
print("Decision Tree (Bagging) - Test : ", metrics.accuracy_score(clf_DT_Bag.predict(X_test), y_test))

# Random forest
clf_RF = RandomForestClassifier(n_estimators=num_trees).fit(X_train, y_train)
results = model_selection.cross_val_score(clf_RF, X_train, y_train, cv=kfold)
print("Random Forest - Train : ", results.mean())
print("Random Forest  - Test : ", metrics.accuracy_score(clf_RF.predict(X_test), y_test))

# Extra trees
clf_ET = ExtraTreesClassifier(n_estimators=num_trees).fit(X_train, y_train)
results = model_selection.cross_val_score(clf_ET, X_train, y_train, cv=kfold)
print("ExtraTree - Train : ", results.mean())
print("ExtraTree - Test : ", metrics.accuracy_score(clf_ET.predict(X_test), y_test))

def plot_decision_regions(X, y, classifier):
    """
    Draw the regions predicted by `classifier` over a 2-D feature plane and
    overlay the samples, one marker/colour per class.

    Parameters
    ----------
    X : 2-D array; columns 0 and 1 are used as the horizontal and vertical axes
    y : class label for each row of X
    classifier : object whose predict() accepts an array with two columns

    Draws with pyplot on the current axes and returns nothing.
    """
    h = .02  # mesh step size
    # Markers and colours, one per class (five entries each, indexed by class position)
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # Grid covering the data range plus a margin of 1 on every side
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h))
    # Predict a class for every grid point, then reshape back to the grid
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # Overlay the samples of each class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl)

def _draw_region_panel(position, fitted_model, title):
    """Draw panel `position` of a 2x2 grid with the decision regions of `fitted_model`."""
    plt.subplot(2, 2, position)
    plot_decision_regions(X, y, fitted_model)
    plt.title(title)
    plt.xlabel('PCA1')
    plt.ylabel('PCA2')
    plt.legend(loc='best')  # every panel gets a legend, including the first


# Plot the decision boundaries of the four models side by side
plt.figure(figsize=(10,6))
_draw_region_panel(1, clf_DT, 'Decision Tree (Stand alone)')
_draw_region_panel(2, clf_DT_Bag, 'Decision Tree (Bagging - 100 trees)')
_draw_region_panel(3, clf_RF, 'RandomForest Tree (100 trees)')
_draw_region_panel(4, clf_ET, 'Extream Random Tree (100 trees)')
plt.tight_layout()


4-15. 決定木とAdaBoost

In [None]:
# 決定木による分類
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the data
df = pd.read_csv("Data/Diabetes.csv")

# Use weak features to build the decision tree
X = df[['age','serum_insulin']]     # independent variables
y = df['class'].values              # dependent variable

# Standardize
X = StandardScaler().fit_transform(X)

# Split into training and test sets to evaluate the models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019)
kfold = model_selection.StratifiedKFold(n_splits=5)
num_trees = 100

# Decision tree with 5-fold cross-validation.
# max_depth is limited to 1 so the leaves stay impure (a deliberately weak learner).
clf_DT = DecisionTreeClassifier(max_depth=1, random_state=2019).fit(X_train,y_train)
results = model_selection.cross_val_score(clf_DT, X_train,y_train, cv=kfold.split(X_train, y_train))
print("Decision Tree (stand alone) - CV Train : %.2f" % results.mean())
# This line scores the TRAINING split, so it is labelled "Train"
print("Decision Tree (stand alone) - Train : %.2f" % metrics.accuracy_score(clf_DT.predict(X_train), y_train))
print("Decision Tree (stand alone) - Test : %.2f" % metrics.accuracy_score(clf_DT.predict(X_test), y_test))

# Adaptive boosting with 100 rounds. The base tree is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases; the first positional slot works with both.
clf_DT_Boost = AdaBoostClassifier(clf_DT, n_estimators=num_trees, learning_rate=0.1, random_state=2019).fit(X_train,y_train)
results = model_selection.cross_val_score(clf_DT_Boost, X_train, y_train, cv=kfold.split(X_train, y_train))
print("\nDecision Tree (AdaBoosting) - CV Train : %.2f" % results.mean())
print("Decision Tree (AdaBoosting) - Train : %.2f" % metrics.accuracy_score(clf_DT_Boost.predict(X_train), y_train))
print("Decision Tree (AdaBoosting) - Test : %.2f" % metrics.accuracy_score(clf_DT_Boost.predict(X_test), y_test))


4-16. 勾配ブースティング

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with num_trees boosting stages.
# X_train / X_test / y_train / y_test, num_trees and kfold are not defined in this
# cell; it uses whatever an earlier cell left bound to those names.
clf_GBT = GradientBoostingClassifier(n_estimators=num_trees, learning_rate=0.1, random_state=2019).fit(X_train, y_train)
results = model_selection.cross_val_score(clf_GBT, X_train, y_train, cv=kfold)
print ("\nGradient Boosting - CV Train : %.2f" % results.mean())
print ("Gradient Boosting - Train : %.2f" % metrics.accuracy_score(clf_GBT.predict(X_train), y_train))
print ("Gradient Boosting - Test : %.2f" % metrics.accuracy_score(clf_GBT.predict(X_test), y_test))


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
df= pd.read_csv('Data/digit.csv')
X = df.iloc[:,1:17].values
y = df['lettr'].values

# Split into training and test sets to evaluate the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019)
kfold = model_selection.StratifiedKFold(n_splits=5)
num_trees = 10
clf_GBT = GradientBoostingClassifier(n_estimators=num_trees, learning_rate=0.1, random_state=2019).fit(X_train, y_train)
results = model_selection.cross_val_score(clf_GBT, X_train, y_train, cv=kfold)
# Report the cross-validation score as well, so the computation above is not wasted
print ("\nGradient Boosting - CV Train : %.2f" % results.mean())
print ("Gradient Boosting - Train : %.2f" % metrics.accuracy_score(clf_GBT.predict(X_train), y_train))
print ("Gradient Boosting - Test : %.2f" % metrics.accuracy_score(clf_GBT.predict(X_test), y_test))

# Predict a sample of the letter 'T' and look at how the predicted
# probabilities change from one boosting iteration to the next
X_valid= (2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8)
print ("Predicted letter: ", clf_GBT.predict([X_valid]))

# staged_predict_proba yields the predicted probabilities after each boosting iteration
stage_preds = list(clf_GBT.staged_predict_proba([X_valid]))
final_preds = clf_GBT.predict_proba([X_valid])

# Tick labels come from the fitted model so they line up with the probability
# columns; the bar positions are derived from their number instead of a fixed 26.
label = clf_GBT.classes_
x = range(1, len(label) + 1)


def _plot_boosting_round(position, boosting_round, title):
    """Bar chart of the class probabilities after `boosting_round` iterations (1-based)."""
    plt.subplot(1, 3, position)
    # stage_preds is 0-indexed: the result of round r is stored at index r - 1
    plt.bar(x, stage_preds[boosting_round - 1][0], align='center')
    plt.xticks(x, label)
    plt.xlabel('Label')
    plt.ylabel('Prediction Probability')
    plt.title(title)
    plt.autoscale()


# Plot
plt.figure(figsize=(10,3))
_plot_boosting_round(1, 1, 'Round One')
_plot_boosting_round(2, 5, 'Round Five')
_plot_boosting_round(3, 10, 'Round Ten')
plt.tight_layout()
plt.show()


 4-17. sklearnのラッパーを使ったxgboost

In [None]:
# xgboostをpip install しておく
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

# Load the data
df = pd.read_csv("Data/Diabetes.csv")
# NOTE(review): predictors and target are not referenced again in this cell
predictors = ['age','serum_insulin']
target = 'class'

# Generic preprocessing: label-encode object columns and fill missing values
from sklearn import preprocessing
for f in df.columns:
    if df[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df[f].values))
        df[f] = lbl.transform(list(df[f].values))

# Missing values are replaced by the sentinel -999
df.fillna((-999), inplace=True)

# Use a few weak features to build the trees
X = df[['age','serum_insulin']] # independent variables
y = df['class'].values          # dependent variable

# Standardize
X = StandardScaler().fit_transform(X)

# Train/test split on the weak features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017)
num_rounds = 100
kfold = model_selection.StratifiedKFold(n_splits=5)
clf_XGB = XGBClassifier(n_estimators = num_rounds, objective= 'binary:logistic',seed=2017)

# Use early_stopping_rounds to stop boosting when the eval_set score stops improving
# NOTE(review): eval_set is the test split, so the test data influences when
# training stops — confirm this is intended.
# NOTE(review): whether fit() accepts early_stopping_rounds depends on the
# installed xgboost version — TODO confirm.
clf_XGB.fit(X_train,y_train, early_stopping_rounds=20, eval_set=[(X_test, y_test)], verbose=False)
results = model_selection.cross_val_score(clf_XGB, X_train,y_train, cv=kfold)
print ("\nxgBoost - CV Train : %.2f" % results.mean())
print ("xgBoost - Train : %.2f" % metrics.accuracy_score(clf_XGB.predict
(X_train), y_train))
print ("xgBoost - Test : %.2f" % metrics.accuracy_score(clf_XGB.predict
(X_test), y_test))


 4-18. Pythonのネイティブパッケージを使ったxgboost

In [None]:
xgtrain = xgb.DMatrix(X_train, label=y_train, missing=-999)
xgtest = xgb.DMatrix(X_test, label=y_test, missing=-999)

# Set the xgboost parameters
param = {'max_depth': 3,  # maximum depth of each tree
         'objective': 'binary:logistic'}
# Cross-validate with early stopping; the number of rows in the result is used
# below as the number of boosting rounds.
clf_xgb_cv = xgb.cv(param, xgtrain, num_rounds,
                    stratified=True,
                    nfold=5,
                    early_stopping_rounds=20,
                    seed=2017)
print ("Optimal number of trees/estimators is %i" % clf_xgb_cv.shape[0])
watchlist  = [(xgtest,'test'), (xgtrain,'train')]
clf_xgb = xgb.train(param, xgtrain,clf_xgb_cv.shape[0], watchlist)

# predict() returns probabilities; convert them to class labels with a 0.5 cut-off.
# The booster was trained for exactly the chosen number of rounds and without
# early stopping, so all of its trees are used. Limiting the trees through
# `best_iteration` would pass an iteration index where a tree count is expected
# and relies on an attribute that early stopping is meant to set.
y_train_pred = (clf_xgb.predict(xgtrain) > 0.5).astype(int)
y_test_pred = (clf_xgb.predict(xgtest) > 0.5).astype(int)
print ("XGB - Train : %.2f" % metrics.accuracy_score(y_train_pred, y_train))
print ("XGB - Test : %.2f" % metrics.accuracy_score(y_test_pred, y_test))

4-19. アンサンブルモデル

In [None]:
pip install mlxtend --upgrade

In [None]:
# mlxtendをpip install しておく
import pandas as pd
import numpy as np

# 擬似乱数の生成
np.random.seed(2017)
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# 現在はsklearnではなくmlxtendの一部として利用可能
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("Data/Diabetes.csv")
X = df.iloc[:,:8]     # independent variables
y = df['class']       # dependent variable

# Split into training and test sets to evaluate the models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2017)
LR = LogisticRegression(random_state=2017, max_iter=200)
RF = RandomForestClassifier(n_estimators = 100, random_state=2017)
SVM = SVC(random_state=0, probability=True)
KNC = KNeighborsClassifier()
DTC = DecisionTreeClassifier()
ABC = AdaBoostClassifier(n_estimators = 100)
BC = BaggingClassifier(n_estimators = 100)
GBC = GradientBoostingClassifier(n_estimators = 100)
clfs = []
print('5-fold cross validation:\n')
for clf, label in zip([LR, RF, SVM, KNC, DTC, ABC, BC, GBC],
                      ['Logistic Regression',
                       'Random Forest',
                       'Support Vector Machine',
                       'KNeighbors',
                       'Decision Tree',
                       'Ada Boost',
                       'Bagging',
                       'Gradient Boosting']):

    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    # Fit on the training split only: fitting on the full X, y would let the model
    # see the test rows it is scored on in the next line.
    md = clf.fit(X_train, y_train)
    clfs.append(md)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))


4-20. アンサンブル投票モデル

In [None]:
# ### Ensemble voting
clfs = []
print('5-fold cross validation:\n')
# Hard voting takes the majority label; soft voting combines the class
# probabilities using the given (here equal) weights.
ECH = EnsembleVoteClassifier(clfs=[LR, RF, GBC], voting='hard')
ECS = EnsembleVoteClassifier(clfs=[LR, RF, GBC], voting='soft', weights=[1,1,1])
for clf, label in zip([ECH, ECS],
                      ['Ensemble Hard Voting',
                       'Ensemble Soft Voting']):
    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
    # Fit on the training split only: fitting on the full X, y would let the model
    # see the test rows it is scored on in the next line.
    md = clf.fit(X_train, y_train)
    clfs.append(md)
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))


4-21. スタッキングモデル

In [None]:
# クラス分類
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

seed = 2019
np.random.seed(seed)  # seed NumPy's global random state
# Load the data
df = pd.read_csv("Data/Diabetes.csv")
X = df.iloc[:,0:8] # independent variables
y = df['class'].values     # dependent variable

# Standardize
X = StandardScaler().fit_transform(X)
# Split into training and test sets for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
kfold = model_selection.StratifiedKFold(n_splits=5)
num_trees = 10
verbose = True # to print the progress
clfs = [KNeighborsClassifier(),
        RandomForestClassifier(n_estimators=num_trees, random_state=seed),
        GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)]
# Meta-features for blending: one column of positive-class probabilities per base model
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))
print('5-fold cross validation:\n')
for i, clf in enumerate(clfs):
    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=kfold, scoring='accuracy')
    print("##### Base Model %0.0f #####" % i)
    print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    # Training meta-features are OUT-OF-FOLD probabilities: every training row is
    # predicted by a model that was not fitted on it. In-sample probabilities from
    # a model fitted on the same rows would be over-confident and would not match
    # what the meta model sees for the test rows.
    dataset_blend_train[:,i] = model_selection.cross_val_predict(
        clf, X_train, y_train, cv=kfold, method='predict_proba')[:, 1]
    # The base model used for the test rows is fitted on the whole training split
    clf.fit(X_train, y_train)
    print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_train), y_train)))
    dataset_blend_test[:,i] = clf.predict_proba(X_test)[:, 1]
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))

print ("##### Meta Model #####")
# Logistic regression on top of the base-model probabilities
clf = LogisticRegression()
scores = model_selection.cross_val_score(clf, dataset_blend_train, y_train, cv=kfold, scoring='accuracy')
clf.fit(dataset_blend_train, y_train)
print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_train), y_train)))
print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_test), y_test)))


4-22.  ハイパーパラメータ調整のためのグリッドサーチ

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
seed = 2017
# Load the data

df = pd.read_csv("Data/Diabetes.csv")
X = df.iloc[:,:8].values     # independent variables
y = df['class'].values       # dependent variable

# Standardize
X = StandardScaler().fit_transform(X)

# Split into training and test sets to evaluate the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
kfold = model_selection.StratifiedKFold(n_splits=5)
num_trees = 100
# Passed unfitted: the search fits its own copy for every candidate, so fitting
# this instance beforehand would only cost time.
clf_rf = RandomForestClassifier(random_state=seed)
rf_params = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'criterion':  ['gini', 'entropy'],
    # 'auto' is left out: for classifiers it selects the same setting as 'sqrt'
    # (a duplicate candidate) and newer scikit-learn releases reject it.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [1, 3, 5, 7, 9]
}

# verbose=10 makes the search print progress messages while it runs
grid = GridSearchCV(clf_rf, rf_params, scoring='roc_auc', cv=kfold, verbose=10, n_jobs=-1)
grid.fit(X_train, y_train)
print ('Best Parameters: ', grid.best_params_)
results = model_selection.cross_val_score(grid.best_estimator_, X_train,y_train, cv=kfold)
print ("Accuracy - Train CV: ", results.mean())
print ("Accuracy - Train : ", metrics.accuracy_score(grid.best_estimator_.predict(X_train), y_train))
print ("Accuracy - Test : ", metrics.accuracy_score(grid.best_estimator_.predict(X_test), y_test))


4-23.ハイパーパラメータ調整のためのランダムサーチ

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Specify the parameters to sample and their distributions
param_dist = {'n_estimators':sp_randint(100,1000),
              'criterion': ['gini', 'entropy'],
              # 'auto' is left out: for classifiers it selects the same setting as
              # 'sqrt' and newer scikit-learn releases reject it.
              'max_features': [None, 'sqrt', 'log2'],
              'max_depth': [None, 1, 3, 5, 7, 9]
             }
# Run the random search (20 sampled parameter settings)
n_iter_search = 20
random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist, cv=kfold, n_iter=n_iter_search, verbose=10, n_jobs=-1, random_state=seed)
random_search.fit(X_train, y_train)
print ('Best Parameters: ', random_search.best_params_)
results = model_selection.cross_val_score(random_search.best_estimator_, X_train,y_train, cv=kfold)
print ("Accuracy - Train CV: ", results.mean())
print ("Accuracy - Train : ", metrics.accuracy_score(random_search.best_estimator_.predict(X_train), y_train))
print ("Accuracy - Test : ", metrics.accuracy_score(random_search.best_estimator_.predict(X_test), y_test))


 4-24. ハイパーパラメータ調整のためのベイズ最適化

In [None]:
pip install bayesian-optimization

In [None]:
# baysian-optimizationをpip installしておく
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from bayes_opt.util import Colours
from sklearn.ensemble import RandomForestClassifier as RFC
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """
    Cross-validate a random forest.

    Instantiates a random forest classifier from n_estimators,
    min_samples_split and max_features, then cross-validates it on
    `data` / `targets` with 4 folds and the 'neg_log_loss' scorer.
    The aim is to find the parameter combination that minimizes log loss.

    Returns the mean of the per-fold scores.
    """
    forest_settings = dict(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        random_state=2,
    )
    fold_scores = cross_val_score(
        RFC(**forest_settings), data, targets, scoring='neg_log_loss', cv=4
    )
    return fold_scores.mean()
def optimize_rfc(data, targets):
    """
    Apply Bayesian optimization to the random forest parameters.

    Builds a BayesianOptimization over n_estimators, min_samples_split and
    max_features, runs maximize(n_iter=10) with rfc_cv's score as the
    objective, prints the best result found and returns the optimizer object.

    data, targets : forwarded unchanged to rfc_cv on every evaluation.
    """
    def rfc_crossval(n_estimators, min_samples_split, max_features):
        """
        Wrapper around the random forest cross-validation.

        n_estimators and min_samples_split are cast to int before being passed
        on, and max_features is clamped to [1e-3, 0.999] so that it cannot take
        values outside the (0, 1) range.
        """
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            data=data,
            targets=targets,
        )
    # Search bounds for each parameter, given as (lower, upper)
    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (10, 250),
            "min_samples_split": (2, 25),
            "max_features": (0.1, 0.999),
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=10)
    print("Final result:", optimizer.max)
    return optimizer
# Run the optimization on X_train / y_train (names left bound by an earlier cell)
print(Colours.green("--- Optimizing Random Forest ---"))
optimize_rfc(X_train, y_train)


4-25. ウェーブレット変換の実装

In [None]:
import pywt
from statsmodels.robust import mad
import pandas as pd
import numpy as np
# Load the data that is denoised and plotted below
df = pd.read_csv('Data/Temperature.csv')

# ウェーブレット変換を用いてセンサデータをノイズ除去する
def wp_denoise(df):
    """
    Denoise every column of `df` with wavelet-packet soft thresholding.

    For each column a 'db7' wavelet packet is built from the data. For every
    node on levels 0 .. maxlevel-1 the node's data is soft-thresholded with
    sigma * sqrt(2 * log(n)), where sigma = mad(node data) and n is the number
    of values in the node, and stored under the same path in a second packet.
    The column is then reconstructed from that second packet.

    Parameters
    ----------
    df : DataFrame whose columns are the signals to denoise

    Returns
    -------
    A new DataFrame with the same columns holding the denoised signals.
    The input frame is left untouched.
    """
    # Work on a copy: the caller may pass a slice of a larger frame and still
    # needs the original, noisy values afterwards.
    denoised = df.copy()
    for column in df:
        x = df[column]
        wp = pywt.WaveletPacket(data=x, wavelet='db7', mode='symmetric')
        # Same mode name as the decomposition above
        new_wp = pywt.WaveletPacket(data=None, wavelet='db7', mode='symmetric')
        for i in range(wp.maxlevel):
            nodes = [node.path for node in wp.get_level(i, 'natural')]
            # Threshold the data of every node on this level
            for node in nodes:
                sigma = mad(wp[node].data)
                uthresh = sigma * np.sqrt( 2*np.log( len( wp[node].data ) ) )
                new_wp[node] = pywt.threshold(wp[node].data, value=uthresh, mode='soft')
        # Reconstruct inside the column loop so that EVERY column is written
        # back, not only the last one; trim to the original length.
        denoised[column] = new_wp.reconstruct(update=False)[:len(x)]
    return denoised
    
# Remove the noise from the sensor data (only the column at position 3 is passed in)
df_denoised = wp_denoise(df.iloc[:,3:4])
df['Date'] = pd.to_datetime(df['Date'])
# Two panels of a 2x2 grid: the '4030CFDC' column of df on the left,
# the same column of the wp_denoise result on the right
plt.figure(1)
ax1 = plt.subplot(221)
df['4030CFDC'].plot(ax=ax1, figsize=(8, 8), title='Signal with noise')
ax2 = plt.subplot(222)
df_denoised['4030CFDC'].plot(ax=ax2, figsize=(8, 8), title='Signal without noise')
plt.tight_layout()