<a href="https://colab.research.google.com/github/entanglement-nak/portfolio-nak/blob/main/%E3%83%9D%E3%83%BC%E3%83%88%E3%83%95%E3%82%A9%E3%83%AA%E3%82%AA%E3%80%80%E6%B1%BA%E5%AE%9A%E6%9C%A8%E5%88%86%E9%A1%9E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.utils import resample

In [None]:
# Mount Google Drive at /content/drive (the data paths used below live there)
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the Wafer train / test spreadsheets on the mounted drive
train_path, test_path = (
    r"/content/drive/MyDrive/Wafer/Wafer_TRAIN.xlsx",
    r"/content/drive/MyDrive/Wafer/Wafer_TEST.xlsx",
)

In [None]:
# Read both spreadsheets with header=None, i.e. the first row is data and the
# columns receive integer labels.
train_df, test_df = (
    pd.read_excel(path, header=None) for path in (train_path, test_path)
)

In [None]:
# Stack the train and test rows vertically into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 row index; without it each
# input keeps its own row labels, so the combined index contains duplicates and
# any later label-based alignment (e.g. an axis=1 concat or .loc) is ambiguous.
combined_data = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [None]:
# The first column holds the target; every remaining column is a feature.
target_col = combined_data.columns[0]
X = combined_data.drop(columns=target_col)
y = combined_data[target_col]

In [None]:
# Hold out 30% of the rows for testing (70/30 split) with a fixed seed
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Rebuild one training frame (features plus the label as the last column),
# separate the normal (label 1) and anomaly (label -1) rows, then resample the
# anomaly rows with replacement until there are as many as normal rows.
df_train = pd.concat([X_train, y_train], axis=1)
train_labels = df_train[df_train.columns[-1]]
df_normal_train = df_train[train_labels == 1]
df_anomaly_train = df_train[train_labels == -1]

df_anomaly_upsampled = resample(
    df_anomaly_train,
    replace=True,
    n_samples=len(df_normal_train),
    random_state=123,
)
df_upsampled_train = pd.concat([df_normal_train, df_anomaly_upsampled])

In [None]:
# Split the upsampled frame back into features and target; the target is the
# last column of the frame.
upsampled_label_col = df_upsampled_train.columns[-1]
X_train_upsampled = df_upsampled_train.drop(columns=upsampled_label_col)
y_train_upsampled = df_upsampled_train[upsampled_label_col]

In [None]:
# Decision tree classifier limited to a depth of 10.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can resolve
# differently, so the fitted tree and the metrics derived from it may change
# from run to run.
decision_tree = DecisionTreeClassifier(max_depth=10, random_state=42)

In [None]:
# Fit the tree on the class-balanced (upsampled) training data
decision_tree.fit(X=X_train_upsampled, y=y_train_upsampled)

In [None]:
# Predict labels for the held-out test features
y_pred = decision_tree.predict(X=X_test)

In [None]:
# Accuracy plus macro- and weighted-average F1 on the test predictions
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
macro_f1, weighted_f1 = (
    f1_score(y_test, y_pred, average=averaging)
    for averaging in ('macro', 'weighted')
)

In [None]:
# Depth actually reached by the fitted tree
tree_depth = decision_tree.get_depth()

# Show all summary values together as the cell output
(
    accuracy,
    macro_f1,
    weighted_f1,
    tree_depth,
)

(0.9790697674418605, 0.9467492260061919, 0.9798994888040896, 10)

    上記の結果は…
    正確さ（Accuracy）: 0.979
    マクロ平均F1スコア: 0.947
    重み付き平均F1スコア: 0.980



↓↓過学習していないかチェックを行う。↓↓

In [None]:
# Macro- and weighted-average F1 on the test predictions
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

# Overfitting check: accuracy on the (upsampled) data the tree was fitted on
# versus accuracy on the held-out test data.
y_train_pred = decision_tree.predict(X_train_upsampled)
train_accuracy = accuracy_score(y_train_upsampled, y_train_pred)
test_accuracy = accuracy_score(y_test, y_pred)

(
    macro_f1,
    weighted_f1,
    train_accuracy,
    test_accuracy,
)

(0.9467492260061919,
 0.9798994888040896,
 0.9893688451208594,
 0.9790697674418605)

以下では、最適なハイパーパラメーターを探すために、scikit-learn の RandomizedSearchCV を利用してみる。

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate tree depths to search over: 15 through 21 inclusive
param_dist = {'max_depth': list(range(15, 22))}

In [None]:
# Base estimator for the hyperparameter search. random_state is pinned because
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ between runs and make the search scores non-reproducible.
tree_classifier = DecisionTreeClassifier(random_state=42)

In [None]:
# Randomized search over max_depth using 5-fold cross-validation, scored by
# accuracy, on all available cores.
# n_iter is capped at the number of candidate depths: asking for more draws
# than there are candidates makes scikit-learn emit a warning and silently run
# only that many iterations anyway. (param_dist has a single key, so the number
# of candidates is just the length of its max_depth list.)
n_candidates = len(param_dist['max_depth'])
random_search = RandomizedSearchCV(
    tree_classifier,
    param_distributions=param_dist,
    n_iter=min(10, n_candidates),
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42,
)

探索を行う。

In [None]:
# Run the search on the training split only, so the rows held out as
# X_test / y_test stay unseen during hyperparameter selection.
# The original (non-upsampled) split is used on purpose: duplicating rows
# before cross-validation would put copies of the same sample into both the
# fitting and the validation folds.
random_search.fit(X_train, y_train)



In [None]:
# Report the winning parameter setting and its mean cross-validated score
best_params = random_search.best_params_
best_score = random_search.best_score_
print("Best Hyperparameters: ", best_params)
print("Best Score: ", best_score)

Best Hyperparameters:  {'max_depth': 17}
Best Score:  0.9902289216278698


    スコアがわずかに上昇することを確認した。ただし、この値は5分割交差検証の平均正解率であり、先のホールドアウト評価（テストデータでの正確さ）とは評価条件が異なる点に注意。