<a href="https://colab.research.google.com/github/entanglement-nak/portfolio-nak/blob/main/%E3%83%9D%E3%83%BC%E3%83%88%E3%83%95%E3%82%A9%E3%83%AA%E3%82%AA%E3%80%80%E3%83%AD%E3%82%B8%E3%82%B9%E3%83%86%E3%82%A3%E3%83%83%E3%82%AF%E5%9B%9E%E5%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.utils import resample

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# データパス
train_path = r"/content/drive/MyDrive/Wafer/Wafer_TRAIN.xlsx"

test_path = r"/content/drive/MyDrive/Wafer/Wafer_TEST.xlsx"

In [None]:
train_df = pd.read_excel(train_path, header=None)

test_df = pd.read_excel(test_path, header=None)

In [None]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,143,144,145,146,147,148,149,150,151,152
0,1,-1.602294,-1.670823,-1.693666,-1.699377,-1.699377,-1.70366,-1.70366,-1.70366,-1.70366,...,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432,-1.145432
1,1,1.084591,1.084591,1.084591,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,...,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308,1.065308
2,1,0.362689,0.362689,0.362689,0.393316,0.362689,0.362689,0.362689,0.362689,0.362689,...,0.393316,0.393316,0.393316,0.362689,0.393316,0.393316,0.393316,0.393316,0.393316,0.393316
3,1,-1.094523,-1.094523,-1.094523,-1.096732,-1.094523,-1.096732,-1.094523,-1.094523,-1.094523,...,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732,-1.096732
4,1,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,...,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761,-1.103761


In [None]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,143,144,145,146,147,148,149,150,151,152
0,1,-1.079707,-1.083256,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,...,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707,-1.079707
1,1,-1.080197,-1.078021,-1.078021,-1.078021,-1.078021,-1.078021,-1.080197,-1.078021,-1.078021,...,-1.11067,-1.112846,-1.115023,-1.117199,-1.119376,-1.121553,-1.123729,-1.125906,-1.128082,-1.130259
2,1,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,...,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406,-1.15406
3,1,-1.658965,-1.696605,-1.705291,-1.711082,-1.711082,-1.715425,-1.715425,-1.715425,-1.715425,...,-1.149382,-1.149382,-1.149382,-1.149382,-1.149382,-1.149382,-1.149382,-1.149382,-1.149382,-1.149382
4,1,-1.201221,-1.199042,-1.199042,-1.201221,-1.199042,-1.199042,-1.199042,-1.199042,-1.199042,...,-1.185968,-1.183789,-1.18161,-1.179431,-1.177251,-1.175072,-1.172893,-1.170714,-1.168535,-1.166356


In [None]:
# トレインデータとテストデータを縦に統合（スタック）
combined_data = pd.concat([train_df, test_df], axis=0)

In [None]:
# NaN値を各列の平均値で埋める
combined_data_filled = combined_data.fillna(combined_data.mean())

In [None]:
# 無限値をチェックし、存在する場合は置換
combined_data_cleaned = combined_data_filled.replace([np.inf, -np.inf], np.nan).fillna(combined_data_filled.mean())

In [None]:
# データを特徴量(X)とターゲット(y)に分割
X = combined_data_cleaned.drop(combined_data_cleaned.columns[0], axis=1)
y = combined_data_cleaned[combined_data_cleaned.columns[0]]

In [None]:
# データを訓練用とテスト用に7:3の比率で分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# 訓練データ内で正常と異常のクラスを分離し、異常クラスのアップサンプリングを行う
df_train = pd.concat([X_train, y_train], axis=1)
df_normal_train = df_train[df_train[df_train.columns[-1]] == 1]
df_anomaly_train = df_train[df_train[df_train.columns[-1]] == -1]

df_anomaly_upsampled = resample(df_anomaly_train, replace=True, n_samples=len(df_normal_train), random_state=123)
df_upsampled_train = pd.concat([df_normal_train, df_anomaly_upsampled])

In [None]:
# アップサンプリングされたデータを特徴量(X)とターゲット(y)に分割
X_train_upsampled = df_upsampled_train.drop(df_upsampled_train.columns[-1], axis=1)
y_train_upsampled = df_upsampled_train[df_upsampled_train.columns[-1]]

In [None]:
# ロジスティック回帰モデルの初期化

logreg = LogisticRegression(max_iter=1000)

In [None]:
def train_model(X, y):
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)
    return model

# ロジスティック回帰モデルの初期化と訓練
logreg_model = train_model(X_train_upsampled, y_train_upsampled)

In [None]:
# 訓練データでモデルを訓練

logreg.fit(X_train_upsampled, y_train_upsampled)

In [None]:
# テストデータでの予測
y_pred = logreg.predict(X_test)

In [None]:
# 正確さ（Accuracy）とF1スコアの計算
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

accuracy, macro_f1, weighted_f1

(0.8944186046511627, 0.7880780956425417, 0.9080340998952081)

    テストデータの結果は…
    正確さ（Accuracy）: 0.89
    マクロ平均F1スコア: 0.79
    重み付き平均F1スコア: 0.91

↓↓過学習していないかチェックを行う。↓↓

In [None]:
# 訓練データでの予測
y_train_pred = logreg.predict(X_train_upsampled)

In [None]:
# 訓練データの性能指標を計算
train_accuracy = accuracy_score(y_train_upsampled, y_train_pred)
train_macro_f1 = f1_score(y_train_upsampled, y_train_pred, average='macro')
train_weighted_f1 = f1_score(y_train_upsampled, y_train_pred, average='weighted')

In [None]:
# テストデータでの性能指標は既に計算されている
train_accuracy, train_macro_f1, train_weighted_f1, accuracy, macro_f1, weighted_f1

(0.9090196956132498,
 0.9089789783111548,
 0.9089789783111548,
 0.8944186046511627,
 0.7880780956425417,
 0.9080340998952081)

    訓練データの結果は…
    正確さ（Accuracy）: 0.91
    マクロ平均F1スコア: 0.91
    重み付き平均F1スコア: 0.91

    テストデータの結果は…
    正確さ（Accuracy）: 0.89
    マクロ平均F1スコア: 0.79
    重み付き平均F1スコア: 0.90

以下はベストなハイパーパラメーターを探すため、RandomizedSearchCVのライブラリを利用してみる。

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# ハイパーパラメーターの範囲を指定
param_dist = {
    'C': np.logspace(np.log10(2900), np.log10(3500), 7),
    'solver': ['liblinear', 'saga']
}

# モデルの初期化
logreg = LogisticRegression()

# RandomizedSearchCVを設定
random_search = RandomizedSearchCV(logreg, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy')

# ハイパーパラメーターチューニングを実行
# X と y はトレーニングデータとラベルです
random_search.fit(X, y)

# 最適なハイパーパラメーターを表示
print("Best hyperparameters:", random_search.best_params_)



Best hyperparameters: {'solver': 'liblinear', 'C': 2992.331279006868}


    RandomizedSearchCVのパラメーターのチューニングの結果、「2992」が最適であるという結果に。
    再度、訓練を行ってみる。

In [None]:
# ロジスティック回帰モデルの初期化
logreg = LogisticRegression(max_iter=2992)

In [None]:
# 訓練データでモデルを訓練
logreg.fit(X_train_upsampled, y_train_upsampled)

In [None]:
# テストデータでの予測
y_pred = logreg.predict(X_test)

In [None]:
# 正確さ（Accuracy）とF1スコアの計算
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

accuracy, macro_f1, weighted_f1

(0.8944186046511627, 0.7880780956425417, 0.9080340998952081)

劇的な変化は見受けられなかった。