<a href="https://colab.research.google.com/github/entanglement-nak/portfolio-nak/blob/main/XGboost_memory_profiler_%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%A2%E3%82%B8%E3%83%A5%E3%83%BC%E3%83%AB%E3%81%AE%E3%82%A2%E3%83%89%E3%83%90%E3%82%A4%E3%82%B9%E6%9C%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
# Load the memory_profiler IPython extension; the %memit magic used in later cells comes from it.
%load_ext memory_profiler

In [None]:
import numpy as np
import pandas as pd
from memory_profiler import profile
from sklearn.metrics import accuracy_score, classification_report, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from xgboost import XGBClassifier

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in later cells
# point below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# データの読み込み関数
# profile
def load_data(train_path, test_path):
    """Read the training and test spreadsheets into DataFrames.

    Both files are read with ``pd.read_excel(..., header=None)``, so the first
    row is treated as data and the columns receive default integer labels.

    Args:
        train_path: Path of the training Excel file.
        test_path: Path of the test Excel file.

    Returns:
        A ``(train_df, test_df)`` tuple, in that order.
    """
    # Read in the same order as the arguments: training file first, then test file.
    frames = [pd.read_excel(path, header=None) for path in (train_path, test_path)]
    return frames[0], frames[1]

In [None]:
# データの前処理
# profile
def preprocess_data(train_df, test_df):
    """Merge the train/test frames, impute missing values and split off the label.

    The two frames are stacked row-wise. Infinite values are treated as
    missing, and every missing cell is filled with the mean of its column's
    finite values. The first column is taken as the label and mapped from
    {-1, 1} to {0, 1}; the remaining columns are the features.

    Args:
        train_df: Training DataFrame whose first column holds the label.
        test_df: Test DataFrame with the same column layout.

    Returns:
        ``(X, y)`` where ``X`` is the feature DataFrame (all columns except the
        first) and ``y`` is the label Series with -1 mapped to 0 and 1 kept as 1.
    """
    combined = pd.concat([train_df, test_df], axis=0)

    # Turn +/-inf into NaN *before* computing the column means. Otherwise a
    # column containing inf has a mean of inf (or NaN), and its gaps would be
    # filled with inf/NaN again instead of a usable value.
    finite = combined.replace([np.inf, -np.inf], np.nan)
    cleaned = finite.fillna(finite.mean())

    label_column = cleaned.columns[0]
    X = cleaned.drop(label_column, axis=1)
    y = cleaned[label_column].map({-1: 0, 1: 1})  # label conversion: -1 -> 0, 1 -> 1
    return X, y

In [None]:
# データの分割とアップサンプリング
# profile
def split_and_upsample(X, y):
    """Split into train/test sets and balance the training set by upsampling.

    A 70/30 split is made with ``random_state=42``. Within the training part,
    rows labelled 0 (the anomaly class) are resampled with replacement
    (``random_state=123``) until they match the number of rows labelled 1
    (the normal class). The test part is returned untouched.

    Args:
        X: Feature DataFrame.
        y: Label Series with values 0 (anomaly) and 1 (normal).

    Returns:
        ``(X_train_upsampled, y_train_upsampled, X_test, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Put features and label side by side; the label ends up as the last column.
    train_frame = pd.concat([X_train, y_train], axis=1)
    label_column = train_frame.columns[-1]

    # Separate the normal (1) and anomaly (0) rows.
    normal_rows = train_frame[train_frame[label_column] == 1]
    anomaly_rows = train_frame[train_frame[label_column] == 0]

    # Draw anomaly rows with replacement until both classes have equal size.
    anomaly_rows_upsampled = resample(
        anomaly_rows,
        replace=True,
        n_samples=len(normal_rows),
        random_state=123,
    )
    balanced_frame = pd.concat([normal_rows, anomaly_rows_upsampled])

    balanced_label_column = balanced_frame.columns[-1]
    X_train_upsampled = balanced_frame.drop(balanced_label_column, axis=1)
    y_train_upsampled = balanced_frame[balanced_label_column]
    return X_train_upsampled, y_train_upsampled, X_test, y_test

In [None]:
# モデルの訓練
# profile
def train_model(X_train_upsampled, y_train_upsampled):
    """Fit an XGBoost classifier on the (upsampled) training data.

    Args:
        X_train_upsampled: Training features.
        y_train_upsampled: Training labels.

    Returns:
        The fitted ``XGBClassifier`` instance.
    """
    classifier = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
    )
    classifier.fit(X_train_upsampled, y_train_upsampled)
    return classifier

In [None]:
# メイン関数
def main(train_path=r"/content/drive/MyDrive/Wafer/Wafer_TRAIN.xlsx",
         test_path=r"/content/drive/MyDrive/Wafer/Wafer_TEST.xlsx"):
    """Run the whole pipeline once and print the evaluation metrics.

    Steps: load the two Excel files, preprocess them, split and upsample,
    train the model, then predict on the held-out set and print accuracy,
    macro F1, weighted F1 and MCC.

    Args:
        train_path: Path of the training Excel file. Defaults to the Wafer
            training file on the mounted Google Drive.
        test_path: Path of the test Excel file. Defaults to the Wafer test
            file on the mounted Google Drive.

    Returns:
        None. The metrics are printed.
    """
    # Load the data
    train_df, test_df = load_data(train_path, test_path)
    # Preprocess the data
    X, y = preprocess_data(train_df, test_df)
    # Split the data and upsample the training part
    X_train_upsampled, y_train_upsampled, X_test, y_test = split_and_upsample(X, y)

    # Train and evaluate the model
    model = train_model(X_train_upsampled, y_train_upsampled)
    accuracy, macro_f1, weighted_f1, mcc = predict_and_evaluate(model, X_test, y_test)

    print(f"Accuracy: {accuracy}, Macro F1: {macro_f1}, Weighted F1: {weighted_f1}, MCC: {mcc}")

In [None]:
# 予測と評価
# profile
def predict_and_evaluate(model, X_test, y_test):
    """Predict on the test set and score the predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        ``(accuracy, macro_f1, weighted_f1, mcc)`` computed from ``y_test``
        and the model's predictions.
    """
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='macro'),
        f1_score(y_test, predictions, average='weighted'),
        matthews_corrcoef(y_test, predictions),
    )

In [None]:
# Run the whole pipeline 10 times; %memit reports the memory usage of each main() call.
for i in range(10):
  print(f"{i+1}回目の計測")
  %memit main()
  print("\n")

1回目の計測
Accuracy: 0.9986046511627907, Macro F1: 0.9961636258196904, Weighted F1: 0.9986089211779682, MCC: 0.992356503372486
peak memory: 334.95 MiB, increment: 136.58 MiB


2回目の計測
Accuracy: 0.9986046511627907, Macro F1: 0.9961636258196904, Weighted F1: 0.9986089211779682, MCC: 0.992356503372486
peak memory: 363.86 MiB, increment: 88.23 MiB


3回目の計測
Accuracy: 0.9986046511627907, Macro F1: 0.9961636258196904, Weighted F1: 0.9986089211779682, MCC: 0.992356503372486
peak memory: 360.05 MiB, increment: 80.84 MiB


4回目の計測
Accuracy: 0.9986046511627907, Macro F1: 0.9961636258196904, Weighted F1: 0.9986089211779682, MCC: 0.992356503372486
peak memory: 364.29 MiB, increment: 83.59 MiB


5回目の計測
Accuracy: 0.9986046511627907, Macro F1: 0.9961636258196904, Weighted F1: 0.9986089211779682, MCC: 0.992356503372486
peak memory: 365.20 MiB, increment: 82.99 MiB


6回目の計測
Accuracy: 0.9986046511627907, Macro F1: 0.9961636258196904, Weighted F1: 0.9986089211779682, MCC: 0.992356503372486
peak memory: 361.77 M

In [None]:
def main(train_path=r"/content/drive/MyDrive/Wafer/Wafer_TRAIN.xlsx",
         test_path=r"/content/drive/MyDrive/Wafer/Wafer_TEST.xlsx"):
    """Train a default XGBClassifier and print accuracy, MCC, F1 and a report.

    NOTE: this redefines the ``main`` declared in an earlier cell; once this
    cell has run, ``main()`` refers to this version.

    ``classification_report`` must be imported from ``sklearn.metrics`` in the
    notebook's import cell, otherwise this function raises ``NameError``.

    Args:
        train_path: Path of the training Excel file. Defaults to the Wafer
            training file on the mounted Google Drive.
        test_path: Path of the test Excel file. Defaults to the Wafer test
            file on the mounted Google Drive.

    Returns:
        None. The metrics are printed.
    """
    # Load the data
    train_df, test_df = load_data(train_path, test_path)

    # Preprocess the data
    X, y = preprocess_data(train_df, test_df)

    # Split the data and upsample the training part
    X_train_upsampled, y_train_upsampled, X_test, y_test = split_and_upsample(X, y)

    # Train the model and predict on the held-out set
    xgb = XGBClassifier()
    xgb.fit(X_train_upsampled, y_train_upsampled)
    y_pred = xgb.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='binary')
    report = classification_report(y_test, y_pred)

    # Print the evaluation results
    print("\n")
    print(f"正解率：{accuracy}")
    print(f"MCC：{mcc}")
    print(f"F1：{f1}")
    print(report)

# Call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 

In [None]:
import time

# ファイルパスの定義
train_path = r"/content/drive/MyDrive/Wafer/Wafer_TRAIN.xlsx"
test_path = r"/content/drive/MyDrive/Wafer/Wafer_TEST.xlsx"

# データの読み込み
train_df, test_df = load_data(train_path, test_path)

# データの前処理
X, y = preprocess_data(train_df, test_df)
y.replace({-1: 0, 1:1}, inplace=True)

# データの分割とアップサンプリング
X_train_upsampled, y_train_upsampled, X_test, y_test = split_and_upsample(X, y)

# モデルのトレーニングと評価
xgb = XGBClassifier()
xgb.fit(X_train_upsampled, y_train_upsampled)

#予測のステップだけ10回計測する
for i in range(10):
  print(f"{i+1}回目の計測")
  # 予測時間の計測を開始
  start_time = time.time()
  %memit xgb.predict(X_test)
  # 予測時間の計測を終了
  end_time = time.time()
  # 予測にかかった時間を計算
  elapsed_time = end_time - start_time
  print(f"予測にかかった時間: {elapsed_time} 秒")
  print("\n")

1回目の計測
peak memory: 393.05 MiB, increment: 0.00 MiB
予測にかかった時間: 0.27373790740966797 秒


2回目の計測
peak memory: 393.08 MiB, increment: 0.00 MiB
予測にかかった時間: 0.2836177349090576 秒


3回目の計測
peak memory: 393.11 MiB, increment: 0.03 MiB
予測にかかった時間: 0.285442590713501 秒


4回目の計測
peak memory: 393.12 MiB, increment: 0.01 MiB
予測にかかった時間: 0.27779531478881836 秒


5回目の計測
peak memory: 393.12 MiB, increment: 0.00 MiB
予測にかかった時間: 0.2963902950286865 秒


6回目の計測
peak memory: 393.17 MiB, increment: 0.00 MiB
予測にかかった時間: 0.2857241630554199 秒


7回目の計測
peak memory: 393.17 MiB, increment: 0.00 MiB
予測にかかった時間: 0.28411269187927246 秒


8回目の計測
peak memory: 393.17 MiB, increment: 0.00 MiB
予測にかかった時間: 0.28944873809814453 秒


9回目の計測
peak memory: 393.17 MiB, increment: 0.00 MiB
予測にかかった時間: 0.2859621047973633 秒


10回目の計測
peak memory: 393.17 MiB, increment: 0.00 MiB
予測にかかった時間: 0.2930879592895508 秒


