<a href="https://colab.research.google.com/github/entanglement-nak/portfolio-nak/blob/main/lightbgm_memory_profiler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
from memory_profiler import profile
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import time

In [None]:
# Load the memory_profiler IPython extension, which provides the %memit magic
# used by the measurement loop at the end of the notebook.
%load_ext memory_profiler

In [None]:
# Mount Google Drive at /content/drive (Colab only); main() reads the Wafer
# Excel files from underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the data
def load_data(train_path, test_path):
    """Read the training and test Excel workbooks into DataFrames.

    Both files are read with ``header=None``, so the first row is treated as
    data and the columns receive integer labels.

    Args:
        train_path: Location of the training workbook.
        test_path: Location of the test workbook.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames, in that order.
    """
    train_df, test_df = (
        pd.read_excel(workbook, header=None)
        for workbook in (train_path, test_path)
    )
    return train_df, test_df

In [None]:
# Preprocess the data
def preprocess_data(train_df, test_df):
    """Merge the train/test frames, impute bad values and split off the label.

    The two frames are stacked row-wise, every +/-inf is turned into NaN, and
    each NaN is then replaced by the mean of its column (computed over the
    remaining finite values). The first column is returned as the target and
    all other columns as the feature matrix.

    Args:
        train_df: DataFrame whose first column holds the label.
        test_df: DataFrame with the same column layout as ``train_df``.

    Returns:
        ``(X, y)`` where ``X`` is the DataFrame of feature columns and ``y`` is
        the Series taken from the first column. Both share a fresh 0..n-1 index.
    """
    # ignore_index avoids duplicate row labels (both inputs normally start at 0),
    # which would otherwise make later label-aligned operations fragile.
    combined = pd.concat([train_df, test_df], axis=0, ignore_index=True)

    # Infinities must become NaN *before* the means are computed: a column mean
    # that includes inf is itself inf/NaN and would be useless as a fill value.
    finite_only = combined.replace([np.inf, -np.inf], np.nan)
    imputed = finite_only.fillna(finite_only.mean())

    # NOTE: the label column goes through the same imputation as the features;
    # a column that is entirely NaN keeps its NaNs because its mean is NaN.
    label_column = imputed.columns[0]
    X = imputed.drop(label_column, axis=1)
    y = imputed[label_column]
    return X, y

In [None]:
# Split the data and upsample the anomaly class
def split_and_upsample(X, y, test_size=0.3, random_state=42,
                       normal_label=1, anomaly_label=-1,
                       upsample_random_state=123):
    """Hold out a test set and balance the training set by oversampling.

    The data is split with ``train_test_split``; within the training part the
    rows labelled ``anomaly_label`` are resampled with replacement until there
    are as many of them as rows labelled ``normal_label``. Training rows with
    any other label are dropped. The test part is returned untouched.

    Args:
        X: Feature DataFrame.
        y: Label Series with one entry per row of ``X``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test split.
        normal_label: Label value of the class left as is.
        anomaly_label: Label value of the class that is oversampled.
        upsample_random_state: Seed for the oversampling draw.

    Returns:
        ``(X_train_upsampled, y_train_upsampled, X_test, y_test)``. The
        training rows are ordered normal rows first, then resampled anomalies.

    Raises:
        ValueError: If the training split contains no ``anomaly_label`` rows,
            so there is nothing to oversample from.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Positional boolean masks: selection does not depend on index labels
    # being unique or on where the label column ends up after a concat.
    is_normal = (y_train == normal_label).to_numpy()
    is_anomaly = (y_train == anomaly_label).to_numpy()

    if not is_anomaly.any():
        raise ValueError(
            f"No training rows with label {anomaly_label!r}; cannot upsample."
        )

    # Resampling X and y together applies the same drawn row positions to both.
    X_anomaly_upsampled, y_anomaly_upsampled = resample(
        X_train.loc[is_anomaly],
        y_train.loc[is_anomaly],
        replace=True,
        n_samples=int(is_normal.sum()),
        random_state=upsample_random_state,
    )

    X_train_upsampled = pd.concat([X_train.loc[is_normal], X_anomaly_upsampled])
    y_train_upsampled = pd.concat([y_train.loc[is_normal], y_anomaly_upsampled])
    return X_train_upsampled, y_train_upsampled, X_test, y_test

In [None]:
# Train the model
def train_model(X_train_upsampled, y_train_upsampled):
    """Fit a LightGBM classifier on the given training set.

    Args:
        X_train_upsampled: Training features.
        y_train_upsampled: Training labels aligned with the features.

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    hyperparams = {"max_depth": 10, "learning_rate": 0.1, "n_estimators": 100}
    classifier = lgb.LGBMClassifier(**hyperparams)
    classifier.fit(X_train_upsampled, y_train_upsampled)
    return classifier

In [None]:
# Predict and evaluate (with timing)
def predict_and_evaluate(model, X_test, y_test):
    """Time a single ``model.predict`` call and score its predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        ``(accuracy, macro_f1, weighted_f1, mcc, elapsed_time)`` where
        ``elapsed_time`` is the duration of the predict call in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock; time.time() is wall
    # clock time and is unsuitable for timing a call that takes milliseconds.
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    elapsed_time = time.perf_counter() - start_time

    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_test, y_pred)

    return accuracy, macro_f1, weighted_f1, mcc, elapsed_time

In [None]:
# Main entry point
def main(train_path=r"/content/drive/MyDrive/Wafer/Wafer_TRAIN.xlsx",
         test_path=r"/content/drive/MyDrive/Wafer/Wafer_TEST.xlsx"):
    """Run the full pipeline once: load, preprocess, split, train, evaluate.

    The evaluation metrics and the prediction time are printed; nothing is
    returned.

    Args:
        train_path: Excel file with the training data. Defaults to the Wafer
            training file on the mounted Google Drive.
        test_path: Excel file with the test data. Defaults to the Wafer test
            file on the mounted Google Drive.
    """
    # Load the data
    train_df, test_df = load_data(train_path, test_path)

    # Preprocess the data
    X, y = preprocess_data(train_df, test_df)

    # Split the data and upsample the training part
    X_train_upsampled, y_train_upsampled, X_test, y_test = split_and_upsample(X, y)

    # Train the model
    model = train_model(X_train_upsampled, y_train_upsampled)

    # Predict and evaluate
    accuracy, macro_f1, weighted_f1, mcc, elapsed_time = predict_and_evaluate(model, X_test, y_test)

    # Print the results
    print(f"Accuracy: {accuracy}, Macro F1: {macro_f1}, Weighted F1: {weighted_f1}, MCC: {mcc}")
    print(f"予測にかかった時間: {elapsed_time} 秒")

In [None]:
# 10回計測する
for i in range(10):
    print(f"{i+1}回目の計測")
    %memit main()
    print("\n")

1回目の計測
[LightGBM] [Info] Number of positive: 4468, number of negative: 4468
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38759
[LightGBM] [Info] Number of data points in the train set: 8936, number of used features: 152
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy: 0.9976744186046511, Macro F1: 0.9936060430328174, Weighted F1: 0.9976815352966135, MCC: 0.9872412146086262
予測にかかった時間: 0.011213541030883789 秒
peak memory: 325.06 MiB, increment: 115.03 MiB


2回目の計測
[LightGBM] [Info] Number of positive: 4468, number of negative: 4468
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38759
[LightGBM] [Info] Number of data points in the train set: 8936, number of used fea