<a href="https://colab.research.google.com/github/gchaewon/Euron5th_BusTayo/blob/main/modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 서울시 버스 승하차 예측 모델링

서울시 버스 승하차 예측 모델링

- LightGBM 모델을 사용

- 서울시 버스 승하차 인원 데이터 중 train – 22/9 ~ 23/8, test – 23/9 ~ 23/11 기간으로 하여 전처리한 데이터로 모델 학습

- scaler에 따른 정확도 비교도 진행


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')

In [3]:
!pip uninstall lightgbm
!pip install lightgbm==3.3.2

Found existing installation: lightgbm 4.1.0
Uninstalling lightgbm-4.1.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/lightgbm-4.1.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/lightgbm/*
Proceed (Y/n)? Y
  Successfully uninstalled lightgbm-4.1.0
Collecting lightgbm==3.3.2
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2


## 0. 데이터 로딩

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
path = '/content/drive/MyDrive/Euron/프로젝트/'

train = pd.read_csv(path+'train.csv')

print(train.shape)
train.head(3)

(450530, 19)


Unnamed: 0,month,bus_station_SID,bus_station_ID,route_ID,type,6~8_ride,8~10_ride,10~12_ride,12~14_ride,14~16_ride,16~18_ride,18~20_ride,6~8_takeoff,8~10_takeoff,10~12_takeoff,12~14_takeoff,14~16_takeoff,16~18_takeoff,18~20_takeoff
0,202209,100000001,77,123000010.0,0,654,894,809,725,576,563,574,354,559,753,835,769,781,800
1,202209,100000001,67,100100073.0,0,788,1232,1265,1269,1217,1157,1074,325,626,782,745,696,815,982
2,202209,100000002,31,100100549.0,0,227,566,677,419,282,208,193,708,1139,1114,703,504,449,450


In [6]:
test = pd.read_csv(path+'test.csv')

print(test.shape)
test.head(3)

(113146, 19)


Unnamed: 0,month,bus_station_SID,bus_station_ID,route_ID,type,6~8_ride,8~10_ride,10~12_ride,12~14_ride,14~16_ride,16~18_ride,18~20_ride,6~8_takeoff,8~10_takeoff,10~12_takeoff,12~14_takeoff,14~16_takeoff,16~18_takeoff,18~20_takeoff
0,202309,100000001,66,100100073.0,0,699,1160,1292,1196,1191,1130,988,349,629,716,711,788,904,882
1,202309,100000001,75,123000010.0,0,524,789,838,741,588,567,627,358,637,816,848,825,838,868
2,202309,100000002,31,100100549.0,0,258,439,546,434,313,272,256,681,998,977,693,464,355,366


## 1. 모델 학습에 필요한 함수 정의

In [7]:
import lightgbm as lgb
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [8]:
# 스케일링 함수 정의
def scaler(type, train, test):
    if type == 'standard':
        scaler = StandardScaler()
    elif type == 'minmax':
        scaler = MinMaxScaler()
    elif type == 'robust':
        scaler = RobustScaler()

    # 입력 받은 타입에 맞는 스케일러로 스케일링
    train_scaled = pd.DataFrame(data=scaler.fit_transform(train), columns=train.columns)
    test_scaled = pd.DataFrame(data=scaler.fit_transform(test), columns=test.columns)
    train_scaled = train_scaled.astype('float')
    test_scaled = test_scaled.astype('float')

    # 타겟값 분리
    X_train = train_scaled.drop(columns=['18~20_ride'])
    y_train = train_scaled['18~20_ride']
    X_test = test_scaled.drop(columns=['18~20_ride'])
    y_test = test_scaled['18~20_ride']

    return X_train, X_test, y_train, y_test

In [9]:
# 모델 학습 함수 정의
def lgbm_modeling(X_train, X_test, y_train, y_test, type):
    # 학습 데이터를 학습 및 검증 데이터로 나누기
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    lgbm = lgb.LGBMRegressor(num_iterations = 1000,
                              learning_rate = 0.1,
                              metric='l2',
                              force_col_wise=True,
                              n_jobs=-1,
                             verbosity=0)
    # 검증 데이터를 eval_set에 추가
    eval_set = [(X_valid, y_valid)]
    # tqdm을 사용하여 학습 진행률 표시
    for i in tqdm(range(0, len(X_train), 10000)):
        X_batch = X_train[i:i+10000]
        y_batch = y_train[i:i+10000]

        lgbm.fit(X_batch, y_batch, eval_set=eval_set, eval_metric='mse', early_stopping_rounds=10, verbose=False)

        # 각 회차의 MSE 계산
        if i % 10000 == 0:  # 매 10000회차마다 MSE 출력
            y_pred = lgbm.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            print(f"Iteration {i}: Test MSE = {mse:.4f}")

    # 테스트 데이터로 예측 수행
    y_pred = lgbm.predict(X_test)

    # 최종 MSE, R2 score 출력
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'LGBM - {type} scaling MSE: {mse:.4f}, R2 Score: {r2:.4f}')

    return lgbm

In [10]:
def linear_modeling(X_train, X_test, y_train, y_test, type):
    # 학습 데이터를 학습 및 검증 데이터로 나눔
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # 선형 회귀 모델 생성 및 학습
    linear_rg = LinearRegression()
    linear_rg.fit(X_train, y_train)

    # 검증 데이터로 예측 수행
    y_pred = linear_rg.predict(X_valid)

    # 검증 데이터로 성능 평가
    mse_valid = mean_squared_error(y_valid, y_pred)
    r2_valid = r2_score(y_valid, y_pred)
    print(f'Linear Regression - {type} scaling Validation MSE: {mse_valid:.4f}, R2 Score: {r2_valid:.4f}')

    # 테스트 데이터로 예측 수행
    y_pred_test = linear_rg.predict(X_test)

    # 최종 MSE, R2 score 출력
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)
    print(f'Linear Regression - {type} scaling Test MSE: {mse_test:.4f}, R2 Score: {r2_test:.4f}')

    return linear_rg

## 2. 스케일링 후 lgbm 모델 학습

### 1) standard scaler

In [None]:
X_train, X_test, y_train, y_test = scaler('standard', train, test)
stand_lgbm = lgbm_modeling(X_train, X_test, y_train, y_test, 'standard')

  3%|▎         | 1/37 [00:00<00:34,  1.06it/s]

Iteration 0: Test MSE = 0.0717


  5%|▌         | 2/37 [00:02<00:52,  1.51s/it]

Iteration 10000: Test MSE = 0.0663


  8%|▊         | 3/37 [00:03<00:44,  1.30s/it]

Iteration 20000: Test MSE = 0.0683


 11%|█         | 4/37 [00:07<01:11,  2.17s/it]

Iteration 30000: Test MSE = 0.0722


 14%|█▎        | 5/37 [00:19<03:09,  5.93s/it]

Iteration 40000: Test MSE = 0.0737


 16%|█▌        | 6/37 [00:20<02:08,  4.15s/it]

Iteration 50000: Test MSE = 0.0702


 19%|█▉        | 7/37 [00:21<01:35,  3.19s/it]

Iteration 60000: Test MSE = 0.0659


 22%|██▏       | 8/37 [00:23<01:17,  2.67s/it]

Iteration 70000: Test MSE = 0.0657


 24%|██▍       | 9/37 [00:28<01:31,  3.28s/it]

Iteration 80000: Test MSE = 0.0644


 27%|██▋       | 10/37 [00:29<01:16,  2.83s/it]

Iteration 90000: Test MSE = 0.0709


 30%|██▉       | 11/37 [00:31<01:06,  2.56s/it]

Iteration 100000: Test MSE = 0.0663


 32%|███▏      | 12/37 [00:34<01:06,  2.67s/it]

Iteration 110000: Test MSE = 0.0632


 35%|███▌      | 13/37 [00:36<00:57,  2.40s/it]

Iteration 120000: Test MSE = 0.0640


 38%|███▊      | 14/37 [00:38<00:53,  2.34s/it]

Iteration 130000: Test MSE = 0.0656


 41%|████      | 15/37 [00:41<00:56,  2.58s/it]

Iteration 140000: Test MSE = 0.0707


 43%|████▎     | 16/37 [00:43<00:49,  2.37s/it]

Iteration 150000: Test MSE = 0.0659


 46%|████▌     | 17/37 [00:44<00:38,  1.95s/it]

Iteration 160000: Test MSE = 0.0720


 49%|████▊     | 18/37 [00:46<00:33,  1.76s/it]

Iteration 170000: Test MSE = 0.0653


 51%|█████▏    | 19/37 [00:49<00:42,  2.36s/it]

Iteration 180000: Test MSE = 0.0680


 54%|█████▍    | 20/37 [00:52<00:43,  2.56s/it]

Iteration 190000: Test MSE = 0.0636


 57%|█████▋    | 21/37 [00:56<00:48,  3.03s/it]

Iteration 200000: Test MSE = 0.0631


 59%|█████▉    | 22/37 [00:59<00:44,  2.95s/it]

Iteration 210000: Test MSE = 0.0640


 62%|██████▏   | 23/37 [01:01<00:36,  2.59s/it]

Iteration 220000: Test MSE = 0.0654


 65%|██████▍   | 24/37 [01:02<00:26,  2.06s/it]

Iteration 230000: Test MSE = 0.0606


 68%|██████▊   | 25/37 [01:05<00:28,  2.40s/it]

Iteration 240000: Test MSE = 0.0610


 70%|███████   | 26/37 [01:09<00:31,  2.89s/it]

Iteration 250000: Test MSE = 0.0608


 73%|███████▎  | 27/37 [01:10<00:24,  2.42s/it]

Iteration 260000: Test MSE = 0.0678


 76%|███████▌  | 28/37 [01:12<00:19,  2.14s/it]

Iteration 270000: Test MSE = 0.0637


 78%|███████▊  | 29/37 [01:13<00:15,  1.94s/it]

Iteration 280000: Test MSE = 0.0655


 81%|████████  | 30/37 [01:15<00:13,  1.87s/it]

Iteration 290000: Test MSE = 0.0624


 84%|████████▍ | 31/37 [01:16<00:09,  1.57s/it]

Iteration 300000: Test MSE = 0.0709


 86%|████████▋ | 32/37 [01:18<00:09,  1.82s/it]

Iteration 310000: Test MSE = 0.0730


 89%|████████▉ | 33/37 [01:22<00:09,  2.29s/it]

Iteration 320000: Test MSE = 0.0657


 92%|█████████▏| 34/37 [01:25<00:07,  2.49s/it]

Iteration 330000: Test MSE = 0.0790


 95%|█████████▍| 35/37 [01:26<00:04,  2.28s/it]

Iteration 340000: Test MSE = 0.0673


 97%|█████████▋| 36/37 [01:28<00:02,  2.05s/it]

Iteration 350000: Test MSE = 0.0670


100%|██████████| 37/37 [01:30<00:00,  2.44s/it]

Iteration 360000: Test MSE = 0.1870
LGBM - standard scaling MSE: 0.1870, R2 Score: 0.8130





### 2) Min-Max scaler

In [None]:
X_train, X_test, y_train, y_test = scaler('minmax', train, test)
mm_lgbm = lgbm_modeling(X_train, X_test, y_train, y_test, 'minmax')

  3%|▎         | 1/37 [00:01<00:41,  1.16s/it]

Iteration 0: Test MSE = 0.0000


  5%|▌         | 2/37 [00:02<00:40,  1.15s/it]

Iteration 10000: Test MSE = 0.0000


  8%|▊         | 3/37 [00:03<00:35,  1.05s/it]

Iteration 20000: Test MSE = 0.0000


 11%|█         | 4/37 [00:04<00:35,  1.08s/it]

Iteration 30000: Test MSE = 0.0000


 14%|█▎        | 5/37 [00:05<00:37,  1.17s/it]

Iteration 40000: Test MSE = 0.0000


 16%|█▌        | 6/37 [00:06<00:32,  1.04s/it]

Iteration 50000: Test MSE = 0.0000


 19%|█▉        | 7/37 [00:07<00:26,  1.11it/s]

Iteration 60000: Test MSE = 0.0000


 22%|██▏       | 8/37 [00:08<00:33,  1.17s/it]

Iteration 70000: Test MSE = 0.0000


 24%|██▍       | 9/37 [00:14<01:07,  2.42s/it]

Iteration 80000: Test MSE = 0.0000


 27%|██▋       | 10/37 [00:15<01:00,  2.24s/it]

Iteration 90000: Test MSE = 0.0000


 30%|██▉       | 11/37 [00:16<00:49,  1.90s/it]

Iteration 100000: Test MSE = 0.0000


 32%|███▏      | 12/37 [00:19<00:49,  1.97s/it]

Iteration 110000: Test MSE = 0.0000


 35%|███▌      | 13/37 [00:24<01:11,  2.98s/it]

Iteration 120000: Test MSE = 0.0000


 38%|███▊      | 14/37 [00:25<00:57,  2.48s/it]

Iteration 130000: Test MSE = 0.0000


 41%|████      | 15/37 [00:28<00:53,  2.43s/it]

Iteration 140000: Test MSE = 0.0000


 43%|████▎     | 16/37 [00:28<00:40,  1.92s/it]

Iteration 150000: Test MSE = 0.0000


 46%|████▌     | 17/37 [00:29<00:31,  1.56s/it]

Iteration 160000: Test MSE = 0.0001


 49%|████▊     | 18/37 [00:30<00:26,  1.38s/it]

Iteration 170000: Test MSE = 0.0000


 51%|█████▏    | 19/37 [00:33<00:34,  1.89s/it]

Iteration 180000: Test MSE = 0.0000


 54%|█████▍    | 20/37 [00:38<00:47,  2.78s/it]

Iteration 190000: Test MSE = 0.0000


 57%|█████▋    | 21/37 [00:40<00:42,  2.66s/it]

Iteration 200000: Test MSE = 0.0000


 59%|█████▉    | 22/37 [00:42<00:37,  2.47s/it]

Iteration 210000: Test MSE = 0.0000


 62%|██████▏   | 23/37 [00:45<00:33,  2.42s/it]

Iteration 220000: Test MSE = 0.0000


 65%|██████▍   | 24/37 [00:47<00:30,  2.31s/it]

Iteration 230000: Test MSE = 0.0000


 68%|██████▊   | 25/37 [00:52<00:37,  3.12s/it]

Iteration 240000: Test MSE = 0.0000


 70%|███████   | 26/37 [00:55<00:35,  3.22s/it]

Iteration 250000: Test MSE = 0.0000


 73%|███████▎  | 27/37 [00:56<00:25,  2.55s/it]

Iteration 260000: Test MSE = 0.0000


 76%|███████▌  | 28/37 [01:00<00:25,  2.82s/it]

Iteration 270000: Test MSE = 0.0000


 78%|███████▊  | 29/37 [01:00<00:17,  2.13s/it]

Iteration 280000: Test MSE = 0.0000


 81%|████████  | 30/37 [01:01<00:12,  1.83s/it]

Iteration 290000: Test MSE = 0.0000


 84%|████████▍ | 31/37 [01:03<00:10,  1.67s/it]

Iteration 300000: Test MSE = 0.0000


 86%|████████▋ | 32/37 [01:04<00:07,  1.59s/it]

Iteration 310000: Test MSE = 0.0000


 89%|████████▉ | 33/37 [01:06<00:07,  1.79s/it]

Iteration 320000: Test MSE = 0.0000


 92%|█████████▏| 34/37 [01:15<00:11,  3.89s/it]

Iteration 330000: Test MSE = 0.0000


 95%|█████████▍| 35/37 [01:16<00:06,  3.10s/it]

Iteration 340000: Test MSE = 0.0000


 97%|█████████▋| 36/37 [01:19<00:02,  3.00s/it]

Iteration 350000: Test MSE = 0.0000


100%|██████████| 37/37 [01:22<00:00,  2.22s/it]

Iteration 360000: Test MSE = 0.0001
LGBM - minmax scaling MSE: 0.0001, R2 Score: 0.7869





### 3) Robust scaler

In [None]:
X_train, X_test, y_train, y_test = scaler('robust', train, test)
robust_lgbm = lgbm_modeling(X_train, X_test, y_train, y_test, 'robust')

  3%|▎         | 1/37 [00:00<00:25,  1.40it/s]

Iteration 0: Test MSE = 0.0900


  5%|▌         | 2/37 [00:04<01:24,  2.40s/it]

Iteration 10000: Test MSE = 0.0819


  8%|▊         | 3/37 [00:05<01:09,  2.05s/it]

Iteration 20000: Test MSE = 0.0848


 11%|█         | 4/37 [00:07<00:59,  1.80s/it]

Iteration 30000: Test MSE = 0.0939


 14%|█▎        | 5/37 [00:08<00:55,  1.72s/it]

Iteration 40000: Test MSE = 0.0954


 16%|█▌        | 6/37 [00:11<01:02,  2.00s/it]

Iteration 50000: Test MSE = 0.0898


 19%|█▉        | 7/37 [00:12<00:49,  1.66s/it]

Iteration 60000: Test MSE = 0.0863


 22%|██▏       | 8/37 [00:14<00:47,  1.64s/it]

Iteration 70000: Test MSE = 0.0840


 24%|██▍       | 9/37 [00:18<01:12,  2.59s/it]

Iteration 80000: Test MSE = 0.0819


 27%|██▋       | 10/37 [00:20<01:03,  2.34s/it]

Iteration 90000: Test MSE = 0.0918


 30%|██▉       | 11/37 [00:23<01:04,  2.48s/it]

Iteration 100000: Test MSE = 0.0825


 32%|███▏      | 12/37 [00:25<00:57,  2.29s/it]

Iteration 110000: Test MSE = 0.0843


 35%|███▌      | 13/37 [00:27<00:53,  2.22s/it]

Iteration 120000: Test MSE = 0.0824


 38%|███▊      | 14/37 [00:28<00:46,  2.02s/it]

Iteration 130000: Test MSE = 0.0857


 41%|████      | 15/37 [00:31<00:46,  2.13s/it]

Iteration 140000: Test MSE = 0.0896


 43%|████▎     | 16/37 [00:32<00:40,  1.91s/it]

Iteration 150000: Test MSE = 0.0844


 46%|████▌     | 17/37 [00:34<00:41,  2.07s/it]

Iteration 160000: Test MSE = 0.0923


 49%|████▊     | 18/37 [00:35<00:32,  1.71s/it]

Iteration 170000: Test MSE = 0.0846


 51%|█████▏    | 19/37 [00:39<00:42,  2.35s/it]

Iteration 180000: Test MSE = 0.0890


 54%|█████▍    | 20/37 [00:42<00:44,  2.61s/it]

Iteration 190000: Test MSE = 0.0767


 57%|█████▋    | 21/37 [00:44<00:36,  2.30s/it]

Iteration 200000: Test MSE = 0.0841


 59%|█████▉    | 22/37 [00:57<01:22,  5.47s/it]

Iteration 210000: Test MSE = 0.0802


 62%|██████▏   | 23/37 [01:00<01:07,  4.80s/it]

Iteration 220000: Test MSE = 0.0840


 65%|██████▍   | 24/37 [01:01<00:48,  3.74s/it]

Iteration 230000: Test MSE = 0.0791


 68%|██████▊   | 25/37 [01:07<00:53,  4.43s/it]

Iteration 240000: Test MSE = 0.0805


 70%|███████   | 26/37 [01:15<01:00,  5.49s/it]

Iteration 250000: Test MSE = 0.0758


 73%|███████▎  | 27/37 [01:16<00:41,  4.16s/it]

Iteration 260000: Test MSE = 0.0879


 76%|███████▌  | 28/37 [01:18<00:29,  3.33s/it]

Iteration 270000: Test MSE = 0.0796


 78%|███████▊  | 29/37 [01:18<00:20,  2.51s/it]

Iteration 280000: Test MSE = 0.0881


 81%|████████  | 30/37 [01:23<00:22,  3.16s/it]

Iteration 290000: Test MSE = 0.0761


 84%|████████▍ | 31/37 [01:24<00:14,  2.42s/it]

Iteration 300000: Test MSE = 0.0924


 86%|████████▋ | 32/37 [01:26<00:11,  2.21s/it]

Iteration 310000: Test MSE = 0.0946


 89%|████████▉ | 33/37 [01:31<00:12,  3.23s/it]

Iteration 320000: Test MSE = 0.0827


 92%|█████████▏| 34/37 [01:38<00:12,  4.18s/it]

Iteration 330000: Test MSE = 0.1003


 95%|█████████▍| 35/37 [01:39<00:06,  3.38s/it]

Iteration 340000: Test MSE = 0.0877


 97%|█████████▋| 36/37 [01:40<00:02,  2.61s/it]

Iteration 350000: Test MSE = 0.0868


100%|██████████| 37/37 [01:40<00:00,  2.73s/it]

Iteration 360000: Test MSE = 0.2472





LGBM - robust scaling MSE: 0.2472, R2 Score: 0.8090


## 3. linear regression 모델 학습

###1) Standard scaler

In [None]:
X_train, X_test, y_train, y_test = scaler('standard', train, test)
stand_linear = linear_modeling(X_train, X_test, y_train, y_test, 'standard')

Linear Regression - standard scaling Validation MSE: 0.0161, R2 Score: 0.9841
Linear Regression - standard scaling Test MSE: 0.0160, R2 Score: 0.9840


###2) MinMax scaler

In [None]:
X_train, X_test, y_train, y_test = scaler('minmax', train, test)
mm_linear = linear_modeling(X_train, X_test, y_train, y_test, 'minmax')

Linear Regression - minmax scaling Validation MSE: 0.0000, R2 Score: 0.9841
Linear Regression - minmax scaling Test MSE: 0.0000, R2 Score: 0.9772


###3) Robust scaler

In [None]:
X_train, X_test, y_train, y_test = scaler('robust', train, test)
robust_linear = linear_modeling(X_train, X_test, y_train, y_test, 'robust')

Linear Regression - robust scaling Validation MSE: 0.0202, R2 Score: 0.9841
Linear Regression - robust scaling Test MSE: 0.0207, R2 Score: 0.9840


## 4. lgbm 하이퍼 파라미터 튜닝

In [11]:
def lgbm_tuning(X_train, X_test, y_train, y_test, type):
    # 학습 데이터를 학습 및 검증 데이터로 나누기
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # 그리드 서치 대상 파라미터 설정
    param_grid = {
        'num_iterations': [1000],
        'learning_rate': [0.1, 0.5],
        'max_depth': [3, 5, 7],
        'metric': ['l2'],
        'force_col_wise': [True],
        'n_jobs': [-1],
        'verbosity': [-1]
    }

    # LightGBM Regressor 생성
    lgbm = lgb.LGBMRegressor()

    # GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
    grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # 최적의 모델을 찾은 후 학습 데이터에 대해 다시 학습
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='mse', early_stopping_rounds=10, verbose=False)

    # 테스트 데이터로 예측 수행
    y_pred = best_model.predict(X_test)

    # 최종 MSE, R2 score 출력
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'LGBM - {type} scaling MSE: {mse:.4f}, R2 Score: {r2:.4f}')

### 1) standard scaler




In [14]:
X_train, X_test, y_train, y_test = scaler('standard', train, test)
stand_lgbm = lgbm_tuning(X_train, X_test, y_train, y_test, 'standard')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
LGBM - standard scaling MSE: 0.0250, R2 Score: 0.9750


### 2) Min-Max scaler

In [15]:
X_train, X_test, y_train, y_test = scaler('minmax', train, test)
mm_lgbm = lgbm_tuning(X_train, X_test, y_train, y_test, 'minmax')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
LGBM - minmax scaling MSE: 0.0000, R2 Score: 0.9637


### 3) Robust scaler

In [16]:
X_train, X_test, y_train, y_test = scaler('robust', train, test)
robust_lgbm = lgbm_tuning(X_train, X_test, y_train, y_test, 'robust')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
LGBM - robust scaling MSE: 0.0239, R2 Score: 0.9815


## 5. linear regression 하이퍼 파라미터 튜닝

In [17]:
def linear_tuning(X_train, X_test, y_train, y_test, type):
    # 학습 데이터를 학습 및 검증 데이터로 나눔
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Lasso 회귀 모델 생성
    lasso = Lasso()

    # 그리드 서치 대상 파라미터 설정
    param_grid = {
        'alpha': [0.1, 0.5, 1.0],
        'max_iter': [1000, 2000],
        'tol': [1e-3, 1e-4]
    }

    # GridSearchCV를 사용하여 최적의 하이퍼파라미터 찾기
    grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # 최적의 모델을 찾은 후 학습 데이터에 대해 다시 학습
    best_lasso = grid_search.best_estimator_
    best_lasso.fit(X_train, y_train)

    # 검증 데이터로 예측 수행
    y_pred_valid = best_lasso.predict(X_valid)

    # 검증 데이터로 성능 평가
    mse_valid = mean_squared_error(y_valid, y_pred_valid)
    r2_valid = r2_score(y_valid, y_pred_valid)
    print(f'Lasso Regression - {type} scaling Validation MSE: {mse_valid:.4f}, R2 Score: {r2_valid:.4f}')

    # 테스트 데이터로 예측 수행
    y_pred_test = best_lasso.predict(X_test)

    # 최종 MSE, R2 score 출력
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)
    print(f'Lasso Regression - {type} scaling Test MSE: {mse_test:.4f}, R2 Score: {r2_test:.4f}')


###1) Standard scaler

In [18]:
X_train, X_test, y_train, y_test = scaler('standard', train, test)
stand_linear = linear_tuning(X_train, X_test, y_train, y_test, 'standard')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Lasso Regression - standard scaling Validation MSE: 0.0354, R2 Score: 0.9650
Lasso Regression - standard scaling Test MSE: 0.0352, R2 Score: 0.9648


###2) MinMax scaler

In [19]:
X_train, X_test, y_train, y_test = scaler('minmax', train, test)
mm_linear = linear_tuning(X_train, X_test, y_train, y_test, 'minmax')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Lasso Regression - minmax scaling Validation MSE: 0.0005, R2 Score: -0.0000
Lasso Regression - minmax scaling Test MSE: 0.0006, R2 Score: -0.0001


###3) Robust scaler

In [20]:
X_train, X_test, y_train, y_test = scaler('robust', train, test)
robust_linear = linear_tuning(X_train, X_test, y_train, y_test, 'robust')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Lasso Regression - robust scaling Validation MSE: 0.0397, R2 Score: 0.9687
Lasso Regression - robust scaling Test MSE: 0.0397, R2 Score: 0.9693
