In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

In [27]:
df= pd.read_csv('투수_머신러닝용_데이터.csv')
df_pred = pd.read_csv('2026_FA예정_투수.csv')

In [3]:
# 학습이 완료된 모델을 저장할 파일 이름
best_model_path = 'model/regression_model_pitcher.dat'
# 교차검증 횟수
cv_count = 10
# 교차 검증
kfold = KFold(n_splits=cv_count, shuffle=True, random_state=42)

In [4]:
num_cols = ['이닝수', 'ERA', '종합 WAR', '탈삼진', 'FIP', '피장타율', '사사구', '투구수']
cat_cols = ['세부 포지션']

In [5]:
X = df[num_cols + cat_cols]
y = df['AAV']
y_log = np.log1p(y)  # log(1 + y)

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import numpy as np
from catboost import CatBoostRegressor
from sklearn.pipeline import make_pipeline

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], remainder='passthrough')

# 학습

### LightGBM 

In [16]:
# 모델 정의
model_lgbm = make_pipeline(
    preprocessor,
    LGBMRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
)

# 로그 스케일에서 교차 검증
r_lgbm = cross_val_score(model_lgbm, X, y_log, scoring='neg_mean_squared_error', cv=kfold)
rmse_lgbm = np.sqrt(-r_lgbm)
print("LightGBM 평균 RMSE (로그 스케일):", rmse_lgbm.mean())

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 592
[LightGBM] [Info] Number of data points in the train set: 252, number of used features: 11
[LightGBM] [Info] Start training from score 11.091419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 594
[LightGBM] [Info] Number of data points in the train set: 253, number of used features: 11
[LightGBM] [Info] Start training from score 11.157156
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 590
[LightGBM] [Info] Number of data points in the train set: 253, number of used features: 11
[LightGBM] [Info] Start training f

In [18]:
# 전체 데이터로 재학습 후 복원 평가
model_lgbm.fit(X, y_log)
y_pred_log_lgbm = model_lgbm.predict(X)
y_pred_lgbm = np.expm1(y_pred_log_lgbm)

rmse_lgbm_real = mean_squared_error(y, y_pred_lgbm, squared=False)
mape_lgbm = mean_absolute_percentage_error(y, y_pred_lgbm)

print("LightGBM RMSE (원래 스케일):", rmse_lgbm_real)
print("LightGBM MAPE:", mape_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 648
[LightGBM] [Info] Number of data points in the train set: 281, number of used features: 11
[LightGBM] [Info] Start training from score 11.124444
LightGBM RMSE (원래 스케일): 31437.680675372187
LightGBM MAPE: 0.2875052759087836


### XGBoost

In [14]:
# 2. 모델 정의
model_xgb = make_pipeline(preprocessor, XGBRegressor(
    n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42
))

# 3. 교차검증 → 로그 스케일 RMSE
r1 = cross_val_score(model_xgb, X, y_log, scoring='neg_mean_squared_error', cv=kfold)
rmse_log = np.sqrt(-r1)
print("XGBoost 평균 RMSE (로그 스케일):", rmse_log.mean())

XGBoost 평균 RMSE (로그 스케일): 0.6192098200247101


In [15]:
model_xgb.fit(X, y_log)               # 전체 데이터로 다시 학습
y_pred_log = model_xgb.predict(X)     # 로그 스케일 예측
y_pred = np.expm1(y_pred_log)         # 원래 스케일 복원

# 5. 원래 타깃도 복원 필요 없음 → y 그대로 사용
rmse = mean_squared_error(y, y_pred, squared=False)
mape = mean_absolute_percentage_error(y, y_pred)

print("XGBoost RMSE (원래 스케일):", rmse)
print("XGBoost MAPE:", mape)

XGBoost RMSE (원래 스케일): 18060.71574107283
XGBoost MAPE: 0.14058696854548905


### CatBoost

In [19]:
model_cat = make_pipeline(preprocessor,CatBoostRegressor(iterations=100, depth=4, learning_rate=0.1,random_state=42, verbose=0))


# 로그 스케일에서 교차 검증
r_cat = cross_val_score(model_cat, X, y_log, scoring='neg_mean_squared_error', cv=kfold)
rmse_cat = np.sqrt(-r_cat)
print("CatBoost 평균 RMSE (로그 스케일):", rmse_cat.mean())

CatBoost 평균 RMSE (로그 스케일): 0.5730500059385283


In [21]:
# 전체 데이터로 재학습 후 복원 평가
model_cat.fit(X, y_log)
y_pred_log_cat = model_cat.predict(X)
y_pred_cat = np.expm1(y_pred_log_cat)

rmse_cat_real = mean_squared_error(y, y_pred_cat, squared=False)
mape_cat = mean_absolute_percentage_error(y, y_pred_cat)

print("CatBoost RMSE (원래 스케일):", rmse_cat_real)
print("CatBoost MAPE:", mape_cat)

CatBoost RMSE (원래 스케일): 36512.815162593906
CatBoost MAPE: 0.3436321448871371


### RandomForest

In [25]:
model_rf = make_pipeline(preprocessor,RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42))

# 로그 스케일에서 교차 검증
r_rf = cross_val_score(model_rf, X, y_log, scoring='neg_mean_squared_error', cv=kfold)
rmse_rf = np.sqrt(-r_rf)
print("RandomForest 평균 RMSE (로그 스케일):", rmse_rf.mean())                      

RandomForest 평균 RMSE (로그 스케일): 0.5808866684399308


In [26]:
# 전체 데이터로 재학습 후 복원 평가
model_rf.fit(X, y_log)
y_pred_log_rf = model_rf.predict(X)
y_pred_rf = np.expm1(y_pred_log_rf)

rmse_rf_real = mean_squared_error(y, y_pred_rf, squared=False)
mape_rf = mean_absolute_percentage_error(y, y_pred_rf)

print("RandomForest RMSE (원래 스케일):", rmse_rf_real)
print("RandomForest MAPE:", mape_rf)

RandomForest RMSE (원래 스케일): 35103.88118200509
RandomForest MAPE: 0.33406329455888584


# 예측

### LightGBM

In [46]:
X_new = df_pred[num_cols + cat_cols]

y_new_pred_lgbm = model_lgbm.predict(X_new)
y_new_pred = np.expm1(y_new_pred_lgbm)


df_pred["예측_AAV_lgbm"] = y_new_pred

In [49]:
player_mean_pred = df_pred.groupby("선수명")["예측_AAV_lgbm"].mean().reset_index()

# 결과 출력
print(player_mean_pred.round())

    선수명  예측_AAV_lgbm
0   김범수      47155.0
1   김상수      41684.0
2   김태훈      39176.0
3   서진용      46909.0
4   심창민      42692.0
5   양현종     178116.0
6   이승현      53267.0
7   이영하      48727.0
8   이준영      40335.0
9   장필준      33610.0
10  조상우      65681.0
11  진해수      37364.0
12  최동환      40682.0
13  최성훈      38458.0
14  최원준      73851.0
15  홍건희      54872.0


### XGBoost

In [45]:
y_new_pred_log = model_xgb.predict(X_new)
y_new_pred = np.expm1(y_new_pred_log)

df_pred["예측_AAV_xgb"] = y_new_pred

In [50]:
player_mean_pred = df_pred.groupby("선수명")["예측_AAV_xgb"].mean().reset_index()

# 결과 출력
print(player_mean_pred.round())

    선수명  예측_AAV_xgb
0   김범수     46682.0
1   김상수     37754.0
2   김태훈     37886.0
3   서진용     40144.0
4   심창민     37785.0
5   양현종    169518.0
6   이승현     47612.0
7   이영하     48897.0
8   이준영     37802.0
9   장필준     35833.0
10  조상우     56644.0
11  진해수     39517.0
12  최동환     33659.0
13  최성훈     40099.0
14  최원준     70825.0
15  홍건희     60674.0


### CatBoost

In [44]:
y_new_pred_cat = model_cat.predict(X_new)
y_new_pred = np.expm1(y_new_pred_cat)

df_pred["예측_AAV_cat"] = y_new_pred

In [51]:
player_mean_pred = df_pred.groupby("선수명")["예측_AAV_cat"].mean().reset_index()

# 결과 출력
print(player_mean_pred.round())

    선수명  예측_AAV_cat
0   김범수     46348.0
1   김상수     42171.0
2   김태훈     38385.0
3   서진용     53493.0
4   심창민     37711.0
5   양현종    160546.0
6   이승현     41514.0
7   이영하     47435.0
8   이준영     41742.0
9   장필준     36306.0
10  조상우     62983.0
11  진해수     43375.0
12  최동환     40226.0
13  최성훈     39887.0
14  최원준     67454.0
15  홍건희     62330.0


### RandomForest

In [47]:
y_new_pred_rf = model_rf.predict(X_new)
y_new_pred = np.expm1(y_new_pred_rf)


df_pred["예측_AAV_rf"] = y_new_pred

In [52]:
player_mean_pred = df_pred.groupby("선수명")["예측_AAV_rf"].mean().reset_index()

# 결과 출력
print(player_mean_pred.round())

    선수명  예측_AAV_rf
0   김범수    48128.0
1   김상수    45115.0
2   김태훈    41490.0
3   서진용    53382.0
4   심창민    38958.0
5   양현종   155879.0
6   이승현    45931.0
7   이영하    50622.0
8   이준영    44303.0
9   장필준    34385.0
10  조상우    59823.0
11  진해수    41662.0
12  최동환    39724.0
13  최성훈    40038.0
14  최원준    68965.0
15  홍건희    63584.0
