### 각 row별 학습-예측-평균

In [2]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme first; the rcParams below are set afterwards
sns.set()

# Global matplotlib defaults, applied in one call.
# 'Malgun Gothic' is a Korean-capable font; 'AppleGothic' is the commented-out
# alternative that was kept next to it in the original cell.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',   # alternative: 'AppleGothic'
    'figure.figsize': (12, 6),
    'font.size': 14,
    'axes.unicode_minus': False,      # disable the Unicode minus glyph on axes
})

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [3]:
# Commented-out settings; none of these names is used in the visible cells.
# data_type = "train"
# month = "07"
# category = "잔액정보"

# Data directory when running locally (relative to the notebook's working directory)
root_path = '../../data'

# Data directory when running on Google Colab (uncomment to switch)
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Commented-out raw-data folder path built from root_path
#drive_folder = f'{root_path}/스탯티즈_Raw_data/'

In [61]:
# 1. Load the data.
# Paths are built from `root_path` so that switching the data directory
# (local vs. Colab) in the configuration cell actually takes effect here.
# train: historical KBO FA pitcher contracts (2013-2024); test: 2026 FA candidates.
train = pd.read_csv(f"{root_path}/KBO FA 투수 [2013-2024]_최종_전처리_ver03.csv")
test = pd.read_csv(f"{root_path}/2026_FA예정_투수.csv")

In [63]:
# 2. Build the target: AAV = total contract value / contract length in years.
train["AAV"] = train["FA 계약 총액"] / train["FA 계약 연수"]
# Keep only rows with a usable target. np.isfinite rejects NaN (missing total
# or years) and also +/-inf, which a contract length of 0 would produce and
# which dropna() alone would let through to model fitting.
train = train[np.isfinite(train["AAV"])]

In [65]:
# 3. Columns excluded from the training features.
drop_cols = [
    '선수명',
    # the two columns AAV is computed from
    'FA 계약 총액',
    'FA 계약 연수',
    '구단명',
    'FA년도',
    'FA등급',
    '비고1',
    '구분',
    '잔류 여부',
    # pitch-type columns
    '구종1',
    '구종2',
    '구종1 구종가치',
]

In [67]:
# 4. One-hot encode the categorical columns.
# train and test are encoded independently with the same column list, so the
# resulting dummy columns can differ between the two frames.
cat_cols = ['투', '세부 포지션']
train, test = (
    pd.get_dummies(frame, columns=cat_cols)
    for frame in (train, test)
)

In [69]:
# 5. Align the train and test feature columns.
# Features are every train column except the drop list and the target;
# errors='ignore' tolerates drop-list names that are not present in train.
X = train.drop(columns=drop_cols + ['AAV'], errors='ignore')
y = train['AAV']

# reindex performs the whole alignment in one step: feature columns missing
# from test are created and filled with 0, columns that are not features are
# discarded, and the column order follows X. This replaces a loop that first
# copied every missing train column (label columns included) into test and
# then subset it.
test = test.reindex(columns=X.columns, fill_value=0)

In [89]:
# 6. Hold-out validation: fit on 80% of the rows, score on the remaining 20%.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

val_preds = model.predict(X_val)
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
r2 = r2_score(y_val, val_preds)

print("🎯 [검증 성능 확인]")
print(f"✅ RMSE: {rmse:,.0f} 원")
print(f"✅ R²: {r2:.4f}")

🎯 [검증 성능 확인]
✅ RMSE: 36,842 원
✅ R²: 0.6620


In [91]:
# 7. Predict AAV for every row of the aligned test feature table.
# NOTE(review): `model` is the estimator fitted on the 80% training split in
# the validation step; it is not refit on all rows before predicting —
# confirm this is intended.
test_preds = model.predict(test)

In [93]:
# 8. Build the result table: one row per test row, player name + predicted AAV.
# The name column is re-read from the original CSV because `test` now holds
# only the model feature columns.
test_result = (
    pd.read_csv("../../data/2026_FA예정_투수.csv")[['선수명']]
    .assign(**{'예측_AAV': test_preds})
)

In [99]:
test_result

Unnamed: 0,선수명,예측_AAV
0,조상우,73560.000000
1,조상우,105725.000000
2,조상우,100376.666667
3,조상우,84963.333333
4,이준영,55575.000000
...,...,...
59,장필준,39643.333333
60,서진용,71671.666667
61,서진용,109445.000000
62,서진용,80438.333333


In [95]:
# 9. (Optional) Average the row-level predictions per player and rank them,
# highest predicted AAV first, with a fresh 0..n-1 index.
avg_result = (
    test_result
    .groupby('선수명', as_index=False)['예측_AAV']
    .mean()
    .sort_values(by='예측_AAV', ascending=False)
    .reset_index(drop=True)
)

In [97]:
avg_result

Unnamed: 0,선수명,예측_AAV
0,양현종,210543.75
1,최원준,109235.833333
2,조상우,91156.25
3,서진용,84576.25
4,홍건희,84163.333333
5,이영하,77515.416667
6,심창민,65224.583333
7,진해수,64770.0
8,김상수,59374.583333
9,김태훈,55904.166667
