In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import r2_score, make_scorer
import lightgbm as lgb
import pandas as pd


# Target data: keep only the return ('ret') and 'year' columns
firm = pd.read_csv('C:/Users/DaBin/Desktop/대학원/2024-2/통계계산특론/PROJECT/data/DATA/firm_500.csv')
y = firm[['ret', 'year']]


# Features and target for training + validation: years 1997 through 2016 (inclusive)

X_data = pd.read_csv('C:/Users/DaBin/Desktop/대학원/2024-2/통계계산특론/PROJECT/data/DATA/PCA10F_UA10E.csv')
# Keep rows inside the year window, then drop the calendar columns so they
# are not used as features.
X_data = X_data.loc[X_data['year'].between(1997, 2016)].drop(columns=['year', 'month'])
# NOTE(review): X_data and y_data come from two different files and are
# filtered independently; this assumes both files list the same rows in the
# same order — confirm.
y_data = y.loc[y['year'].between(1997, 2016), 'ret']


# Hyperparameter grid explored by the search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[4, 8],
    learning_rate=[0.01, 0.1],
    # Maximum number of leaves per tree: intended to be <= 2^(max_depth).
    # NOTE(review): with max_depth=4 that bound is 2^4 = 16, which both 50
    # and 100 exceed — confirm these values are what was intended.
    num_leaves=[50, 100],
)

# Cross-validation splitter: TimeSeriesSplit with 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# Model: LightGBM regressor with a fixed random seed; verbose=-1 is passed
# through to LightGBM (presumably to silence its log output).
model = lgb.LGBMRegressor(verbose=-1, random_state=42)

# Adjusted R-squared helper
def adjusted_r2_score(y_true, y_pred, n, p):
    """Return R-squared adjusted for the number of predictors.

    Computes ``1 - (1 - R2) * (n - 1) / (n - p - 1)``, where ``R2`` is
    ``r2_score(y_true, y_pred)``.

    Args:
        y_true: Observed target values, forwarded to ``r2_score``.
        y_pred: Predicted target values, forwarded to ``r2_score``.
        n: Number of observations used in the adjustment.
        p: Number of predictors used in the adjustment.

    Returns:
        The adjusted R-squared value. No guard is applied for
        ``n - p - 1 == 0``; the division is performed as-is.
    """
    unexplained = 1 - r2_score(y_true, y_pred)
    return 1 - unexplained * (n - 1) / (n - p - 1)


# Custom scorer callable
def adj_r2(estimator, X, y_true):
    """Score a fitted estimator on ``(X, y_true)`` with adjusted R-squared.

    The ``(estimator, X, y)`` signature lets this function be passed directly
    as a ``scoring`` callable. The sample count and the predictor count are
    both taken from ``X.shape``.

    Args:
        estimator: Fitted model exposing ``predict``.
        X: Two-dimensional feature data; ``X.shape`` must unpack to
            ``(n_samples, n_features)``.
        y_true: Observed target values for the rows of ``X``.

    Returns:
        Adjusted R-squared of ``estimator.predict(X)`` against ``y_true``.
    """
    n_samples, n_features = X.shape
    # Delegate to the shared helper so the formula lives in one place.
    return adjusted_r2_score(y_true, estimator.predict(X), n_samples, n_features)

# Scorer object for model selection.
# adj_r2 already has the scorer signature (estimator, X, y), so it is usable
# as a scorer as-is. make_scorer is meant for metric functions with the
# signature (y_true, y_pred); wrapping adj_r2 with it produced an object that
# raised TypeError when called, because adj_r2 received only two arguments.
scorer = adj_r2


# Run the grid search: every combination in param_grid is evaluated on the
# TimeSeriesSplit folds and scored with adjusted R-squared; the best
# combination under the "adj R2" metric is refit afterwards.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring={"adj R2": adj_r2},
    refit="adj R2",
    cv=tscv,
)
grid_search.fit(X_data, y_data)

# Report the best hyperparameters and their score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Best Parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 100, 'num_leaves': 100}
Best Score: 0.18443351961547397
