In [16]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


In [3]:
economic = pd.read_csv('경제변수.csv')
economic = economic.drop(['종가_x.2', 'us_tips_20', 'us_tips_60'], axis=1) # NaN 이 너무 많은 열(NaN값 500개 이상)
economic = economic.dropna()
economic

Unnamed: 0,Date,kospi,kospi_20,kospi_60,Close_x,wy_20,wy_60,Close_y,wd_20,wd_60,...,wti_60,종가_x.1,us_10_20,us_10_60,종가_y.1,us_2_20,us_2_60,종가_y.2,vix_20,vix_60
0,2016-06-14,197203.0,0.001753,-0.000081,9.061600,-0.000442,-0.001029,1170.150024,-0.001150,0.000457,...,0.005320,1.613,-0.004457,-0.000616,0.718,-0.006595,0.001283,20.50,0.002955,0.001110
1,2016-06-15,196883.0,0.001652,-0.000102,9.027700,-0.000712,-0.001053,1173.390015,-0.001227,0.000470,...,0.005304,1.613,-0.006240,-0.000717,0.726,-0.010132,0.001200,20.14,0.007898,0.001691
2,2016-06-16,195199.0,0.001525,-0.000114,9.055300,-0.000982,-0.001071,1166.699951,-0.001299,0.000480,...,0.005324,1.575,-0.007785,-0.000774,0.670,-0.012629,0.001221,19.37,0.012717,0.002175
3,2016-06-17,195340.0,0.001186,-0.000136,8.932600,-0.001180,-0.001095,1169.579956,-0.001271,0.000497,...,0.005253,1.579,-0.008771,-0.000831,0.689,-0.014734,0.001174,19.41,0.016699,0.002589
4,2016-06-20,198112.0,0.000781,-0.000158,8.992600,-0.001559,-0.001133,1168.560059,-0.001201,0.000517,...,0.005096,1.611,-0.009600,-0.000934,0.697,-0.016419,0.001102,18.37,0.020707,0.003074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1862,2024-01-08,256782.0,0.002743,0.001750,0.110490,-0.000634,-0.000502,1313.130005,-0.000136,-0.000752,...,-0.003622,4.051,-0.003349,-0.004575,4.393,-0.004490,-0.003431,13.08,0.005104,-0.008708
1863,2024-01-09,256124.0,0.002242,0.001784,0.109668,-0.000413,-0.000544,1311.800049,0.000251,-0.000700,...,-0.003464,4.027,-0.001917,-0.004607,4.375,-0.003403,-0.003454,12.76,0.004671,-0.008434
1864,2024-01-10,254198.0,0.001644,0.001832,0.109291,-0.000245,-0.000593,1319.790039,0.000592,-0.000654,...,-0.003385,4.015,-0.000465,-0.004611,4.366,-0.002301,-0.003461,12.69,0.004154,-0.008398
1865,2024-01-11,254027.0,0.001059,0.001849,0.110438,-0.000184,-0.000654,1317.500000,0.000981,-0.000590,...,-0.003261,4.030,0.000922,-0.004559,4.360,-0.001014,-0.003423,12.44,0.002855,-0.008280


In [4]:
# ETF 수익률 데이터 불러오기
etf_return = pd.read_csv('etf_return.csv')
etf_return.columns = etf_return.iloc[-1] # 맨 마지막 행이 etf명으로 되어있어서 그걸 열이름으로 지정
etf_return = etf_return[:-1] # 열이름으로 지정했으니 맨 마지막 행 삭제
etf_return = etf_return.rename(columns={'name': 'Date'}) # 날짜 열이름이 name으로 바뀌어서 열이름을 Date로 바꿔줌

# Date 열을 datetime 형식으로 변환
etf_return['Date'] = pd.to_datetime(etf_return['Date'], format='%Y-%m-%d %H:%M:%S')
# 날짜를 YYYY-MM-DD 형식으로 다시 저장
etf_return['Date'] = etf_return['Date'].dt.strftime('%Y-%m-%d')
etf_return

1846,Date,타이거로우볼,타이거모멘텀,타이거우량가치,타이거코스피인버스,코덱스코스피중소형주,코덱스밸류,코덱스퀄리티,코덱스배당성장,타이거고배당
0,2016-07-13,-0.01639878838012381,-0.02194083396173198,-0.005024442560878135,-0.0056249445194360885,-0.02446839072780061,-0.010779838487866282,-0.016414579511102298,-0.009413137057923291,-0.012211583976404079
1,2016-07-14,-0.009047075684900844,-0.025420201805643635,-0.0043132117299620335,-0.00045116174913706775,-0.033216050008530076,-0.01379994300463574,-0.015454551991681515,-0.0016629715584083264,-0.0030341953491737194
2,2016-07-15,0.007743288816803419,-0.035212873264550734,-0.0008370302656608219,0.008400548890493178,-0.009451037367300874,0.007226335192023949,0.0,0.017914805936952757,0.013263883640477356
3,2016-07-18,0.0023446669592540547,-0.03817699014427647,0.0015184323700295752,0.009527122077965812,-0.0019579056670573137,0.0019258551932142657,0.0011152002940105837,0.016573686162376602,0.006650745009484806
4,2016-07-19,-0.0165864906554479,-0.02268198430997555,-0.014768672996419886,-0.007906965401582953,-0.024083453271562066,-0.007435870193802551,-0.007985845457182703,-0.0019398648178265917,-0.0023285608379313358
...,...,...,...,...,...,...,...,...,...,...
1841,2024-01-08,0.002612104227924961,-0.0453726738995393,-0.004763393080456568,-0.02393431025905517,0.0012602396122877732,-0.0005842828094939124,0.027188435938205222,0.002929332010493066,-0.01942520306687064
1842,2024-01-09,-0.00650338299044036,-0.03744279748377625,-0.0010340193248885151,-0.02108174061879662,0.0029160612207138766,0.0,0.02181047257288884,0.00036569757218678635,-0.01791966034353947
1843,2024-01-10,-0.003917306423923485,-0.03227664043954287,-0.0026979366785167746,-0.012612779815698248,0.012547216052088556,0.007038152220261447,0.022975973870323973,0.0007334067068747537,-0.024927976998928132
1844,2024-01-11,-0.004357305368955701,-0.019705071079332444,-0.00020766275643098833,-0.012624153228396402,0.005438206249144007,0.007038152220261447,0.021227212130573596,-0.010993148450961429,-0.019354264896149026


In [5]:
# Date 열을 기준으로 두 데이터프레임을 필터링
economic = economic[economic['Date'].isin(etf_return['Date'])]
etf_return = etf_return[etf_return['Date'].isin(economic['Date'])]

#Date 미리 저장
date = etf_return['Date']

### 원본 수익률 필터링

In [6]:
etf_return.columns.to_list()[1:]

['타이거로우볼',
 '타이거모멘텀',
 '타이거우량가치',
 '타이거코스피인버스',
 '코덱스코스피중소형주',
 '코덱스밸류',
 '코덱스퀄리티',
 '코덱스배당성장',
 '타이거고배당']

In [10]:
#원본 수익률도 필터링
etf_price = pd.read_csv("etf_price.csv")
etf_price = etf_price.drop(etf_price.index[-1])
etf_price['Date'] = pd.to_datetime(etf_price['Date']).dt.strftime('%Y-%m-%d')
etf_price_filtered = etf_price[etf_price['Date'].isin(etf_return['Date'])]

# 'Date' 열을 인덱스로 설정
etf_price_filtered.set_index('Date', inplace=True)
etf_price_filtered.columns = etf_return.columns.to_list()[1:]

#저장
etf_price_filtered.to_csv("etf_price_filtered.csv")

In [11]:
# 경제 변수 데이터 불러오기
X = economic.drop(['Date'], axis=1)
# ETF 수익률 데이터 불러오기
y = etf_return.drop(['Date'], axis=1)

In [12]:
X.columns

Index(['kospi', 'kospi_20', 'kospi_60', 'Close_x', 'wy_20', 'wy_60', 'Close_y',
       'wd_20', 'wd_60', '종가_x', 'vkospi_20', 'vkospi_60', '종가_y', 'wti_20',
       'wti_60', '종가_x.1', 'us_10_20', 'us_10_60', '종가_y.1', 'us_2_20',
       'us_2_60', '종가_y.2', 'vix_20', 'vix_60'],
      dtype='object')

In [13]:
y.columns

Index(['타이거로우볼', '타이거모멘텀', '타이거우량가치', '타이거코스피인버스', '코덱스코스피중소형주', '코덱스밸류',
       '코덱스퀄리티', '코덱스배당성장', '타이거고배당'],
      dtype='object', name=1846)

In [14]:
etf_list = list(y.columns)
etf_list

['타이거로우볼',
 '타이거모멘텀',
 '타이거우량가치',
 '타이거코스피인버스',
 '코덱스코스피중소형주',
 '코덱스밸류',
 '코덱스퀄리티',
 '코덱스배당성장',
 '타이거고배당']

In [15]:
# import pandas as pd
# import numpy as np
# import time
# from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_squared_error


# for i, etf in enumerate(etf_list):
#     # 해당 ETF에 대한 y 데이터 추출
#     y_data = y[etf]

#     # validation 생성
#     X_train, X_val, y_train, y_val = train_test_split(X, y_data, test_size=0.2, shuffle=False)

#     # 데이터 파이프라인 설정
#     numeric_features = X.columns
#     numeric_transformer = Pipeline(steps=[
#         ('imputer', SimpleImputer(strategy='mean')),
#         ('scaler', StandardScaler())
#     ])

#     preprocessor = ColumnTransformer(
#         transformers=[
#             ('num', numeric_transformer, numeric_features)
#         ])

#     # 모델 초기화
#     rf_model = RandomForestRegressor()
#     gb_model = GradientBoostingRegressor()
#     xgb_model = XGBRegressor()
#     lgbm_model = LGBMRegressor(verbose=-1)

#     # Hyperparameter 튜닝을 위한 그리드 서치 설정
#     # RF default : n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1
#     rf_param_grid = {
#     'n_estimators': [50, 100],
#     'max_depth': [None, 10],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
#     }

#     # GB default : n_estimators=100. learning_rate=0.1, max_depth=3, min_samples_split=2, min_samples_leaf=1
#     gb_param_grid = {
#     'n_estimators': [50, 100],
#     'learning_rate': [0.005, 0.01],
#     'max_depth': [3, 5],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
#     }

#     # XGB default : n_estimators=100, learning_rate=0.1, max_depth=3, min_child_weight=1, gamma=0
#     xgb_param_grid = {
#     'n_estimators': [50, 100],
#     'learning_rate': [0.005, 0.01],
#     'max_depth': [3, 5],
#     'min_child_weight': [1, 2],
#     'gamma': [0, 0.1]
#     }

#     # LGBM default : n_estimators=100, learning_rate=0.1, max_depth=-1, min_child_samples=20, feature_fraction=1.0
#     lgbm_param_grid = {
#     'n_estimators': [50, 100],
#     'learning_rate': [0.005, 0.01],
#     'max_depth': [3, None],
#     'min_child_samples': [5,10],
#     'feature_fraction': [0.8, 1.0]
# }

#     # 각 모델에 대해 최적의 하이퍼파라미터 찾기
#     rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=TimeSeriesSplit(n_splits=3), scoring='neg_mean_squared_error')
#     gb_grid_search = GridSearchCV(gb_model, gb_param_grid, cv=TimeSeriesSplit(n_splits=3), scoring='neg_mean_squared_error')
#     xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=TimeSeriesSplit(n_splits=3), scoring='neg_mean_squared_error')
#     lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_param_grid, cv=TimeSeriesSplit(n_splits=3), scoring='neg_mean_squared_error')

#     # 파이프라인 설정 및 학습
#     rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf_grid_search)])
#     gb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', gb_grid_search)])
#     xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', xgb_grid_search)])
#     lgbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgbm_grid_search)])

#    # 모델 학습
#     rf_start = time.time()
#     rf_pipeline.fit(X_train, y_train)
#     rf_end = time.time()

#     gb_start = time.time()
#     gb_pipeline.fit(X_train, y_train)
#     gb_end = time.time()

#     xgb_start = time.time()
#     xgb_pipeline.fit(X_train, y_train)
#     xgb_end = time.time()

#     lgbm_start = time.time()
#     lgbm_pipeline.fit(X_train, y_train)
#     lgbm_end = time.time()

#     # Validation set에 대한 성능 평가
#     rf_val_pred = rf_pipeline.predict(X_val)
#     gb_val_pred = gb_pipeline.predict(X_val)
#     xgb_val_pred = xgb_pipeline.predict(X_val)
#     lgbm_val_pred = lgbm_pipeline.predict(X_val)

#     rf_val_rmse = np.sqrt(mean_squared_error(y_val, rf_val_pred))
#     gb_val_rmse = np.sqrt(mean_squared_error(y_val, gb_val_pred))
#     xgb_val_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_pred))
#     lgbm_val_rmse = np.sqrt(mean_squared_error(y_val, lgbm_val_pred))

#     print("<", etf, ">", f"[{i+1}/{len(etf_list)}]")
#     # 최적의 모델과 파라미터 출력
#     print("Random Forest 최적 모델:", rf_pipeline.named_steps['regressor'].best_estimator_)
#     print("Gradient Boosting 최적 모델:", gb_pipeline.named_steps['regressor'].best_estimator_)
#     # XGBoost의 경우에는 하이퍼파라미터 값들이 None인 것들도 모두 출력되어 출력창이 지저분해져서 출력방식을 바꿈
#     xgb_best_params = {key: value for key, value in xgb_pipeline.named_steps['regressor'].best_estimator_.get_params().items() if value is not None}
#     print("XGBoost 최적 모델: XGBRegressor({})".format(', '.join(f'{key}={repr(value)}' for key, value in xgb_best_params.items())))
#     print("LightGBM 최적 모델:", lgbm_pipeline.named_steps['regressor'].best_estimator_)
#     print("="*50)

#     # 모델별 실행 시간 출력
#     print(f"Random Forest 실행 시간: {rf_end - rf_start:.2f} 초")
#     print(f"Gradient Boosting 실행 시간: {gb_end - gb_start:.2f} 초")
#     print(f"XGBoost 실행 시간: {xgb_end - xgb_start:.2f} 초")
#     print(f"LightGBM 실행 시간: {lgbm_end - lgbm_start:.2f} 초")
#     print("="*50)

#     # 최적의 평가 지표(RMSE) 출력
#     rf_best_score = np.sqrt(-rf_pipeline.named_steps['regressor'].best_score_)
#     gb_best_score = np.sqrt(-gb_pipeline.named_steps['regressor'].best_score_)
#     xgb_best_score = np.sqrt(-xgb_pipeline.named_steps['regressor'].best_score_)
#     lgbm_best_score = np.sqrt(-lgbm_pipeline.named_steps['regressor'].best_score_)

#     print(f"Random Forest 최적 RMSE: {rf_val_rmse:.4f}")
#     print(f"Gradient Boosting 최적 RMSE: {gb_val_rmse:.4f}")
#     print(f"XGBoost 최적 RMSE: {xgb_val_rmse:.4f}")
#     print(f"LightGBM 최적 RMSE: {lgbm_val_rmse:.4f}")

#     print("=" * 50)
#     print()

In [19]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

dic_pred = dict()
dic_rmse = dict()
for i, etf in enumerate(etf_list):
    # 해당 ETF에 대한 y 데이터 추출
    y_data = y[etf]

    # validation 생성
    X_train, X_val, y_train, y_val = train_test_split(X, y_data, test_size=0.2, shuffle=False)

    # 데이터 파이프라인 설정
    numeric_features = X.columns
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)
        ])

    # 모델 초기화
    lgbm_model = LGBMRegressor(verbose=-1)

    # Hyperparameter 튜닝을 위한 그리드 서치 설정
    lgbm_param_grid = {
        'n_estimators': [80, 100, 120],
        'learning_rate': [0.02, 0.01],
        'max_depth': [5,8,None],
        'min_child_samples': [5,10],
        'feature_fraction': [0.8, 1.0],
        'boosting_type' : ['gbdt', 'goss'],
        'top_rate' : [0.2, 0,4, 0,6],
        'other_rate' : [0.1, 0.05]}
    

    # 그리드 서치 수행
    lgbm_grid_search = GridSearchCV(lgbm_model, lgbm_param_grid, cv=TimeSeriesSplit(n_splits=3), scoring='neg_mean_squared_error',
                                   verbose = 0)

    # 파이프라인 설정 및 학습
    lgbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgbm_grid_search)])

    # 모델 학습
    lgbm_start = time.time()
    lgbm_pipeline.fit(X_train, y_train)
    lgbm_end = time.time()

    # Validation set에 대한 성능 평가
    lgbm_val_pred = lgbm_pipeline.predict(X_val)
    lgbm_val_rmse = np.sqrt(mean_squared_error(y_val, lgbm_val_pred))

    print("<", etf, ">", f"[{i+1}/{len(etf_list)}]")
    # 최적의 모델과 파라미터 출력
    print("LightGBM 최적 모델:", lgbm_pipeline.named_steps['regressor'].best_estimator_)
    print("="*50)

    # 모델별 실행 시간 출력
    print(f"LightGBM 실행 시간: {lgbm_end - lgbm_start:.2f} 초")
    print("="*50)

    # 최적의 평가 지표(RMSE) 출력
    lgbm_best_score = np.sqrt(-lgbm_pipeline.named_steps['regressor'].best_score_)
    print(f"LightGBM train_RMSE: {lgbm_best_score:.4f}")
    print(f"LightGBM val_RMSE: {lgbm_val_rmse:.4f}")

    # Feature Importance 출력
    feature_importance = lgbm_pipeline.named_steps['regressor'].best_estimator_.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    print("LightGBM Feature Importance:")
    print(feature_importance_df)

    print("=" * 50)
    print()
        
    #예측 수익률 저장
    dic_pred[etf] = list(lgbm_pipeline.predict(X))
    
    #RMSE 저장
    dic_rmse[etf] = [lgbm_pipeline.named_steps['regressor'].best_params_, lgbm_best_score, lgbm_best_score]


< 타이거로우볼 > [1/9]
LightGBM 최적 모델: LGBMRegressor(feature_fraction=0.8, learning_rate=0.02, max_depth=None,
              min_child_samples=10, n_estimators=120, other_rate=0.1,
              top_rate=0.2, verbose=-1)
LightGBM 실행 시간: 212.37 초
LightGBM train_RMSE: 0.0453
LightGBM val_RMSE: 0.0319
LightGBM Feature Importance:
      Feature  Importance
1    kospi_20         308
0       kospi         212
14     wti_60         204
20    us_2_60         203
2    kospi_60         202
5       wy_60         199
16   us_10_20         176
6     Close_y         173
9        종가_x         167
23     vix_60         162
11  vkospi_60         157
17   us_10_60         155
18     종가_y.1         132
19    us_2_20         129
3     Close_x         129
21     종가_y.2         121
8       wd_60         120
4       wy_20         118
7       wd_20         117
12       종가_y         109
10  vkospi_20          92
13     wti_20          92
22     vix_20          62
15     종가_x.1          61

< 타이거모멘텀 > [2/9]
LightGBM 

In [25]:
pred_return = pd.DataFrame(dic_pred).set_index(date)
pred_return

Unnamed: 0_level_0,타이거로우볼,타이거모멘텀,타이거우량가치,타이거코스피인버스,코덱스코스피중소형주,코덱스밸류,코덱스퀄리티,코덱스배당성장,타이거고배당
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-07-13,-0.020089,-0.015202,0.001541,-0.007433,-0.014577,-0.011416,-0.014661,-0.001723,-0.002906
2016-07-14,-0.007949,-0.021910,0.002299,-0.008585,-0.015390,-0.011951,-0.012969,0.002115,-0.003052
2016-07-15,-0.005803,-0.021981,0.002209,-0.005030,-0.015327,-0.005889,-0.008287,0.004876,0.000197
2016-07-18,-0.008656,-0.023460,-0.000577,-0.006340,-0.014491,-0.006744,-0.007877,0.003846,-0.000014
2016-07-19,-0.012800,-0.023240,0.000098,-0.004966,-0.013653,-0.010860,-0.004841,0.002409,0.000437
...,...,...,...,...,...,...,...,...,...
2024-01-08,0.042782,-0.051681,0.014575,0.036510,0.034733,0.043896,0.031654,0.037932,0.021804
2024-01-09,0.037635,-0.037008,0.015007,0.028619,0.024861,0.049622,0.029742,0.037712,0.020127
2024-01-10,0.012259,-0.024017,0.008685,0.005971,0.011112,0.027453,0.021021,0.020736,0.000759
2024-01-11,-0.005955,-0.015470,-0.005883,-0.005339,-0.002694,0.002932,0.001657,0.011055,-0.003004


In [26]:
pred_return.to_csv("예상_수익률.csv")
