In [1]:
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
from darts.metrics import mape
from darts.models import NLinearModel
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics import MetricCollection, MeanAbsoluteError, MeanSquaredError

# Select PyTorch's 'high' float32 matmul precision mode for this session.
# NOTE(review): see the torch documentation for the speed/precision trade-off of each mode.
torch.set_float32_matmul_precision('high')

In [59]:
# Directory holding one price file per stock.
# Switch to 'sample_csv/' to work on the smaller sample set instead.
base_path = 'stock_csv/'

# Korean column headers of the raw files -> English names used in this notebook.
column_mapping = {
    # time index
    '일자': 'date',
    # daily prices
    '종가': 'closing_price',
    '대비': 'price_difference',
    '등락률': 'fluctuation_rate',
    '시가': 'opening_price',
    '고가': 'highest_price',
    '저가': 'lowest_price',
    # volume / size
    '거래량': 'trading_volume',
    '거래대금': 'trading_value',
    '시가총액': 'market_capitalization',
    '상장주식수': 'listed_shares',
}

In [73]:
# Preview one file (ticker 005930) with the columns renamed to English.
# NOTE(review): the file carries a .csv extension but is parsed with read_excel —
# presumably the files are Excel workbooks under a .csv name; confirm.
stock_df = (
    pd.read_excel(os.path.join(base_path, '005930_삼성전자_20000101_20230331.csv'))
    .rename(columns=column_mapping)
)

stock_df.head()

Unnamed: 0,date,closing_price,price_difference,fluctuation_rate,opening_price,highest_price,lowest_price,trading_volume,trading_value,market_capitalization,listed_shares
0,2023/03/31,64000,800,1.27,64000,64000,63700,14094479,900711556583,382066083200000,5969782550
1,2023/03/30,63200,500,0.8,63700,63700,63100,15684377,993903387442,377290257160000,5969782550
2,2023/03/29,62700,-200,-0.32,62500,62700,62200,11216008,700455977590,374305365885000,5969782550
3,2023/03/28,62900,800,1.29,62400,62900,62100,11614118,726295104112,375499322395000,5969782550
4,2023/03/27,62100,-900,-1.43,62600,62800,62000,11039331,687170094400,370723496355000,5969782550


과거 공변량:
<ol>
    <li>과거 주가 데이터: 종가, 시가, 고가, 저가 등
    <li>거래량: 주식 거래량 데이터
    <li>기술적 지표: 이동 평균, RSI (상대 강도 지수), MACD (이동 평균 수렴 발산) 등
    <li>기업 재무 데이터: 매출, 이익, 자산, 부채 등
    <li>배당: 배당금과 배당 수익률
    <li>주가 수익률 (P/E) 등의 가치 지표
</ol>
<br>
미래 공변량:
<ol>
    <li>경제 성장률: 국내 총생산(GDP) 성장률 등
    <li>인플레이션: 소비자 물가지수(CPI) 등
    <li>기준 금리: 중앙은행의 기준 금리
    <li>실업률: 국가의 실업률 데이터
    <li>환율: 외환 시장의 환율 정보
    <li>경제 정책: 정부의 경제 정책이나 중앙은행의 통화 정책
</ol>

In [75]:
# Columns stacked (in this order) into the past-covariate series.
# Use `past_cov_list = []` instead to train without any of these columns.
past_cov_list = [
    'price_difference',
    'fluctuation_rate',
    'opening_price',
    'highest_price',
    'lowest_price',
    'trading_volume',
    'trading_value',
    'market_capitalization',
    'listed_shares',
]

def getScaledPastCov(timeseries):
    """Build the scaled past-covariate series for one stock.

    The result stacks, in this order: a ``year`` component and a ``month``
    component derived from the time index of ``timeseries``, followed by every
    component named in the module-level ``past_cov_list``.

    Args:
        timeseries: Darts ``TimeSeries`` that contains (at least) the
            components listed in ``past_cov_list``.

    Returns:
        The stacked covariates transformed by a freshly fitted default
        ``Scaler()``.

    NOTE(review): the scaler is fitted on the whole series passed in. The
    callers in this notebook split into train/validation only afterwards, so
    validation-period values influence the scaling — confirm this is intended.
    """
    # The input is only read (indexing and stack return new series), so no
    # defensive copy is needed.
    cov_year = datetime_attribute_timeseries(timeseries, attribute='year')
    cov_month = datetime_attribute_timeseries(timeseries, attribute='month')
    past_cov = cov_year.stack(cov_month)
    if past_cov_list:
        # Select all covariate components at once and stack a single time,
        # instead of building one intermediate TimeSeries per column.
        past_cov = past_cov.stack(timeseries[past_cov_list])
    return Scaler().fit_transform(past_cov)

def getFutureCov(timeseries):
    """Placeholder for building future covariates; not implemented yet.

    Args:
        timeseries: Currently unused.

    Returns:
        None.
    """
    return None

In [None]:
"""
    아래 코드의 속도 개선 중
"""
import time
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')
start_time = time.time()


start_date = '2015-01-01'
end_date = '2023-03-31'

interpolate_list = ['closing_price'] + past_cov_list

list_stock_timeseries = []

list_train_series_scaled = []
list_val_series_scaled = []
list_scaler = []

list_scaled_past_cov = []
list_train_scaled_past_cov = []
list_val_scaled_past_cov = []

list_scaled_future_cov = []
list_train_scaled_future_cov = []
list_val_scaled_future_cov = []

for i, file_name in enumerate(os.listdir(base_path)):
    print(i, file_name)
    df = pd.read_excel(base_path + file_name).rename(columns=column_mapping)
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    if len(df.loc[:start_date]) == 0: # start_date 이전 날짜 데이터가 없는 경우는 무시. 따라서 start_date 이후 상장된 주식은 훈련 X
        continue
    df = df.loc[start_date:end_date]
    # 9.68초
    
    # Date interpolate
    stock_timeseries_df_interpolated = df.copy()
    stock_timeseries_df_interpolated = stock_timeseries_df_interpolated.asfreq('B')
    for col in interpolate_list:
        stock_timeseries_df_interpolated[col] = stock_timeseries_df_interpolated[col].interpolate(method='linear')
    df = stock_timeseries_df_interpolated
    # 25.73초
    
    # 전처리 결과를 저장
#     df.to_excel('interpolated_' + base_path + file_name)
    
    timeseries = TimeSeries.from_dataframe(df)
    # 40.93초
    
    
    # Darts TimeSeries 변환 및 날짜 필터링
    stock_timeseries = timeseries['closing_price']
    list_stock_timeseries.append(stock_timeseries)

    # Past covariant 및 날짜 필터링
    scaled_past_cov_timeseries = getScaledPastCov(timeseries)
    list_scaled_past_cov.append(scaled_past_cov_timeseries)
    # 159.89초
    
    # Split train/validation set
    train_percentage = 0.8
    split_index = int(train_percentage * len(stock_timeseries))
    
    train_series, val_series = stock_timeseries[:split_index], stock_timeseries[split_index:]
    train_past_cov_series, val_past_cov_series = scaled_past_cov_timeseries[:split_index], scaled_past_cov_timeseries[split_index:]
    
    list_train_scaled_past_cov.append(train_past_cov_series)
    list_val_scaled_past_cov.append(val_past_cov_series)
    
    # Scale
    scaler = Scaler()
    train_series_scaled = scaler.fit_transform(train_series)
    val_series_scaled = scaler.transform(val_series)
    list_scaler.append(scaler)
    
    list_train_series_scaled.append(train_series_scaled)
    list_val_series_scaled.append(val_series_scaled)

print(time.time() - start_time)
# Reset warning filter
warnings.filterwarnings('default')

0 217190_제너셈_20000101_20230331.csv
1 086670_비엠티_20000101_20230331.csv
2 271850_다이오진_20000101_20230331.csv
3 036200_유니셈_20000101_20230331.csv
4 302550_리메드_20000101_20230331.csv
5 053300_한국정보인증_20000101_20230331.csv
6 049830_승일_20000101_20230331.csv
7 006890_태경케미컬_20000101_20230331.csv
8 065710_서호전기_20000101_20230331.csv
9 013520_화승코퍼레이션_20000101_20230331.csv
10 054800_아이디스홀딩스_20000101_20230331.csv
11 199800_툴젠_20000101_20230331.csv
12 083790_크리스탈지노믹스_20000101_20230331.csv
13 005390_신성통상_20000101_20230331.csv
14 013310_아진산업_20000101_20230331.csv
15 005180_빙그레_20000101_20230331.csv
16 007700_F&F홀딩스_20000101_20230331.csv
17 251370_와이엠티_20000101_20230331.csv
18 004080_신흥_20000101_20230331.csv
19 192650_드림텍_20000101_20230331.csv
20 214270_FSN_20000101_20230331.csv
21 035290_골드앤에스_20000101_20230331.csv
22 288620_에스퓨얼셀_20000101_20230331.csv
23 442770_IBKS제21호스팩_20000101_20230331.csv
24 123890_한국자산신탁_20000101_20230331.csv
25 303530_이노뎁_20000101_20230331.csv
26 204620_글로벌텍스프리_20000101_20230331.c

215 001740_SK네트웍스_20000101_20230331.csv
216 002760_보락_20000101_20230331.csv
217 28513K_SK케미칼우_20000101_20230331.csv
218 033170_시그네틱스_20000101_20230331.csv
219 060370_KT서브마린_20000101_20230331.csv
220 348840_데이드림엔터_20000101_20230331.csv
221 100790_미래에셋벤처투자_20000101_20230331.csv
222 109960_에이프로젠 H&G_20000101_20230331.csv
223 048470_대동스틸_20000101_20230331.csv
224 104040_대성파인텍_20000101_20230331.csv
225 232140_와이아이케이_20000101_20230331.csv
226 037560_LG헬로비전_20000101_20230331.csv
227 212310_휴벡셀_20000101_20230331.csv
228 078140_대봉엘에스_20000101_20230331.csv
229 044060_조광ILI_20000101_20230331.csv
230 086790_하나금융지주_20000101_20230331.csv
231 293780_압타바이오_20000101_20230331.csv
232 084990_헬릭스미스_20000101_20230331.csv
233 353490_미래에셋대우스팩 5호_20000101_20230331.csv
234 007210_벽산_20000101_20230331.csv
235 267980_매일유업_20000101_20230331.csv
236 015230_대창단조_20000101_20230331.csv
237 257990_나우코스_20000101_20230331.csv
238 138080_오이솔루션_20000101_20230331.csv
239 163560_동일고무벨트_20000101_20230331.csv
240 003310_대주산업_

427 088910_동우팜투테이블_20000101_20230331.csv
428 080010_이상네트웍스_20000101_20230331.csv
429 058630_엠게임_20000101_20230331.csv
430 133820_화인베스틸_20000101_20230331.csv
431 002410_범양건영_20000101_20230331.csv
432 294140_레몬_20000101_20230331.csv
433 150440_피노텍_20000101_20230331.csv
434 053030_바이넥스_20000101_20230331.csv
435 148930_에이치와이티씨_20000101_20230331.csv
436 145020_휴젤_20000101_20230331.csv
437 185750_종근당_20000101_20230331.csv
438 032790_비엔지티_20000101_20230331.csv
439 094970_제이엠티_20000101_20230331.csv
440 007330_푸른저축은행_20000101_20230331.csv
441 084695_대상홀딩스우_20000101_20230331.csv
442 215790_이노인스트루먼트_20000101_20230331.csv
443 006730_서부T&D_20000101_20230331.csv
444 212560_네오오토_20000101_20230331.csv
445 215000_골프존_20000101_20230331.csv
446 095660_네오위즈_20000101_20230331.csv
447 018470_조일알미늄_20000101_20230331.csv
448 129890_앱코_20000101_20230331.csv
449 220100_퓨쳐켐_20000101_20230331.csv
450 263810_상신전자_20000101_20230331.csv
451 033560_블루콤_20000101_20230331.csv
452 002785_진흥기업우B_20000101_20230331.csv
453

639 226340_본느_20000101_20230331.csv
640 430700_유안타제9호스팩_20000101_20230331.csv
641 073640_테라사이언스_20000101_20230331.csv
642 117580_대성에너지_20000101_20230331.csv
643 079550_LIG넥스원_20000101_20230331.csv
644 036630_세종텔레콤_20000101_20230331.csv
645 092730_네오팜_20000101_20230331.csv
646 254490_미래반도체_20000101_20230331.csv
647 271560_오리온_20000101_20230331.csv
648 432320_KB스타리츠_20000101_20230331.csv
649 267060_명진홀딩스_20000101_20230331.csv
650 234070_에이원알폼_20000101_20230331.csv
651 068760_셀트리온제약_20000101_20230331.csv
652 145995_삼양사우_20000101_20230331.csv
653 219420_링크제니시스_20000101_20230331.csv
654 019770_서연탑메탈_20000101_20230331.csv
655 052860_아이앤씨_20000101_20230331.csv
656 030000_제일기획_20000101_20230331.csv
657 079000_와토스코리아_20000101_20230331.csv
658 077500_유니퀘스트_20000101_20230331.csv
659 174900_앱클론_20000101_20230331.csv
660 034220_LG디스플레이_20000101_20230331.csv
661 001390_KG케미칼_20000101_20230331.csv
662 079170_한창산업_20000101_20230331.csv
663 023800_인지컨트롤스_20000101_20230331.csv
664 227610_아우딘퓨쳐스_20000101

851 018620_우진비앤지_20000101_20230331.csv
852 002170_삼양통상_20000101_20230331.csv
853 011810_STX_20000101_20230331.csv
854 005385_현대차우_20000101_20230331.csv
855 042500_링네트_20000101_20230331.csv
856 236030_씨알푸드_20000101_20230331.csv
857 003470_유안타증권_20000101_20230331.csv
858 435620_하나금융25호스팩_20000101_20230331.csv
859 370090_퓨런티어_20000101_20230331.csv
860 228340_동양파일_20000101_20230331.csv
861 006880_신송홀딩스_20000101_20230331.csv
862 025560_미래산업_20000101_20230331.csv
863 407400_꿈비_20000101_20230331.csv
864 218410_RFHIC_20000101_20230331.csv
865 139130_DGB금융지주_20000101_20230331.csv
866 310200_애니플러스_20000101_20230331.csv
867 403360_라피치_20000101_20230331.csv
868 036420_콘텐트리중앙_20000101_20230331.csv
869 007390_네이처셀_20000101_20230331.csv
870 009730_코센_20000101_20230331.csv
871 204020_그리티_20000101_20230331.csv
872 131970_두산테스나_20000101_20230331.csv
873 003690_코리안리_20000101_20230331.csv
874 241840_에이스토리_20000101_20230331.csv
875 003850_보령_20000101_20230331.csv
876 123700_SJM_20000101_20230331.csv
877 03

1060 061250_화일약품_20000101_20230331.csv
1061 056730_CNT85_20000101_20230331.csv
1062 008110_대동전자_20000101_20230331.csv
1063 078020_이베스트투자증권_20000101_20230331.csv
1064 001380_SG글로벌_20000101_20230331.csv
1065 214390_경보제약_20000101_20230331.csv
1066 001745_SK네트웍스우_20000101_20230331.csv
1067 003720_삼영화학_20000101_20230331.csv
1068 092440_기신정기_20000101_20230331.csv
1069 136540_윈스_20000101_20230331.csv
1070 005430_한국공항_20000101_20230331.csv
1071 267250_HD현대_20000101_20230331.csv
1072 900310_컬러레이_20000101_20230331.csv
1073 011300_성안_20000101_20230331.csv
1074 012205_계양전기우_20000101_20230331.csv
1075 255220_SG_20000101_20230331.csv
1076 08537M_루트로닉3우C_20000101_20230331.csv
1077 111770_영원무역_20000101_20230331.csv
1078 389140_포바이포_20000101_20230331.csv
1079 222160_바이옵트로_20000101_20230331.csv
1080 008355_남선알미우_20000101_20230331.csv
1081 327610_펨토바이오메드_20000101_20230331.csv
1082 014910_성문전자_20000101_20230331.csv
1083 021880_메이슨캐피탈_20000101_20230331.csv
1084 034950_한국기업평가_20000101_20230331.csv
1085 9501

1266 072950_빛샘전자_20000101_20230331.csv
1267 394280_오픈엣지테크놀로지_20000101_20230331.csv
1268 024830_세원물산_20000101_20230331.csv
1269 043100_솔고바이오_20000101_20230331.csv
1270 009540_한국조선해양_20000101_20230331.csv
1271 004560_현대비앤지스틸_20000101_20230331.csv
1272 011700_한신기계_20000101_20230331.csv
1273 017650_대림제지_20000101_20230331.csv
1274 044480_블레이드 Ent_20000101_20230331.csv
1275 377330_이지트로닉스_20000101_20230331.csv
1276 024800_유성티엔에스_20000101_20230331.csv
1277 000325_노루홀딩스우_20000101_20230331.csv
1278 145720_덴티움_20000101_20230331.csv
1279 107590_미원홀딩스_20000101_20230331.csv
1280 029960_코엔텍_20000101_20230331.csv
1281 096240_크레버스_20000101_20230331.csv
1282 015540_쎌마테라퓨틱스_20000101_20230331.csv
1283 343090_HLB사이언스_20000101_20230331.csv
1284 180060_탑선_20000101_20230331.csv
1285 024110_기업은행_20000101_20230331.csv
1286 052460_아이크래프트_20000101_20230331.csv
1287 196700_웹스_20000101_20230331.csv
1288 130580_나이스디앤비_20000101_20230331.csv
1289 040350_큐로컴_20000101_20230331.csv
1290 185190_수프로_20000101_20230331.csv
1

In [None]:
# Earlier version of the preprocessing loop: interpolates each file over its
# full history and applies the date filter on the Darts TimeSeries afterwards.
# NOTE(review): this cell re-initialises the same `list_*` names as the
# preceding preprocessing cell, so running it overwrites that cell's results.
interpolate_list = ['closing_price'] + past_cov_list

# Date range to keep
start_date = pd.to_datetime('2015-01-01')
end_date = pd.to_datetime('2023-03-31')

list_stock_timeseries = []

list_train_series_scaled = []
list_val_series_scaled = []
list_scaler = []

list_scaled_past_cov = []
list_train_scaled_past_cov = []
list_val_scaled_past_cov = []

list_scaled_future_cov = []
list_train_scaled_future_cov = []
list_val_scaled_future_cov = []

# NOTE(review): os.listdir gives no ordering guarantee, so the position of a
# given stock in the lists below may differ between runs/machines.
for i, file_name in enumerate(os.listdir(base_path)):
    print(i, file_name)
    df = pd.read_excel(base_path + file_name).rename(columns=column_mapping)
    # Skip files with fewer than 100 rows.
    if len(df) < 100:
        continue
    df['date'] = pd.to_datetime(df['date'])
    
    # Reindex to business days and linearly interpolate the target/covariate columns
    stock_timeseries_df_interpolated = df.copy()
    stock_timeseries_df_interpolated.set_index('date', inplace=True)
    stock_timeseries_df_interpolated = stock_timeseries_df_interpolated.asfreq('B')
    for col in interpolate_list:
        stock_timeseries_df_interpolated[col] = stock_timeseries_df_interpolated[col].interpolate(method='linear')
    # Bring 'date' back as a column; from_dataframe below reads it via time_col.
    stock_timeseries_df_interpolated.reset_index(inplace=True)
    df = stock_timeseries_df_interpolated
    
    # Check the interpolation result
#     print(df.isna().sum())

    timeseries = TimeSeries.from_dataframe(df, time_col='date', freq='B')
    
    
    # Target series (closing price), restricted to the date range
    stock_timeseries = timeseries['closing_price']
    stock_timeseries = stock_timeseries[start_date:end_date]
    list_stock_timeseries.append(stock_timeseries)

    # Past covariates, restricted to the date range.
    # The covariates are scaled on the full history first and sliced afterwards.
    scaled_past_cov_timeseries = getScaledPastCov(timeseries)
    scaled_past_cov_timeseries = scaled_past_cov_timeseries[start_date:end_date]
    list_scaled_past_cov.append(scaled_past_cov_timeseries)
    
    # Split train/validation set (chronological: first 80% train, rest validation)
    train_percentage = 0.8
    split_index = int(train_percentage * len(stock_timeseries))
    
    train_series, val_series = stock_timeseries[:split_index], stock_timeseries[split_index:]
    train_past_cov_series, val_past_cov_series = scaled_past_cov_timeseries[:split_index], scaled_past_cov_timeseries[split_index:]
    
    list_train_scaled_past_cov.append(train_past_cov_series)
    list_val_scaled_past_cov.append(val_past_cov_series)
    
    # Scale the target: fit on the training part only, reuse for validation
    scaler = Scaler()
    train_series_scaled = scaler.fit_transform(train_series)
    val_series_scaled = scaler.transform(val_series)
    list_scaler.append(scaler)
    
    list_train_series_scaled.append(train_series_scaled)
    list_val_series_scaled.append(val_series_scaled)
    
    # Debug aid: uncomment to stop once i reaches 49 (i.e. after 50 files).
#     if i % 50 == 49:
#         break

In [None]:
# TensorBoard logger created with 'logs' and name 'my_model'.
logger = TensorBoardLogger('logs', name='my_model')

# Extra metrics handed to the model; either one of these TorchMetrics or
# val_loss can serve as the early-stopping monitor.
torch_metrics = MetricCollection([MeanAbsoluteError(), MeanSquaredError()])

# Early stopping on the validation MAE (alternative monitor: "val_loss").
# NOTE(review): the targets are scaled with Scaler() before training, so
# min_delta=0.05 is expressed in scaled units — confirm it is not too coarse.
my_stopper = EarlyStopping(monitor="val_MeanAbsoluteError", patience=5, min_delta=0.05, mode='min')
pl_trainer_kwargs = dict(callbacks=[my_stopper], logger=logger)

model = NLinearModel(
    input_chunk_length=20,
    output_chunk_length=5,
    n_epochs=10,
    random_state=42,
    torch_metrics=torch_metrics,
    pl_trainer_kwargs=pl_trainer_kwargs,
)

# One global model trained on all stocks; future covariates are not used yet
# (would be future_covariates=list_train_scaled_future_cov /
# val_future_covariates=list_val_scaled_future_cov).
model.fit(
    series=list_train_series_scaled,
    past_covariates=list_train_scaled_past_cov,
    val_series=list_val_series_scaled,
    val_past_covariates=list_val_scaled_past_cov,
)

In [None]:
# Save the fitted model to disk and load it back into `model_loaded`.
model_path = "model/all_past.pkl"

# Derive the target directory from model_path so the two cannot drift apart,
# and create it idempotently (no check-then-create gap).
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

model.save(model_path)
model_loaded = NLinearModel.load(model_path)

In [None]:
# 1. past_covariates를 아무것도 주지 않았을 경우 상승, 하강이 그럴싸하게 나옴. 전부를 주었을 경우 하강만 나옴.
i = 0
predicted_series_scaled = model.predict(n=len(list_val_series_scaled[i]),
                                        series=list_train_series_scaled[i],
                                        past_covariates=list_scaled_past_cov[i],
                                        # future_covariates=list_scaled_future_cov[i],
                                        )
prediction = list_scaler[i].inverse_transform(predicted_series_scaled)
list_stock_timeseries[i].plot()
prediction.plot(label="forecast")
plt.legend()

In [None]:
# Display the inverse-scaled forecast computed in the previous cell.
prediction