# Init

In [1]:
# List of tickers for Korean stocks
tickers = {
    "삼성전자": "005930", "SK": "034730", "한화": "000880",
    "두산": "000150", "기아": "000270", "현대차": "005380",
    "LG": "003550", "NAVER": "035420", "카카오": "035720", "롯데지주": "004990"
}

# Date range for the stock data
start_date = "20200101"
end_date = "20250101"

In [2]:
# Target ticker for analysis
# TARGET_TICKER = "삼성전자"
TARGET_TICKER = list(tickers.keys())[0]
ticker_code = tickers[TARGET_TICKER]
TARGET_TICKER, ticker_code, start_date, end_date

('삼성전자', '005930', '20200101', '20250101')

In [3]:
import numpy as np
import torch

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x2317eb629f0>

# Create Dataset

### Get stock data

In [4]:
import os
import hashlib
import pandas as pd
from pykrx import stock
from datetime import datetime, timedelta

def split_date_range(start_date: str, end_date: str) -> list:
    start = datetime.strptime(start_date, "%Y%m%d")
    end = datetime.strptime(end_date, "%Y%m%d")
    delta = end - start
    split_ranges = []
    # Split the date range into 2-year intervals
    for i in range(0, delta.days + 1, 365 * 2):
        split_start = start + timedelta(days=i)
        split_end = min(end, split_start + timedelta(days=365 * 2 - 1))
        split_ranges.append((split_start.strftime("%Y%m%d"), split_end.strftime("%Y%m%d")))
    return split_ranges

def get_stock_data(ticker_code: str, start_date: str, end_date: str) -> pd.DataFrame:
    key = f"{ticker_code}_{start_date}_{end_date}"
    hashed_cache_name = hashlib.sha1(key.encode()).hexdigest()
    cache_file = f"data/{hashed_cache_name}.csv"
    if not os.path.exists('data'):
        os.makedirs('data')
    try:
        df_cached = pd.read_csv(cache_file)
        print("Loaded cached data")
        return df_cached
    except FileNotFoundError:
        df_ohlcv = stock.get_market_ohlcv_by_date(start_date, end_date, ticker_code)
        df_ohlcv.reset_index(inplace=True)
        df_ohlcv.rename(columns={'날짜':'date', '시가':'open', '고가':'high', '저가':'low', '종가':'close', '거래량':'volume'}, inplace=True)
        df_ohlcv = df_ohlcv[['date', 'open', 'high', 'low', 'close', 'volume']]
        
        df_fundamental = stock.get_market_fundamental_by_date(start_date, end_date, ticker_code)
        df_fundamental.reset_index(inplace=True)
        df_fundamental.rename(columns={'날짜':'date'}, inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_fundamental, on='date', how='left')

        df_trading_value = stock.get_market_trading_value_by_date(start_date, end_date, ticker_code)
        df_trading_value.reset_index(inplace=True)
        df_trading_value.rename(columns={
            '날짜': 'date',
            '기관합계': 'institution',
            '외국인합계': 'foreign',
            '개인': 'individual',
            '기타법인': 'other_corporation',
        }, inplace=True)
        df_trading_value.drop(columns=['전체'], inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_trading_value, on='date', how='left')

        df_trading_volume = stock.get_market_trading_volume_by_date(start_date, end_date, ticker_code)
        df_trading_volume.reset_index(inplace=True)
        df_trading_volume.rename(columns={
            '날짜': 'date',
            '기관합계': 'institution_volume',
            '외국인합계': 'foreign_volume',
            '개인': 'individual_volume',
            '기타법인': 'other_corporation_volume',
        }, inplace=True)
        df_trading_volume.drop(columns=['전체'], inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_trading_volume, on='date', how='left')
        
        split_ranges = split_date_range(start_date, end_date)
        df_shortings = []
        for start, end in split_ranges:
            df_shorting = stock.get_shorting_volume_by_date(start, end, ticker_code)
            df_shorting.reset_index(inplace=True)
            df_shortings.append(df_shorting)
        df_shorting_volume = pd.concat(df_shortings, ignore_index=True)
        df_shorting_volume.rename(columns={
            '날짜': 'date',
            '공매도': 'shorting_volume',
            '매수': 'buy_volume',
            '비중': 'shorting_ratio',
        }, inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_shorting_volume, on='date', how='left')

        df_ohlcv.to_csv(cache_file, index=False)
        print("Fetched new data and saved to cache")
        return df_ohlcv

In [5]:
# Fetching stock data for the target ticker
df_stock = get_stock_data(ticker_code, start_date, end_date)
df_stock

Loaded cached data


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,other_corporation,individual,foreign,institution_volume,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,7000985800,152706355800,-29293793400,-2354766,126507,2756657,-528398,39485,12993228,0.30
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,-5130936300,66485169500,61978391100,-2228329,-91483,1199681,1120131,218704,15422255,1.42
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,209967000,23109555500,43009120100,-1199654,3796,418722,777136,167348,10278951,1.63
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-416560800,-722044800,-1833294600,51896,-7458,-10139,-34299,142717,10009778,1.43
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-11444581500,-234778050700,242001549100,73413,-201951,-4130647,4259185,642430,23501171,2.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,57148805300,-37622295300,-44488165300,464199,1062559,-696548,-830210,9083,13672650,0.07
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,61666134600,-168358856200,103093057000,64871,1140724,-3111933,1906338,56707,11634677,0.49
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,67237626700,13274611300,-73581477500,-125220,1242129,242434,-1359343,18123,10517075,0.17
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,55601263000,-39507459700,-2171267300,-261276,1037984,-733514,-43194,25363,10747196,0.24


### Merge with other metrics

In [6]:
def get_index_data(index_code: str, start_date: str, end_date: str) -> pd.DataFrame:
    key = f"{index_code}_{start_date}_{end_date}"
    hashed_cache_name = hashlib.sha1(key.encode()).hexdigest()
    cache_file = f"data/{hashed_cache_name}.csv"
    if not os.path.exists('data'):
        os.makedirs('data')
    try:
        df_cached = pd.read_csv(cache_file)
        print("Loaded cached data")
        return df_cached
    except FileNotFoundError:
        df_ohlcv = stock.get_index_ohlcv_by_date(start_date, end_date, index_code)
        df_ohlcv.reset_index(inplace=True)
        df_ohlcv.rename(columns={'날짜':'date', '시가':'open', '고가':'high', '저가':'low', '종가':'close', '거래량':'volume'}, inplace=True)
        df_ohlcv = df_ohlcv[['date', 'open', 'high', 'low', 'close', 'volume']]
        df_ohlcv.to_csv(cache_file, index=False)
        print("Fetched new data and saved to cache")
        return df_ohlcv

In [7]:
from fredapi import Fred

def get_fred_data(series_id: str, start_date: str, end_date: str) -> pd.DataFrame:
    key = f"{series_id}_{start_date}_{end_date}"
    hashed_cache_name = hashlib.sha1(key.encode()).hexdigest()
    cache_file = f"data/{hashed_cache_name}.csv"
    if not os.path.exists('data'):
        os.makedirs('data')
    try:
        df_cached = pd.read_csv(cache_file)
        print("Loaded cached data")
        return df_cached
    except FileNotFoundError:
        fred = Fred(api_key='1813ca70b0692eac480e11a0691dac96')
        df = fred.get_series(series_id, start_date, end_date)
        df = df.reset_index()
        df.columns = ['date', series_id]
        df.to_csv(cache_file, index=False)
        print("Fetched new data and saved to cache")
        return df

In [8]:
# KOSPI data
df_kospi = get_index_data("1001", start_date, end_date)
df_kospi.rename(columns={'close':'kospi_close'}, inplace=True)
df_stock = pd.merge(df_stock, df_kospi[['date', 'kospi_close']], on='date', how='left')
df_stock

Loaded cached data


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,individual,foreign,institution_volume,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,152706355800,-29293793400,-2354766,126507,2756657,-528398,39485,12993228,0.30,2175.17
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,66485169500,61978391100,-2228329,-91483,1199681,1120131,218704,15422255,1.42,2176.46
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,23109555500,43009120100,-1199654,3796,418722,777136,167348,10278951,1.63,2155.07
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-722044800,-1833294600,51896,-7458,-10139,-34299,142717,10009778,1.43,2175.54
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-234778050700,242001549100,73413,-201951,-4130647,4259185,642430,23501171,2.73,2151.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,-37622295300,-44488165300,464199,1062559,-696548,-830210,9083,13672650,0.07,2442.01
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,-168358856200,103093057000,64871,1140724,-3111933,1906338,56707,11634677,0.49,2440.52
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,13274611300,-73581477500,-125220,1242129,242434,-1359343,18123,10517075,0.17,2429.67
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,-39507459700,-2171267300,-261276,1037984,-733514,-43194,25363,10747196,0.24,2404.77


In [9]:
# Define FRED series IDs to fetch
fred_series = {
    'DEXKOUS': 'exchange_rate', # KRW/USD Exchange Rate
    'DGS10': 'us_10y_yield'     # 10-Year Treasury Constant Maturity Rate
}

df_stock['date'] = pd.to_datetime(df_stock['date'])

for series_id, col_name in fred_series.items():
    print(f"Fetching {col_name} ({series_id}) from FRED...")
    df_fred = get_fred_data(series_id, start_date, end_date)
    df_fred.rename(columns={series_id: col_name}, inplace=True)
    df_fred['date'] = pd.to_datetime(df_fred['date'])
    df_stock = pd.merge(df_stock, df_fred, on='date', how='left')

# Forward-fill missing values (for weekends/holidays) and then back-fill
df_stock.ffill(inplace=True)
df_stock.bfill(inplace=True)

print("\nFRED data merged successfully.")
df_stock.head()

Fetching exchange_rate (DEXKOUS) from FRED...
Loaded cached data
Fetching us_10y_yield (DGS10) from FRED...
Loaded cached data

FRED data merged successfully.


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,institution_volume,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,-2354766,126507,2756657,-528398,39485,12993228,0.3,2175.17,1157.95,1.88
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,-2228329,-91483,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.8
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,-1199654,3796,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,51896,-7458,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,73413,-201951,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87


In [10]:
# Final stock data with other metrics
df_stock

Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,institution_volume,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,-2354766,126507,2756657,-528398,39485,12993228,0.30,2175.17,1157.95,1.88
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,-2228329,-91483,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.80
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,-1199654,3796,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,51896,-7458,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,73413,-201951,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,464199,1062559,-696548,-830210,9083,13672650,0.07,2442.01,1451.76,4.59
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,64871,1140724,-3111933,1906338,56707,11634677,0.49,2440.52,1457.36,4.59
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,-125220,1242129,242434,-1359343,18123,10517075,0.17,2429.67,1467.66,4.58
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,-261276,1037984,-733514,-43194,25363,10747196,0.24,2404.77,1470.40,4.62


### Add target column

In [11]:
# Set the future close price as the target
df_stock['target'] = df_stock['close'].shift(-1)

df_stock.dropna(subset=['target'], inplace=True)

print("Shape of the final dataframe:", df_stock.shape)
print("Columns:", df_stock.columns)
df_stock

Shape of the final dataframe: (1230, 27)
Columns: Index(['date', 'open', 'high', 'low', 'close', 'volume', 'BPS', 'PER', 'PBR',
       'EPS', 'DIV', 'DPS', 'institution', 'other_corporation', 'individual',
       'foreign', 'institution_volume', 'other_corporation_volume',
       'individual_volume', 'foreign_volume', 'shorting_volume', 'buy_volume',
       'shorting_ratio', 'kospi_close', 'exchange_rate', 'us_10y_yield',
       'target'],
      dtype='object')


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield,target
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,126507,2756657,-528398,39485,12993228,0.30,2175.17,1157.95,1.88,55500.0
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,-91483,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.80,55500.0
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,3796,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81,55800.0
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-7458,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83,56800.0
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-201951,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87,58600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2024-12-20,52700,53100,51900,53000,24674774,52002,24.87,1.02,2131,...,1440393,4687038,-4792393,7267,24674774,0.03,2404.15,1444.52,4.52,53500.0
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,1062559,-696548,-830210,9083,13672650,0.07,2442.01,1451.76,4.59,54400.0
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,1140724,-3111933,1906338,56707,11634677,0.49,2440.52,1457.36,4.59,53600.0
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,1242129,242434,-1359343,18123,10517075,0.17,2429.67,1467.66,4.58,53700.0


### Get news

In [12]:
from gnews import GNews

days_for_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
def get_news(name, start_date):
    google_news = GNews()
    google_news.max_results = 1
    google_news.language = 'ko'
    google_news.country = 'KR'

    # Set end date as one day after the start date
    end_date = list(start_date)
    end_date[2] += 1
    if end_date[2] > days_for_month[end_date[1] - 1]:
        end_date[2] = 1
        end_date[1] += 1
    if end_date[1] > 12:
        end_date[1] = 1
        end_date[0] += 1


    google_news.end_date = tuple(end_date)
    google_news.start_date = start_date
    try:
        news = google_news.get_news(name)
    except Exception:
        news = None
    if news is None or not isinstance(news, list):
        return []
    return [item['title'] for item in news if item['title']]

In [13]:
import time
from threading import Thread, Lock
from queue import Queue, Empty

def get_news_threaded(thread_count, start_dates):
    news_list = []
    news_dict = {}
    q = Queue()
    lock = Lock()
    count = 0

    def add_news(index,start_date):
        nonlocal count
        news_titles = get_news(TARGET_TICKER, tuple(start_date))
        q.put((index, news_titles))
        with lock:
            count -= 1

    last = time.time()
    for index, start_date in enumerate(start_dates):
        while count >= thread_count:
            try:
                get_index, news_titles = q.get_nowait()
                news_dict[get_index] = news_titles
            except Empty:
                time.sleep(0.1)
        thread = Thread(target=add_news, args=(index, start_date,))
        thread.start()
        with lock:
            count += 1
        if time.time() - last > 1:
            print(f"Processing: {len(news_dict)}/{len(start_dates)} (thread: {count}/{thread_count})", end='\r')
            last = time.time()

    last = time.time()
    while count > 0:
        index, news_titles = q.get()
        news_dict[index] = news_titles
        if time.time() - last > 1:
            print(f"Processing: {len(news_dict)}/{len(start_dates)} (thread: {count}/{thread_count})", end='\r')
            last = time.time()

    keys = sorted(news_dict.keys())
    for key in keys:
        news_list.append(news_dict[key])
    
    if len(news_dict) != len(start_dates):
        raise ValueError(f"News data count mismatch: {len(news_dict)} vs {len(start_dates)}")
    
    print(f"Processing complete: {len(news_list)}/{len(start_dates)}" + " "*30, end='\r')

    return news_list

In [14]:
def try_refill_na_news():
    # Find empty news data
    error_count = 0
    for i, row in df_stock.iterrows():
        if len(row['news']) == 0:
            print(f"Empty news data found: {str(row['date']).split()[0]}")
            error_count += 1
    
    if error_count > 0:
        print(f"Total {error_count} empty news data found. Collecting additional news data...")
        thread_count = 32
        start_dates = [list(map(int, row['date'].strftime('%Y %m %d').split(" "))) for _, row in df_stock.iterrows() if len(row['news']) == 0]
    
        news_list = get_news_threaded(thread_count, start_dates)
    
        for i, row in df_stock.iterrows():
            if len(row['news']) == 0:
                df_stock.at[i, 'news'] = news_list.pop(0)

In [15]:
def save_news_data():
    # save the final news_list to json
    # data key - news value -> dict[str, str]
    news_dict = {str(row['date']).split()[0]: row['news'] for _, row in df_stock.iterrows()}
    with open(f"{ticker_code}_news.json", 'w', encoding='utf-8') as f:
        import json
        json.dump(news_dict, f, ensure_ascii=False, indent=4)

In [16]:
# Merge news data with stock data
thread_count = 32
start_dates = [list(map(int, row['date'].strftime('%Y %m %d').split(" "))) for _, row in df_stock.iterrows()]

# check if news_list already exists
try:
    with open(f"{ticker_code}_news.json", 'r', encoding='utf-8') as f:
        import json
        news_dict = json.load(f)
    print(f"Existing news data loaded: {len(news_dict)} items")
    news_list = [news_dict.get(str(row['date']).split()[0], "") for _, row in df_stock.iterrows()]
except FileNotFoundError:
    print(f"Existing news data not found, collecting new data...")

    # test before running the full crawl
    print("Testing news collection with a few dates...")
    test_res = get_news_threaded(16, [(2024, 12, 26), (2025, 1, 2), (2025, 1, 3), (2025, 1, 4), (2025, 1, 5)])
    assert len(test_res) == 5, "Test news collection failed, please check the GNews API or network connection."

    # Collect news data using threading
    print("Collecting news data...     ")
    news_list = get_news_threaded(thread_count, start_dates)
    
df_stock['news'] = news_list
try_refill_na_news()
save_news_data()

Existing news data loaded: 1230 items


In [17]:
df_stock

Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield,target,news
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,2756657,-528398,39485,12993228,0.30,2175.17,1157.95,1.88,55500.0,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.80,55500.0,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81,55800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83,56800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87,58600.0,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2024-12-20,52700,53100,51900,53000,24674774,52002,24.87,1.02,2131,...,4687038,-4792393,7267,24674774,0.03,2404.15,1444.52,4.52,53500.0,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,..."
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,-696548,-830210,9083,13672650,0.07,2442.01,1451.76,4.59,54400.0,"[삼성전자, 미국 반도체 보조금 7조원 받는다 - 블로터, SK하이닉스, 내년엔 삼..."
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,-3111933,1906338,56707,11634677,0.49,2440.52,1457.36,4.59,53600.0,"[삼성전자, CES 2025서 ‘가정용 히트펌프 EHS’ 美 시장에 첫 선봬 - s..."
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,242434,-1359343,18123,10517075,0.17,2429.67,1467.66,4.58,53700.0,"[삼성전자 360조 투자 '용인 반도체 국가산단' 2026년 착공 - 중부일보, 삼..."


### Add technical features

In [18]:
from ta.trend import MACD
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.volume import OnBalanceVolumeIndicator

# Make a clean copy to add features to
df_with_ta = df_stock.copy()

print("📊 Adding selected technical indicators...")

# 1. Moving Average Convergence Divergence (MACD)
indicator_macd = MACD(close=df_with_ta['close'], window_slow=26, window_fast=12, window_sign=9, fillna=True)
df_with_ta['macd'] = indicator_macd.macd()
df_with_ta['macd_signal'] = indicator_macd.macd_signal()

# 2. Relative Strength Index (RSI)
df_with_ta['rsi'] = RSIIndicator(close=df_with_ta['close'], window=14, fillna=True).rsi()

# 3. Bollinger Bands (BB)
indicator_bb = BollingerBands(close=df_with_ta['close'], window=20, window_dev=2, fillna=True)
df_with_ta['bb_hband'] = indicator_bb.bollinger_hband() # Upper band
df_with_ta['bb_lband'] = indicator_bb.bollinger_lband() # Lower band
df_with_ta['bb_pband'] = indicator_bb.bollinger_pband() # Percentage band
df_with_ta['bb_wband'] = indicator_bb.bollinger_wband() # Width band

# 4. On-Balance Volume (OBV)
df_with_ta['obv'] = OnBalanceVolumeIndicator(close=df_with_ta['close'], volume=df_with_ta['volume'], fillna=True).on_balance_volume()

# The original df_stock is now the one with the new, selected features
df_stock = df_with_ta

# Drop first row
df_stock.drop(index=df_stock.index[0], inplace=True)

df_stock

📊 Adding selected technical indicators...


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,target,news,macd,macd_signal,rsi,bb_hband,bb_lband,bb_pband,bb_wband,obv
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,55500.0,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams...",23.931624,4.786325,100.000000,55650.000000,55050.000000,0.750000,1.084011,28415483
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,55800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,42.408747,12.310809,100.000000,55682.842712,55117.157288,0.676777,1.021093,38694434
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,56800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,80.333458,25.915339,100.000000,55924.264069,55075.735931,0.853553,1.528880,48704212
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,58600.0,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,188.903150,58.512901,100.000000,56867.068200,54652.931800,0.969709,3.970833,72205383
5,2020-01-09,58400,58600,57400,58600,24102579,35342,9.07,1.66,6461,...,59500.0,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,415.401876,129.890696,100.000000,58579.014945,53887.651722,1.004473,8.342673,96307962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2024-12-20,52700,53100,51900,53000,24674774,52002,24.87,1.02,2131,...,53500.0,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,...",-688.577096,-725.424093,41.085816,57744.850990,51705.149010,0.214390,11.036459,324928906
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,54400.0,"[삼성전자, 미국 반도체 보조금 7조원 받는다 - 블로터, SK하이닉스, 내년엔 삼...",-702.564548,-720.852184,43.517266,57190.125695,51819.874305,0.312858,9.852768,338601556
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,53600.0,"[삼성전자, CES 2025서 ‘가정용 히트펌프 EHS’ 美 시장에 첫 선봬 - s...",-633.722074,-703.426162,47.701284,56354.407004,52265.592996,0.522011,7.528658,350236233
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,53700.0,"[삼성전자 360조 투자 '용인 반도체 국가산단' 2026년 착공 - 중부일보, 삼...",-636.381422,-690.017214,44.542740,56023.107140,52326.892860,0.344435,6.822731,339719158


### Split into train, val, test

In [19]:
from sklearn.model_selection import train_test_split

# Split the df_stock into training, validation, and test sets (80% train, 10% validation, 10% test)
train_val_df, test_df = train_test_split(
    df_stock, test_size=0.1, shuffle=False
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.125, shuffle=False  # 0.8 * 0.125 = 0.1
)

train_df

Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,target,news,macd,macd_signal,rsi,bb_hband,bb_lband,bb_pband,bb_wband,obv
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,55500.0,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams...",23.931624,4.786325,100.000000,55650.000000,55050.000000,0.750000,1.084011,28415483
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,55800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,42.408747,12.310809,100.000000,55682.842712,55117.157288,0.676777,1.021093,38694434
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,56800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,80.333458,25.915339,100.000000,55924.264069,55075.735931,0.853553,1.528880,48704212
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,58600.0,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,188.903150,58.512901,100.000000,56867.068200,54652.931800,0.969709,3.970833,72205383
5,2020-01-09,58400,58600,57400,58600,24102579,35342,9.07,1.66,6461,...,59500.0,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,415.401876,129.890696,100.000000,58579.014945,53887.651722,1.004473,8.342673,96307962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,2023-11-24,72400,72600,71700,71700,6676685,50817,8.90,1.41,8057,...,71300.0,"[삼성전자서비스, ‘2023 서비스 기술경진대회’ 개최 - Samsung Newsr...",984.979817,883.283819,57.025807,74252.145800,67317.854200,0.631953,9.796273,430194706
964,2023-11-27,71500,72100,71100,71300,9113857,50817,8.85,1.40,8057,...,72700.0,"[삼성전자, 베트남 대학생 대상 ‘갤럭시 캠퍼스 프렌즈’ 발대식 열어 - Samsu...",875.886991,881.804453,53.935242,74064.788954,67905.211046,0.551140,8.677295,421080849
965,2023-11-28,71400,72700,71300,72700,13283081,50817,9.02,1.43,8057,...,72700.0,"[삼성전자 2024년 정기 임원 인사 - 바끄로뉴스, 삼성전자, 베트남 대학생 대상...",892.114696,883.866502,61.749031,73804.723305,68745.276695,0.781651,7.098487,434363930
966,2023-11-29,72400,72800,72200,72700,9283933,50817,9.02,1.43,8057,...,72800.0,"[삼성전자 2024년 정기 임원 인사 - 바끄로뉴스, 삼성전자, 39세 상무·46세...",894.662162,886.025634,61.749031,73761.753712,69198.246288,0.767338,6.384314,443647863


### Create batches sliding window

In [20]:
# Create sliding windows for the time series data
def create_sliding_windows(data: pd.DataFrame, sequence_length=10) -> list[pd.DataFrame]:
    results = []
    for i in range(len(data) - sequence_length):
        results.append(data.iloc[i:i + sequence_length].copy())
    return results

SEQ_LENGTH = 20 # Length of the sliding window
# Create sliding windows for the training, validation, and test sets
train = create_sliding_windows(train_df, SEQ_LENGTH)
val = create_sliding_windows(val_df, SEQ_LENGTH)
test = create_sliding_windows(test_df, SEQ_LENGTH)

train[0]

Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,target,news,macd,macd_signal,rsi,bb_hband,bb_lband,bb_pband,bb_wband,obv
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,55500.0,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams...",23.931624,4.786325,100.0,55650.0,55050.0,0.75,1.084011,28415483
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,55800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,42.408747,12.310809,100.0,55682.842712,55117.157288,0.676777,1.021093,38694434
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,56800.0,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,80.333458,25.915339,100.0,55924.264069,55075.735931,0.853553,1.52888,48704212
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,58600.0,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,188.90315,58.512901,100.0,56867.0682,54652.9318,0.969709,3.970833,72205383
5,2020-01-09,58400,58600,57400,58600,24102579,35342,9.07,1.66,6461,...,59500.0,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,415.401876,129.890696,100.0,58579.014945,53887.651722,1.004473,8.342673,96307962
6,2020-01-10,58800,59700,58300,59500,16000170,35342,9.21,1.68,6461,...,60000.0,"[삼성전자 액면분할 전 2000억 '몰빵' 슈퍼개미 근황은… - 한국경제, 스마트홈...",659.919089,235.896375,100.0,59853.229637,53546.770363,0.943989,11.122503,112308132
7,2020-01-13,59600,60000,59100,60000,11359139,35342,9.29,1.7,6461,...,60000.0,"[삼성전자, 미국 5G·4G LTE 망설계·최적화 전문기업 텔레월드 솔루션즈 인수 ...",883.858,365.4887,100.0,60781.883463,53443.116537,0.893458,12.849669,123667271
8,2020-01-14,60400,61000,59900,60000,16906295,35342,9.29,1.7,6461,...,59000.0,"[삼성전자, 미국 5G·4G LTE 망설계·최적화 전문기업 텔레월드 솔루션즈 인수 ...",1049.236257,502.238211,100.0,61340.022753,53526.643914,0.828496,13.604258,140573566
9,2020-01-15,59500,59600,58900,59000,14300928,35342,9.13,1.67,6461,...,60700.0,"[삼성전자, 2020년형 ‘무풍에어컨’·‘무풍큐브’ 공개 - Samsung News...",1087.076928,619.205955,77.966052,61413.558552,53766.441448,0.684383,13.27855,126272638
10,2020-01-16,59100,60700,59000,60700,14381774,35342,9.39,1.72,6461,...,61300.0,"[삼성전자, ‘갤럭시 A10e’ 출시 - Samsung Newsroom, [Who ...",1239.948372,743.354438,84.299492,61933.260967,53812.193578,0.848141,14.032633,140654412


In [21]:
len(train), len(val), len(test)

(947, 119, 103)

## Save

In [22]:
# Save the train, val, and test sets (list of DataFrames) to parquet files
# Since each set is a list of DataFrames (sliding windows), we'll save them separately
# and add metadata to reconstruct the sliding windows

import os
import pandas as pd
import json

# Create a directory for the dataset if it doesn't exist
dataset_dir = f"{ticker_code}_dataset"
os.makedirs(dataset_dir, exist_ok=True)

# Save train set
train_combined = pd.concat([df.assign(window_id=i) for i, df in enumerate(train)], ignore_index=True)
train_combined.to_parquet(f"{dataset_dir}/train.parquet", index=False)

# Save validation set
val_combined = pd.concat([df.assign(window_id=i) for i, df in enumerate(val)], ignore_index=True)
val_combined.to_parquet(f"{dataset_dir}/val.parquet", index=False)

# Save test set
test_combined = pd.concat([df.assign(window_id=i) for i, df in enumerate(test)], ignore_index=True)
test_combined.to_parquet(f"{dataset_dir}/test.parquet", index=False)

# Save metadata about the sliding windows
metadata = {
    "seq_length": SEQ_LENGTH,
    "train_windows": len(train),
    "val_windows": len(val),
    "test_windows": len(test),
    "ticker_code": ticker_code,
    "target_ticker": TARGET_TICKER
}

with open(f"{dataset_dir}/metadata.json", 'w', encoding="UTF-8") as f:
    json.dump(metadata, f, indent=4, ensure_ascii=False)

print(f"Dataset saved to {dataset_dir}/ directory")
print(f"Train windows: {len(train)}, Val windows: {len(val)}, Test windows: {len(test)}")
print("\n--- Dataset saved successfully ---")

Dataset saved to 005930_dataset/ directory
Train windows: 947, Val windows: 119, Test windows: 103

--- Dataset saved successfully ---


## Load (test)

In [23]:
# Load the train, val, and test sets (list of DataFrames) from parquet files
import os
import json
import pandas as pd

dataset_dir = f"{ticker_code}_dataset"
train = []
val = []
test = []

if os.path.exists(dataset_dir):
    print(f"Loading dataset from {dataset_dir}/ directory...")
    
    # Load metadata
    with open(f"{dataset_dir}/metadata.json", 'r') as f:
        metadata = json.load(f)
    
    SEQ_LENGTH = metadata["seq_length"]
    print(f"Sequence length: {SEQ_LENGTH}")
    print(f"Ticker: {metadata['target_ticker']} ({metadata['ticker_code']})")
    assert SEQ_LENGTH > 0, "Sequence length must be greater than 0"
    assert ticker_code == metadata['ticker_code'], "Ticker code mismatch in metadata"
    assert TARGET_TICKER == metadata['target_ticker'], "Target ticker mismatch in metadata"
    
    # Load and reconstruct train set
    train_combined = pd.read_parquet(f"{dataset_dir}/train.parquet")
    for window_id in range(metadata["train_windows"]):
        window_df = train_combined[train_combined['window_id'] == window_id].drop('window_id', axis=1).reset_index(drop=True)
        train.append(window_df)
    
    # Load and reconstruct validation set
    val_combined = pd.read_parquet(f"{dataset_dir}/val.parquet")
    for window_id in range(metadata["val_windows"]):
        window_df = val_combined[val_combined['window_id'] == window_id].drop('window_id', axis=1).reset_index(drop=True)
        val.append(window_df)
    
    # Load and reconstruct test set
    test_combined = pd.read_parquet(f"{dataset_dir}/test.parquet")
    for window_id in range(metadata["test_windows"]):
        window_df = test_combined[test_combined['window_id'] == window_id].drop('window_id', axis=1).reset_index(drop=True)
        test.append(window_df)
    
    print(f"Loaded - Train windows: {len(train)}, Val windows: {len(val)}, Test windows: {len(test)}")
    print("Dataset loaded successfully!")
else:
    print(f"Dataset directory {dataset_dir} not found. Please run the dataset creation cells first.")


Loading dataset from 005930_dataset/ directory...
Sequence length: 20
Ticker: 삼성전자 (005930)
Loaded - Train windows: 947, Val windows: 119, Test windows: 103
Dataset loaded successfully!
