# Init

In [1]:
# List of tickers for Korean stocks
tickers = {
    "삼성전자": "005930", "SK": "034730", "한화": "000880",
    "두산": "000150", "기아": "000270", "현대차": "005380",
    "LG": "003550", "NAVER": "035420", "카카오": "035720", "롯데지주": "004990"
}

# Date range for the stock data
start_date = "20200101"
end_date = "20250101"

In [2]:
# Target ticker for analysis
TARGET_TICKER = "삼성전자"
ticker_code = tickers[TARGET_TICKER]

In [264]:
import numpy as np
import torch

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x229bfe66830>

# Create Dataset

### Get stock data

In [265]:
import os
import hashlib
import pandas as pd
from pykrx import stock
from datetime import datetime, timedelta

def split_date_range(start_date: str, end_date: str) -> list:
    start = datetime.strptime(start_date, "%Y%m%d")
    end = datetime.strptime(end_date, "%Y%m%d")
    delta = end - start
    split_ranges = []
    # Split the date range into 2-year intervals
    for i in range(0, delta.days + 1, 365 * 2):
        split_start = start + timedelta(days=i)
        split_end = min(end, split_start + timedelta(days=365 * 2 - 1))
        split_ranges.append((split_start.strftime("%Y%m%d"), split_end.strftime("%Y%m%d")))
    return split_ranges

def get_stock_data(ticker_code: str, start_date: str, end_date: str) -> pd.DataFrame:
    key = f"{ticker_code}_{start_date}_{end_date}_{pd.Timestamp.now().strftime('%Y%m%d')}"
    hashed_cache_name = hashlib.sha1(key.encode()).hexdigest()
    cache_file = f"data/{hashed_cache_name}.csv"
    if not os.path.exists('data'):
        os.makedirs('data')
    try:
        df_cached = pd.read_csv(cache_file)
        print("Loaded cached data")
        return df_cached
    except FileNotFoundError:
        df_ohlcv = stock.get_market_ohlcv_by_date(start_date, end_date, ticker_code)
        df_ohlcv.reset_index(inplace=True)
        df_ohlcv.rename(columns={'날짜':'date', '시가':'open', '고가':'high', '저가':'low', '종가':'close', '거래량':'volume'}, inplace=True)
        df_ohlcv = df_ohlcv[['date', 'open', 'high', 'low', 'close', 'volume']]
        
        df_fundamental = stock.get_market_fundamental_by_date(start_date, end_date, ticker_code)
        df_fundamental.reset_index(inplace=True)
        df_fundamental.rename(columns={'날짜':'date'}, inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_fundamental, on='date', how='left')

        df_trading_value = stock.get_market_trading_value_by_date(start_date, end_date, ticker_code)
        df_trading_value.reset_index(inplace=True)
        df_trading_value.rename(columns={
            '날짜': 'date',
            '기관합계': 'institution',
            '외국인합계': 'foreign',
            '개인': 'individual',
            '기타법인': 'other_corporation',
        }, inplace=True)
        df_trading_value.drop(columns=['전체'], inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_trading_value, on='date', how='left')

        df_trading_volume = stock.get_market_trading_volume_by_date(start_date, end_date, ticker_code)
        df_trading_volume.reset_index(inplace=True)
        df_trading_volume.rename(columns={
            '날짜': 'date',
            '기관합계': 'institution_volume',
            '외국인합계': 'foreign_volume',
            '개인': 'individual_volume',
            '기타법인': 'other_corporation_volume',
        }, inplace=True)
        df_trading_volume.drop(columns=['전체'], inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_trading_volume, on='date', how='left')
        
        split_ranges = split_date_range(start_date, end_date)
        df_shortings = []
        for start, end in split_ranges:
            df_shorting = stock.get_shorting_volume_by_date(start, end, ticker_code)
            df_shorting.reset_index(inplace=True)
            df_shortings.append(df_shorting)
        df_shorting_volume = pd.concat(df_shortings, ignore_index=True)
        df_shorting_volume.rename(columns={
            '날짜': 'date',
            '공매도': 'shorting_volume',
            '매수': 'buy_volume',
            '비중': 'shorting_ratio',
        }, inplace=True)
        df_ohlcv = pd.merge(df_ohlcv, df_shorting_volume, on='date', how='left')

        df_ohlcv.to_csv(cache_file, index=False)
        print("Fetched new data and saved to cache")
        return df_ohlcv

In [266]:
df_stocks = []
for ticker_name, ticker_code in tickers.items():
    # Fetching stock data for the target ticker
    df_stock = get_stock_data(ticker_code, start_date, end_date)
    df_stocks.append(df_stock)

Fetched new data and saved to cache


KeyboardInterrupt: 

### Merge with other metrics

In [267]:
def get_index_data(index_code: str, start_date: str, end_date: str) -> pd.DataFrame:
    key = f"{index_code}_{start_date}_{end_date}_{pd.Timestamp.now().strftime('%Y%m%d')}"
    hashed_cache_name = hashlib.sha1(key.encode()).hexdigest()
    cache_file = f"data/{hashed_cache_name}.csv"
    if not os.path.exists('data'):
        os.makedirs('data')
    try:
        df_cached = pd.read_csv(cache_file)
        print("Loaded cached data")
        return df_cached
    except FileNotFoundError:
        df_ohlcv = stock.get_index_ohlcv_by_date(start_date, end_date, index_code)
        df_ohlcv.reset_index(inplace=True)
        df_ohlcv.rename(columns={'날짜':'date', '시가':'open', '고가':'high', '저가':'low', '종가':'close', '거래량':'volume'}, inplace=True)
        df_ohlcv = df_ohlcv[['date', 'open', 'high', 'low', 'close', 'volume']]
        df_ohlcv.to_csv(cache_file, index=False)
        print("Fetched new data and saved to cache")
        return df_ohlcv

In [268]:
from fredapi import Fred

def get_fred_data(series_id: str, start_date: str, end_date: str) -> pd.DataFrame:
    key = f"{series_id}_{start_date}_{end_date}_{pd.Timestamp.now().strftime('%Y%m%d')}"
    hashed_cache_name = hashlib.sha1(key.encode()).hexdigest()
    cache_file = f"data/{hashed_cache_name}.csv"
    if not os.path.exists('data'):
        os.makedirs('data')
    try:
        df_cached = pd.read_csv(cache_file)
        print("Loaded cached data")
        return df_cached
    except FileNotFoundError:
        fred = Fred(api_key='1813ca70b0692eac480e11a0691dac96')
        df = fred.get_series(series_id, start_date, end_date)
        df = df.reset_index()
        df.columns = ['date', series_id]
        df.to_csv(cache_file, index=False)
        print("Fetched new data and saved to cache")
        return df

In [269]:
# KOSPI data
for df_stock in df_stocks:
    df_kospi = get_index_data("1001", start_date, end_date)
    df_kospi.rename(columns={'close':'kospi_close'}, inplace=True)
    df_stock = pd.merge(df_stock, df_kospi[['date', 'kospi_close']], on='date', how='left')
df_stocks

Loaded cached data


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,individual,foreign,institution_volume,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,152706355800,-29293793400,-2354766,126507,2756657,-528398,39485,12993228,0.30,2175.17
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,66485169500,61978391100,-2228329,-91483,1199681,1120131,218704,15422255,1.42,2176.46
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,23109555500,43009120100,-1199654,3796,418722,777136,167348,10278951,1.63,2155.07
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-722044800,-1833294600,51896,-7458,-10139,-34299,142717,10009778,1.43,2175.54
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-234778050700,242001549100,73413,-201951,-4130647,4259185,642430,23501171,2.73,2151.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,-37622295300,-44488165300,464199,1062559,-696548,-830210,9083,13672650,0.07,2442.01
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,-168358856200,103093057000,64871,1140724,-3111933,1906338,56707,11634677,0.49,2440.52
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,13274611300,-73581477500,-125220,1242129,242434,-1359343,18123,10517075,0.17,2429.67
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,-39507459700,-2171267300,-261276,1037984,-733514,-43194,25363,10747196,0.24,2404.77


In [270]:
# Define FRED series IDs to fetch
fred_series = {
    'DEXKOUS': 'exchange_rate', # KRW/USD Exchange Rate
    'DGS10': 'us_10y_yield'     # 10-Year Treasury Constant Maturity Rate
}

df_stock['date'] = pd.to_datetime(df_stock['date'])

for series_id, col_name in fred_series.items():
    print(f"Fetching {col_name} ({series_id}) from FRED...")
    df_fred = get_fred_data(series_id, start_date, end_date)
    df_fred.rename(columns={series_id: col_name}, inplace=True)
    df_fred['date'] = pd.to_datetime(df_fred['date'])
    df_stock = pd.merge(df_stock, df_fred, on='date', how='left')

# Forward-fill missing values (for weekends/holidays) and then back-fill
df_stock.ffill(inplace=True)
df_stock.bfill(inplace=True)

print("\nFRED data merged successfully.")
df_stock.head()

Fetching exchange_rate (DEXKOUS) from FRED...
Loaded cached data
Fetching us_10y_yield (DGS10) from FRED...
Loaded cached data

FRED data merged successfully.


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,institution_volume,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,-2354766,126507,2756657,-528398,39485,12993228,0.3,2175.17,1157.95,1.88
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,-2228329,-91483,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.8
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,-1199654,3796,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,51896,-7458,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,73413,-201951,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87


In [271]:
# Final stock data with other metrics
df_stock

Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,institution_volume,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,-2354766,126507,2756657,-528398,39485,12993228,0.30,2175.17,1157.95,1.88
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,-2228329,-91483,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.80
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,-1199654,3796,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,51896,-7458,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,73413,-201951,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,464199,1062559,-696548,-830210,9083,13672650,0.07,2442.01,1451.76,4.59
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,64871,1140724,-3111933,1906338,56707,11634677,0.49,2440.52,1457.36,4.59
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,-125220,1242129,242434,-1359343,18123,10517075,0.17,2429.67,1467.66,4.58
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,-261276,1037984,-733514,-43194,25363,10747196,0.24,2404.77,1470.40,4.62


### Add target column

In [272]:
# Calculate the future close price as the rolling mean of the next n days
n=5
df_stock['future_close'] = df_stock['close'].rolling(window=n).mean().shift(-n)
df_stock['target_return'] = (df_stock['future_close'] - df_stock['close']) / df_stock['close']

threshold = 0.02  # threshold for rise/drop classification
conditions = [
    df_stock['target_return'] < -threshold,  # below -threshold is considered a drop
    (df_stock['target_return'] >= -threshold) & (df_stock['target_return'] <= threshold), # neutral
    df_stock['target_return'] > threshold  # over +threshold is considered a rise
]

choices = [0, 1, 2]
df_stock['target'] = np.select(conditions, choices, default=1)

df_stock.dropna(subset=['target'], inplace=True)
df_stock.drop(columns=['future_close'], inplace=True)
df_stock.drop(columns=['target_return'], inplace=True)

print("Shape of the final dataframe:", df_stock.shape)
print("Columns:", df_stock.columns)
df_stock

Shape of the final dataframe: (1231, 27)
Columns: Index(['date', 'open', 'high', 'low', 'close', 'volume', 'BPS', 'PER', 'PBR',
       'EPS', 'DIV', 'DPS', 'institution', 'other_corporation', 'individual',
       'foreign', 'institution_volume', 'other_corporation_volume',
       'individual_volume', 'foreign_volume', 'shorting_volume', 'buy_volume',
       'shorting_ratio', 'kospi_close', 'exchange_rate', 'us_10y_yield',
       'target'],
      dtype='object')


Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,other_corporation_volume,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield,target
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,126507,2756657,-528398,39485,12993228,0.30,2175.17,1157.95,1.88,2
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,-91483,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.80,2
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,3796,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81,2
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-7458,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83,2
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-201951,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,1062559,-696548,-830210,9083,13672650,0.07,2442.01,1451.76,4.59,1
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,1140724,-3111933,1906338,56707,11634677,0.49,2440.52,1457.36,4.59,1
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,1242129,242434,-1359343,18123,10517075,0.17,2429.67,1467.66,4.58,1
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,1037984,-733514,-43194,25363,10747196,0.24,2404.77,1470.40,4.62,1


### Get news

In [273]:
from gnews import GNews

days_for_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
def get_news(name, start_date):
    google_news = GNews()
    google_news.max_results = 10
    google_news.language = 'ko'
    google_news.country = 'KR'

    # Set end date as one day after the start date
    end_date = list(start_date)
    end_date[2] += 1
    if end_date[2] > days_for_month[end_date[1] - 1]:
        end_date[2] = 1
        end_date[1] += 1
    if end_date[1] > 12:
        end_date[1] = 1
        end_date[0] += 1


    google_news.end_date = tuple(end_date)
    google_news.start_date = start_date
    try:
        news = google_news.get_news(name)
    except Exception:
        news = None
    if news is None or not isinstance(news, list):
        return []
    return [item['title'] for item in news if item['title']]

In [274]:
import time
from threading import Thread, Lock
from queue import Queue, Empty

def get_news_threaded(thread_count, start_dates):
    news_list = []
    news_dict = {}
    q = Queue()
    lock = Lock()
    count = 0

    def add_news(index,start_date):
        nonlocal count
        news_titles = get_news(TARGET_TICKER, tuple(start_date))
        q.put((index, news_titles))
        with lock:
            count -= 1

    last = time.time()
    for index, start_date in enumerate(start_dates):
        while count >= thread_count:
            try:
                get_index, news_titles = q.get_nowait()
                news_dict[get_index] = news_titles
            except Empty:
                time.sleep(0.1)
        thread = Thread(target=add_news, args=(index, start_date,))
        thread.start()
        with lock:
            count += 1
        if time.time() - last > 1:
            print(f"Processing: {len(news_dict)}/{len(start_dates)} (thread: {count}/{thread_count})", end='\r')
            last = time.time()

    last = time.time()
    while count > 0:
        index, news_titles = q.get()
        news_dict[index] = news_titles
        if time.time() - last > 1:
            print(f"Processing: {len(news_dict)}/{len(start_dates)} (thread: {count}/{thread_count})", end='\r')
            last = time.time()

    keys = sorted(news_dict.keys())
    for key in keys:
        news_list.append(news_dict[key])
    
    if len(news_dict) != len(start_dates):
        raise ValueError(f"News data count mismatch: {len(news_dict)} vs {len(start_dates)}")
    
    print(f"Processing complete: {len(news_list)}/{len(start_dates)}" + " "*30, end='\r')

    return news_list

In [275]:
def try_refill_na_news():
    # Find empty news data
    error_count = 0
    for i, row in df_stock.iterrows():
        if len(row['news']) == 0:
            print(f"Empty news data found: {str(row['date']).split()[0]}")
            error_count += 1
    
    if error_count > 0:
        print(f"Total {error_count} empty news data found. Collecting additional news data...")
        thread_count = 32
        start_dates = [list(map(int, row['date'].strftime('%Y %m %d').split(" "))) for _, row in df_stock.iterrows() if len(row['news']) == 0]
    
        news_list = get_news_threaded(thread_count, start_dates)
    
        for i, row in df_stock.iterrows():
            if len(row['news']) == 0:
                df_stock.at[i, 'news'] = news_list.pop(0)

In [276]:
def save_news_data():
    # save the final news_list to json
    # data key - news value -> dict[str, str]
    news_dict = {str(row['date']).split()[0]: row['news'] for _, row in df_stock.iterrows()}
    with open(f"{ticker_code}_news.json", 'w', encoding='utf-8') as f:
        import json
        json.dump(news_dict, f, ensure_ascii=False, indent=4)

In [277]:
# Merge news data with stock data
thread_count = 32
start_dates = [list(map(int, row['date'].strftime('%Y %m %d').split(" "))) for _, row in df_stock.iterrows()]

# check if news_list already exists
try:
    with open(f"{ticker_code}_news.json", 'r', encoding='utf-8') as f:
        import json
        news_dict = json.load(f)
    print(f"Existing news data loaded: {len(news_dict)} items")
    news_list = [news_dict.get(str(row['date']).split()[0], "") for _, row in df_stock.iterrows()]
except FileNotFoundError:
    print(f"Existing news data not found, collecting new data...")

    # test before running the full crawl
    print("Testing news collection with a few dates...")
    test_res = get_news_threaded(16, [(2024, 12, 26), (2025, 1, 2), (2025, 1, 3), (2025, 1, 4), (2025, 1, 5)])
    assert len(test_res) == 5, "Test news collection failed, please check the GNews API or network connection."

    # Collect news data using threading
    print("Collecting news data...")
    news_list = get_news_threaded(thread_count, start_dates)
    
df_stock['news'] = news_list
try_refill_na_news()
save_news_data()

Existing news data loaded: 1226 items
Empty news data found: 2024-12-23
Empty news data found: 2024-12-24
Empty news data found: 2024-12-26
Empty news data found: 2024-12-27
Empty news data found: 2024-12-30
Total 5 empty news data found. Collecting additional news data...
Processing complete: 5/5                              

In [278]:
df_stock

Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield,target,news
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,2756657,-528398,39485,12993228,0.30,2175.17,1157.95,1.88,2,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.80,2,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87,2,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,-696548,-830210,9083,13672650,0.07,2442.01,1451.76,4.59,1,"[삼성전자, 미국 반도체 보조금 7조원 받는다 - 블로터, SK하이닉스, 내년엔 삼..."
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,-3111933,1906338,56707,11634677,0.49,2440.52,1457.36,4.59,1,"[삼성전자, CES 2025서 ‘가정용 히트펌프 EHS’ 美 시장에 첫 선봬 - s..."
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,242434,-1359343,18123,10517075,0.17,2429.67,1467.66,4.58,1,"[삼성전자 360조 투자 '용인 반도체 국가산단' 2026년 착공 - 중부일보, 삼..."
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,-733514,-43194,25363,10747196,0.24,2404.77,1470.40,4.62,1,"[삼성전자, 월급 2배 주더니 '연봉 절반' 또 쏜다…파격 성과급 - 한국경제, 삼..."


### Analyze Sentiment of news

In [279]:
import numpy as np
from transformers.pipelines import pipeline

sentiment_analyzer = None
sentiment_mapping = {
    'Very Positive': 2,
    'Positive': 1,
    'Neutral': 0,
    'Negative': -1,
    'Very Negative': -2
}

def get_sentiments(news_list) -> list[float]:
    global sentiment_analyzer
    if sentiment_analyzer is None:
        # Load the sentiment analysis model
        sentiment_analyzer = pipeline('sentiment-analysis', model='tabularisai/multilingual-sentiment-analysis')
    
    sentiments = []
    last = time.time()
    for news in news_list:
        if not news or len(news) == 0:
            sentiments.append(0.0)
            continue

        daily_sentiments = []
        for per_news in news:
            if not isinstance(per_news, str) or len(per_news) == 0:
                continue

            if "Samsung Newsroom" in per_news or "딜사이트" in per_news:
                continue

            result = sentiment_analyzer(per_news[:512]) # Limit input to 512 characters
            if result is None:
                raise ValueError("Sentiment analysis result is None. Please check if the model is loaded correctly.")
            if not isinstance(result, list) or len(result) == 0:
                raise ValueError("Sentiment analysis result is empty. Please check if the input data is correct.")

            label = result[0]['label']
            score = result[0]['score']
            
            weighted_score = sentiment_mapping.get(label, 0) * score
            daily_sentiments.append(weighted_score)
            if time.time() - last > 1:
                print(f"Processing: {len(sentiments)}/{len(df_stock)} | News content: {per_news[:30]}...", end='\r')
                last = time.time()
        sentiments.append(np.mean(daily_sentiments) if daily_sentiments else 0)
    print(f"Processing complete: {len(sentiments)}/{len(df_stock)}" + " "*30, end='\r')
    
    return sentiments

In [280]:
def save_sentiment_data():
    # save the final sentiment_list to json
    # data key - sentiment value -> dict[str, str]
    sentiment_dict = {str(row['date']).split()[0]: row['sentiment'] for _, row in df_stock.iterrows()}
    with open(f"{ticker_code}_sentiment.json", 'w', encoding='utf-8') as f:
        import json
        json.dump(sentiment_dict, f, ensure_ascii=False, indent=4)

In [281]:
try:
    sentiment_list = []
    with open(f"{ticker_code}_sentiment.json", 'r', encoding='utf-8') as f:
        import json
        sentiment_dict = json.load(f)
    print(f"Existing sentiment data loaded: {len(sentiment_dict)} items")
    for date in df_stock['date']:
        date_str = str(date).split()[0]
        if date_str in sentiment_dict:
            sentiment_list.append(float(sentiment_dict[date_str]))
        else:
            print(f"Missing sentiment data for date: {date_str}, collecting new data...")
            news = df_stock[df_stock['date'] == date]['news'].values[0]
            sentiment = get_sentiments([news])[0]
            sentiment_list.append(sentiment)
except FileNotFoundError:
    print(f"Existing sentiment data not found, collecting new data...")

    # Collect sentiment data using threading
    print("Collecting sentiment data...")
    sentiment_list = get_sentiments(df_stock['news'])
    
df_stock['sentiment'] = sentiment_list
save_sentiment_data()

print("\n--- Final dataset with sentiment analysis ---")

Existing sentiment data loaded: 1226 items
Missing sentiment data for date: 2024-12-23, collecting new data...


Device set to use xpu:0


Missing sentiment data for date: 2024-12-24, collecting new data....
Missing sentiment data for date: 2024-12-26, collecting new data...
Missing sentiment data for date: 2024-12-27, collecting new data...
Missing sentiment data for date: 2024-12-30, collecting new data...
Processing complete: 1/1231                              
--- Final dataset with sentiment analysis ---


In [282]:
# remove news column
df_stock.drop(columns=['news'], inplace=True)
df_stock

Unnamed: 0,date,open,high,low,close,volume,BPS,PER,PBR,EPS,...,individual_volume,foreign_volume,shorting_volume,buy_volume,shorting_ratio,kospi_close,exchange_rate,us_10y_yield,target,sentiment
0,2020-01-02,55500,56000,55000,55200,12993228,35342,8.54,1.56,6461,...,2756657,-528398,39485,12993228,0.30,2175.17,1157.95,1.88,2,0.229161
1,2020-01-03,56000,56600,54900,55500,15422255,35342,8.59,1.57,6461,...,1199681,1120131,218704,15422255,1.42,2176.46,1165.15,1.80,2,0.418705
2,2020-01-06,54900,55600,54600,55500,10278951,35342,8.59,1.57,6461,...,418722,777136,167348,10278951,1.63,2155.07,1167.49,1.81,2,0.489122
3,2020-01-07,55700,56400,55600,55800,10009778,35342,8.64,1.58,6461,...,-10139,-34299,142717,10009778,1.43,2175.54,1166.21,1.83,2,0.385063
4,2020-01-08,56200,57400,55900,56800,23501171,35342,8.79,1.61,6461,...,-4130647,4259185,642430,23501171,2.73,2151.31,1170.61,1.87,2,0.398799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,53400,54000,53300,53500,13672650,52002,25.11,1.03,2131,...,-696548,-830210,9083,13672650,0.07,2442.01,1451.76,4.59,1,-0.073735
1227,2024-12-24,53700,54500,53600,54400,11634677,52002,25.53,1.05,2131,...,-3111933,1906338,56707,11634677,0.49,2440.52,1457.36,4.59,1,0.064583
1228,2024-12-26,54500,54600,53500,53600,10517075,52002,25.15,1.03,2131,...,242434,-1359343,18123,10517075,0.17,2429.67,1467.66,4.58,1,-0.077989
1229,2024-12-27,53500,54100,53200,53700,10747196,52002,25.20,1.03,2131,...,-733514,-43194,25363,10747196,0.24,2404.77,1470.40,4.62,1,0.252955


### Add technical features

In [283]:
# from ta import add_all_ta_features

# df_stock = add_all_ta_features(df_stock.copy(), open="open", high="high", low="low", close="close", volume="volume", fillna=True)

In [284]:
from ta.trend import MACD
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.volume import OnBalanceVolumeIndicator

# Make a clean copy to add features to
df_with_ta = df_stock.copy()

print("📊 Adding selected technical indicators...")

# 1. Moving Average Convergence Divergence (MACD)
indicator_macd = MACD(close=df_with_ta['close'], window_slow=26, window_fast=12, window_sign=9, fillna=True)
df_with_ta['macd'] = indicator_macd.macd()
df_with_ta['macd_signal'] = indicator_macd.macd_signal()

# 2. Relative Strength Index (RSI)
df_with_ta['rsi'] = RSIIndicator(close=df_with_ta['close'], window=14, fillna=True).rsi()

# 3. Bollinger Bands (BB)
indicator_bb = BollingerBands(close=df_with_ta['close'], window=20, window_dev=2, fillna=True)
df_with_ta['bb_hband'] = indicator_bb.bollinger_hband() # Upper band
df_with_ta['bb_lband'] = indicator_bb.bollinger_lband() # Lower band
df_with_ta['bb_pband'] = indicator_bb.bollinger_pband() # Percentage band
df_with_ta['bb_wband'] = indicator_bb.bollinger_wband() # Width band

# 4. On-Balance Volume (OBV)
df_with_ta['obv'] = OnBalanceVolumeIndicator(close=df_with_ta['close'], volume=df_with_ta['volume'], fillna=True).on_balance_volume()

# 5. Rolling Standard Deviation
df_with_ta['volatility'] = df_with_ta['close'].pct_change().rolling(window=20).std().fillna(0)

# The original df_stock is now the one with the new, selected features
df_stock = df_with_ta

# Calculate the difference between current and previous values for selected features
features_to_diff = ['close', 'open', 'high', 'low', 'volume', 'kospi_close', 'exchange_rate']
for col in features_to_diff:
    df_stock[f'{col}_diff'] = df_stock[col].diff().fillna(0)

# drop the original columns that were used to calculate the differences
df_stock.drop(columns=features_to_diff, inplace=True)

# Drop first row
df_stock.drop(index=df_stock.index[0], inplace=True)

df_stock

📊 Adding selected technical indicators...


Unnamed: 0,date,BPS,PER,PBR,EPS,DIV,DPS,institution,other_corporation,individual,...,bb_wband,obv,volatility,close_diff,open_diff,high_diff,low_diff,volume_diff,kospi_close_diff,exchange_rate_diff
1,2020-01-03,35342,8.59,1.57,6461,2.55,1416,-123332624300,-5130936300,66485169500,...,1.084011,28415483,0.000000,300.0,500.0,600.0,-100.0,2429027.0,1.29,7.20
2,2020-01-06,35342,8.59,1.57,6461,2.55,1416,-66328642600,209967000,23109555500,...,1.021093,38694434,0.000000,0.0,-1100.0,-1000.0,-300.0,-5143304.0,-21.39,2.34
3,2020-01-07,35342,8.64,1.58,6461,2.54,1416,2971900200,-416560800,-722044800,...,1.528880,48704212,0.000000,300.0,800.0,800.0,1000.0,-269173.0,20.47,-1.28
4,2020-01-08,35342,8.79,1.61,6461,2.49,1416,4221083100,-11444581500,-234778050700,...,3.970833,72205383,0.000000,1000.0,500.0,1000.0,300.0,13491393.0,-24.23,4.40
5,2020-01-09,35342,9.07,1.66,6461,2.42,1416,-135036047800,-7772851300,-72444657000,...,8.342673,96307962,0.000000,1800.0,2200.0,1200.0,1500.0,601408.0,35.14,-11.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,2024-12-23,52002,25.11,1.03,2131,2.70,1444,24961655300,57148805300,-37622295300,...,9.852768,338601556,0.017277,500.0,700.0,900.0,1400.0,-11002124.0,37.86,7.24
1227,2024-12-24,52002,25.53,1.05,2131,2.65,1444,3599664600,61666134600,-168358856200,...,7.528658,350236233,0.017736,900.0,300.0,500.0,300.0,-2037973.0,-1.49,5.60
1228,2024-12-26,52002,25.15,1.03,2131,2.69,1444,-6930760500,67237626700,13274611300,...,6.822731,339719158,0.016426,-800.0,800.0,100.0,-100.0,-1117602.0,-10.85,10.30
1229,2024-12-27,52002,25.20,1.03,2131,2.69,1444,-13922536000,55601263000,-39507459700,...,6.486702,350466354,0.016206,100.0,-1000.0,-500.0,-300.0,230121.0,-24.90,2.74


### Split into train, val, test

In [285]:
from sklearn.model_selection import train_test_split

# Split the df_stock into training, validation, and test sets (60% train, 20% validation, 20% test)
train_val_df, test_df = train_test_split(
    df_stock, test_size=0.2, shuffle=False
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.25, shuffle=False  # 0.8 * 0.25 = 0.2
)

train_df

Unnamed: 0,date,BPS,PER,PBR,EPS,DIV,DPS,institution,other_corporation,individual,...,bb_wband,obv,volatility,close_diff,open_diff,high_diff,low_diff,volume_diff,kospi_close_diff,exchange_rate_diff
1,2020-01-03,35342,8.59,1.57,6461,2.55,1416,-123332624300,-5130936300,66485169500,...,1.084011,28415483,0.000000,300.0,500.0,600.0,-100.0,2429027.0,1.29,7.20
2,2020-01-06,35342,8.59,1.57,6461,2.55,1416,-66328642600,209967000,23109555500,...,1.021093,38694434,0.000000,0.0,-1100.0,-1000.0,-300.0,-5143304.0,-21.39,2.34
3,2020-01-07,35342,8.64,1.58,6461,2.54,1416,2971900200,-416560800,-722044800,...,1.528880,48704212,0.000000,300.0,800.0,800.0,1000.0,-269173.0,20.47,-1.28
4,2020-01-08,35342,8.79,1.61,6461,2.49,1416,4221083100,-11444581500,-234778050700,...,3.970833,72205383,0.000000,1000.0,500.0,1000.0,300.0,13491393.0,-24.23,4.40
5,2020-01-09,35342,9.07,1.66,6461,2.42,1416,-135036047800,-7772851300,-72444657000,...,8.342673,96307962,0.000000,1800.0,2200.0,1200.0,1500.0,601408.0,35.14,-11.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
734,2022-12-20,43611,10.14,1.34,5777,2.46,1444,11974313900,712370800,77688519200,...,6.907278,133715861,0.014840,-900.0,-500.0,-800.0,-600.0,1588574.0,-18.88,-11.88
735,2022-12-21,43611,10.04,1.33,5777,2.49,1444,14111417100,-1917309700,86343338000,...,7.496895,123358890,0.014830,-600.0,-300.0,0.0,-500.0,1072210.0,-4.34,-5.71
736,2022-12-22,43611,10.23,1.36,5777,2.44,1444,148819102500,-10368534400,-88344697700,...,7.330057,134079520,0.015471,1100.0,-600.0,0.0,100.0,363659.0,27.78,-9.02
737,2022-12-23,43611,10.06,1.33,5777,2.49,1444,23550966200,-5667892400,3946153800,...,7.614629,124250113,0.015809,-1000.0,100.0,-700.0,-400.0,-891223.0,-43.04,4.34


### Scailing

In [286]:
# Scale the features
from sklearn.preprocessing import StandardScaler

features = [col for col in train_df.columns if col not in ['date', 'target', 'target_label']]

feature_scaler = StandardScaler()
feature_scaler.fit(train_df[features])
train_df[features] = feature_scaler.transform(train_df[features])
val_df[features] = feature_scaler.transform(val_df[features])
test_df[features] = feature_scaler.transform(test_df[features])

### Create batches sliding window

In [287]:
# Create sliding windows for the time series data
def create_sliding_windows(data: pd.DataFrame, sequence_length=10) -> list[pd.DataFrame]:
    results = []
    for i in range(len(data) - sequence_length):
        results.append(data.iloc[i:i + sequence_length].copy())
    return results

SEQ_LENGTH = 10 # Length of the sliding window
# Create sliding windows for the training, validation, and test sets
train = create_sliding_windows(train_df, SEQ_LENGTH)
val = create_sliding_windows(val_df, SEQ_LENGTH)
test = create_sliding_windows(test_df, SEQ_LENGTH)

train[0]

Unnamed: 0,date,BPS,PER,PBR,EPS,DIV,DPS,institution,other_corporation,individual,...,bb_wband,obv,volatility,close_diff,open_diff,high_diff,low_diff,volume_diff,kospi_close_diff,exchange_rate_diff
1,2020-01-03,-1.492672,-1.454364,-0.453668,1.742894,-0.403847,-0.719575,-0.544736,-0.726504,-0.034681,...,-1.50827,-1.3161,-2.3779,0.281195,0.42599,0.581054,-0.108407,0.370008,0.034399,1.044825
2,2020-01-06,-1.492672,-1.454364,-0.453668,1.742894,-0.403847,-0.719575,-0.183936,-0.187811,-0.185753,...,-1.518574,-1.274902,-2.3779,-0.003472,-0.946476,-0.975808,-0.317567,-0.779468,-0.676374,0.322985
3,2020-01-07,-1.492672,-1.445567,-0.421,1.742894,-0.415647,-0.719575,0.254694,-0.251004,-0.268756,...,-1.43542,-1.234783,-2.3779,0.281195,0.683327,0.775662,1.041978,-0.039577,0.635484,-0.214681
4,2020-01-08,-1.492672,-1.419176,-0.322999,1.742894,-0.474651,-0.719575,0.262601,-1.363309,-1.083945,...,-1.03553,-1.14059,-2.3779,0.945417,0.42599,0.97027,0.309915,2.049268,-0.765377,0.62895
5,2020-01-09,-1.492672,-1.369913,-0.159662,1.742894,-0.557256,-0.719575,-0.618811,-0.992972,-0.518557,...,-0.319606,-1.043987,-2.3779,1.704528,1.884235,1.164878,1.56488,0.092576,1.09523,-1.803917
6,2020-01-10,-1.492672,-1.345281,-0.094328,1.742894,-0.604459,-0.719575,-0.869667,-0.286449,0.008231,...,0.135613,-0.979858,-2.3779,0.850528,0.340211,1.067574,0.937397,-1.228658,0.618874,0.354176
7,2020-01-13,-1.492672,-1.331206,-0.028993,1.742894,-0.62806,-0.719575,-0.335864,-0.155116,-0.25001,...,0.41845,-0.93433,-2.3779,0.470973,0.683327,0.289143,0.832817,-0.703223,0.710698,-0.756804
8,2020-01-14,-1.492672,-1.331206,-0.028993,1.742894,-0.62806,-0.719575,-0.403183,1.776537,-0.055518,...,0.54202,-0.86657,-2.3779,-0.003472,0.683327,0.97027,0.832817,0.843338,0.295454,-0.020111
9,2020-01-15,-1.492672,-1.359356,-0.126995,1.742894,-0.580857,-0.719575,0.326358,0.187592,0.283233,...,0.488683,-0.923888,-2.3779,-0.95236,-0.774918,-1.365023,-1.04963,-0.39421,-0.253608,0.058608
10,2020-01-16,-1.492672,-1.313612,0.036341,1.742894,-0.663462,-0.719575,-0.01784,-0.151342,-0.366366,...,0.61217,-0.866246,-2.3779,1.609639,-0.346022,1.067574,0.100754,0.013555,0.528931,0.57251


## Save

In [288]:
# Save the train, val, and test sets (list of DataFrames) to parquet files
# Since each set is a list of DataFrames (sliding windows), we'll save them separately
# and add metadata to reconstruct the sliding windows

import os
import pandas as pd
import json

# Create a directory for the dataset if it doesn't exist
dataset_dir = f"{ticker_code}_dataset"
os.makedirs(dataset_dir, exist_ok=True)

# Save train set
train_combined = pd.concat([df.assign(window_id=i) for i, df in enumerate(train)], ignore_index=True)
train_combined.to_parquet(f"{dataset_dir}/train.parquet", index=False)

# Save validation set
val_combined = pd.concat([df.assign(window_id=i) for i, df in enumerate(val)], ignore_index=True)
val_combined.to_parquet(f"{dataset_dir}/val.parquet", index=False)

# Save test set
test_combined = pd.concat([df.assign(window_id=i) for i, df in enumerate(test)], ignore_index=True)
test_combined.to_parquet(f"{dataset_dir}/test.parquet", index=False)

# Save metadata about the sliding windows
metadata = {
    "seq_length": SEQ_LENGTH,
    "train_windows": len(train),
    "val_windows": len(val),
    "test_windows": len(test),
    "ticker_code": ticker_code,
    "target_ticker": TARGET_TICKER
}

with open(f"{dataset_dir}/metadata.json", 'w', encoding="UTF-8") as f:
    json.dump(metadata, f, indent=4, ensure_ascii=False)

print(f"Dataset saved to {dataset_dir}/ directory")
print(f"Train windows: {len(train)}, Val windows: {len(val)}, Test windows: {len(test)}")
print("\n--- Dataset saved successfully ---")

Dataset saved to 005930_dataset/ directory
Train windows: 728, Val windows: 236, Test windows: 236

--- Dataset saved successfully ---


## Load (test)

In [289]:
# Load the train, val, and test sets (list of DataFrames) from parquet files
import os
import json
import pandas as pd

dataset_dir = f"{ticker_code}_dataset"
train = []
val = []
test = []

if os.path.exists(dataset_dir):
    print(f"Loading dataset from {dataset_dir}/ directory...")
    
    # Load metadata
    with open(f"{dataset_dir}/metadata.json", 'r') as f:
        metadata = json.load(f)
    
    SEQ_LENGTH = metadata["seq_length"]
    print(f"Sequence length: {SEQ_LENGTH}")
    print(f"Ticker: {metadata['target_ticker']} ({metadata['ticker_code']})")
    assert SEQ_LENGTH > 0, "Sequence length must be greater than 0"
    assert ticker_code == metadata['ticker_code'], "Ticker code mismatch in metadata"
    assert TARGET_TICKER == metadata['target_ticker'], "Target ticker mismatch in metadata"
    
    # Load and reconstruct train set
    train_combined = pd.read_parquet(f"{dataset_dir}/train.parquet")
    for window_id in range(metadata["train_windows"]):
        window_df = train_combined[train_combined['window_id'] == window_id].drop('window_id', axis=1).reset_index(drop=True)
        train.append(window_df)
    
    # Load and reconstruct validation set
    val_combined = pd.read_parquet(f"{dataset_dir}/val.parquet")
    for window_id in range(metadata["val_windows"]):
        window_df = val_combined[val_combined['window_id'] == window_id].drop('window_id', axis=1).reset_index(drop=True)
        val.append(window_df)
    
    # Load and reconstruct test set
    test_combined = pd.read_parquet(f"{dataset_dir}/test.parquet")
    for window_id in range(metadata["test_windows"]):
        window_df = test_combined[test_combined['window_id'] == window_id].drop('window_id', axis=1).reset_index(drop=True)
        test.append(window_df)
    
    print(f"Loaded - Train windows: {len(train)}, Val windows: {len(val)}, Test windows: {len(test)}")
    print("Dataset loaded successfully!")
else:
    print(f"Dataset directory {dataset_dir} not found. Please run the dataset creation cells first.")


Loading dataset from 005930_dataset/ directory...
Sequence length: 10
Ticker: 삼성전자 (005930)
Loaded - Train windows: 728, Val windows: 236, Test windows: 236
Dataset loaded successfully!
