# Init

In [116]:
# Company name -> ticker code for the Korean stocks this notebook can analyse
tickers = {
    "삼성전자": "005930",
    "SK": "034730",
    "한화": "000880",
    "두산": "000150",
    "기아": "000270",
    "현대차": "005380",
    "LG": "003550",
    "NAVER": "035420",
    "카카오": "035720",
    "롯데지주": "004990",
}

# Inclusive date range (YYYYMMDD strings) requested from the market-data API
start_date, end_date = "20200101", "20250101"

In [None]:
# Target ticker for analysis. The company name is the key into `tickers`; it is
# also reused further down as the news search keyword and as the prefix of the
# JSON cache files.
TARGET_TICKER = "삼성전자"
# Ticker code looked up from the name; used for the price query and for the
# parquet file names.
ticker_code = tickers[TARGET_TICKER]

# Create Dataset

In [118]:
from pykrx import stock

# Download daily OHLCV for the target ticker, turn the date index into a
# regular column, translate the Korean column names we need, and keep only
# the date and the closing price.
df_stock = (
    stock.get_market_ohlcv_by_date(start_date, end_date, ticker_code)
    .reset_index()
    .rename(columns={'날짜': 'date', '종가': 'close'})
    [['date', 'close']]
)

In [119]:
# TODO: Add Interest rates, Currency exchange rates (e.g., KRW/USD)
import pandas as pd
import yfinance as yf

# Download the KOSPI index (index code "1001") for the same date range and
# normalise it the same way as the stock frame: date as a column, closing
# value renamed to 'kospi_close'.
df_kospi = (
    stock.get_index_ohlcv_by_date(start_date, end_date, "1001")
    .reset_index()
    .rename(columns={'날짜': 'date', '종가': 'kospi_close'})
)

# Attach the index close to every stock row; a left join keeps all stock dates
df_stock = df_stock.merge(df_kospi[['date', 'kospi_close']], how='left', on='date')

In [120]:
# Target definition: relative change from today's close to the average close
# of the following n trading days. (An earlier variant used the plain next-day
# pct_change instead; it is no longer used.)
n = 3
# rolling(n).mean() at row i averages rows i-n+1..i, so shifting it by -n
# places the average of rows i+1..i+n on row i. The last n rows become NaN.
upcoming_mean = df_stock['close'].rolling(window=n).mean().shift(-n)
df_stock['future_close'] = upcoming_mean
df_stock['target'] = (df_stock['future_close'] - df_stock['close']) / df_stock['close']

# Moves larger than +/-1% count as a real increase / decrease
threshold = 0.01

# Three-class label: 0 = Decrease, 1 = Neutral, 2 = Increase.
# The two conditions are mutually exclusive, so start from Neutral and step
# up or down by one. NaN targets compare False both ways and stay at 1 until
# they are dropped below.
rose = df_stock['target'] > threshold
fell = df_stock['target'] < -threshold
df_stock['target_label'] = 1 + rose.astype(int) - fell.astype(int)

# Drop the trailing rows that have no future window, make sure the label is an
# integer column, and keep only the columns the final dataset needs.
df_stock = (
    df_stock
    .dropna(subset=['target', 'target_label'])
    .astype({'target_label': int})
    [['date', 'close', 'kospi_close', 'target_label']]
    .copy()
)

df_stock

Unnamed: 0,date,close,kospi_close,target_label
0,2020-01-02,55200,2175.17,1
1,2020-01-03,55500,2176.46,1
2,2020-01-06,55500,2155.07,2
3,2020-01-07,55800,2175.54,2
4,2020-01-08,56800,2151.31,2
...,...,...,...,...
1223,2024-12-18,54900,2484.43,0
1224,2024-12-19,53100,2435.93,2
1225,2024-12-20,53000,2404.15,2
1226,2024-12-23,53500,2442.01,1


In [121]:
from gnews import GNews

days_for_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

# News data collection
def get_news(name, start_date):
    """Fetch Google News headline titles for a keyword within a one-day window.

    The query window runs from ``start_date`` to the following calendar day.
    The following day is computed with ``datetime`` so that month ends, year
    ends and leap days (e.g. 2020-02-29, 2024-02-29) are handled correctly.

    Args:
        name: Search keyword handed to ``GNews.get_news``.
        start_date: ``(year, month, day)`` sequence of ints.

    Returns:
        A list of the non-empty titles of the returned items (at most 10 are
        requested). An empty list is returned when the lookup raises or does
        not return a list.
    """
    import datetime

    google_news = GNews()
    google_news.max_results = 10
    google_news.language = 'ko'
    google_news.country = 'KR'

    # Real calendar arithmetic instead of a fixed days-per-month table
    next_day = datetime.date(*start_date) + datetime.timedelta(days=1)
    google_news.end_date = (next_day.year, next_day.month, next_day.day)
    google_news.start_date = tuple(start_date)

    # Best effort: a failed lookup is reported as "no news" so one bad day
    # does not abort a multi-year crawl; callers re-query empty days later.
    try:
        news = google_news.get_news(name)
    except Exception:
        news = None
    if not isinstance(news, list):
        return []
    return [item['title'] for item in news if item.get('title')]

In [122]:
import time
from threading import Thread, Lock
from queue import Queue, Empty

def get_news_threaded(thread_count, start_dates):
    """Collect headlines for many dates concurrently, preserving input order.

    Each date is looked up with ``get_news(TARGET_TICKER, date)`` on a pool of
    worker threads. Results are gathered through futures, so every finished
    lookup is accounted for. The previous hand-rolled counter/queue scheme
    could stop draining the queue as soon as the last worker finished, leaving
    results behind and raising a spurious count-mismatch error.

    Args:
        thread_count: Maximum number of concurrent lookups (must be >= 1;
            ``ThreadPoolExecutor`` raises ``ValueError`` otherwise).
        start_dates: Sequence of ``(year, month, day)`` sequences.

    Returns:
        A list with one entry per element of ``start_dates`` (same order);
        each entry is the list of titles returned by ``get_news``.

    Raises:
        ValueError: If the number of collected results differs from the number
            of requested dates.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    total = len(start_dates)
    news_dict = {}

    with ThreadPoolExecutor(max_workers=thread_count) as executor:
        # Map each future back to the position of its date in the input
        future_to_index = {
            executor.submit(get_news, TARGET_TICKER, tuple(start_date)): index
            for index, start_date in enumerate(start_dates)
        }

        last = time.time()
        for future in as_completed(future_to_index):
            # result() re-raises a worker exception instead of hanging forever
            news_dict[future_to_index[future]] = future.result()
            if time.time() - last > 1:
                active = min(thread_count, total - len(news_dict))
                print(f"Processing: {len(news_dict)}/{total} (thread: {active}/{thread_count})", end='\r')
                last = time.time()

    if len(news_dict) != total:
        raise ValueError(f"News data count mismatch: {len(news_dict)} vs {total}")

    # Restore the caller's ordering
    news_list = [news_dict[index] for index in range(total)]

    print(f"Processing complete: {len(news_list)}/{total}" + " "*30, end='\r')

    return news_list

In [124]:
# Merge news data with stock data
import json

thread_count = 32
# One [year, month, day] triple per trading day, in row order
start_dates = [[day.year, day.month, day.day] for day in df_stock['date']]

# Reuse the cached crawl when it exists; otherwise crawl from scratch
try:
    with open(f"{TARGET_TICKER}_news_final.json", 'r', encoding='utf-8') as f:
        news_dict = json.load(f)
    print(f"Existing news data loaded: {len(news_dict)} items")
    if len(news_dict) < len(start_dates):
        raise ValueError(f"News data count mismatch: {len(news_dict)} < {len(start_dates)}")
    # Look up each trading day's headlines by its date key (the part of the
    # date's string form before the first space). A day missing from the cache
    # gets an empty list -- the same type as every other entry -- so the
    # column stays homogeneous and the empty-news scan below picks it up.
    news_list = [news_dict.get(str(day).split()[0], []) for day in df_stock['date']]
except FileNotFoundError:
    print("Existing news data not found, collecting new data...")

    # Smoke-test the crawler on a handful of dates before the full crawl
    print("Testing news collection with a few dates...")
    test_res = get_news_threaded(16, [(2024, 12, 26), (2025, 1, 2), (2025, 1, 3), (2025, 1, 4), (2025, 1, 5)])
    assert len(test_res) == 5, "Test news collection failed, please check the GNews API or network connection."

    # Collect news data using threading
    print("Collecting news data...")
    news_list = get_news_threaded(thread_count, start_dates)

df_stock['news'] = news_list

Existing news data loaded: 1228 items


In [125]:
# List every trading day whose headline list is empty, and count them
empty_days = [
    str(day).split()[0]
    for day, titles in zip(df_stock['date'], df_stock['news'])
    if len(titles) == 0
]
for day_key in empty_days:
    print(f"Empty news data found: {day_key}")
error_count = len(empty_days)

In [126]:
# Second pass: re-query only the days that came back without any headline
if error_count > 0:
    print(f"Total {error_count} empty news data found. Collecting additional news data...")
    thread_count = 32

    # Row labels of the days still lacking headlines, in frame order
    missing_rows = [label for label, titles in df_stock['news'].items() if len(titles) == 0]
    start_dates = [
        [day.year, day.month, day.day]
        for day in df_stock.loc[missing_rows, 'date']
    ]

    news_list = get_news_threaded(thread_count, start_dates)

    # Results come back in request order, so hand them out row by row
    for label in missing_rows:
        df_stock.at[label, 'news'] = news_list.pop(0)

In [127]:
import json

# Persist the headlines as a date-keyed cache for later runs.
# Key: date string (text before the first space of the date's string form);
# value: that day's list of headline titles.
news_dict = {}
for day, titles in zip(df_stock['date'], df_stock['news']):
    news_dict[str(day).split()[0]] = titles

with open(f"{TARGET_TICKER}_news_final.json", 'w', encoding='utf-8') as f:
    json.dump(news_dict, f, ensure_ascii=False, indent=4)

In [128]:
# Show the frame now that each trading day carries its list of headlines
print("\n--- Final dataset with merged news data ---")
df_stock


--- Final dataset with merged news data ---


Unnamed: 0,date,close,kospi_close,target_label,news
0,2020-01-02,55200,2175.17,1,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
1,2020-01-03,55500,2176.46,1,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
2,2020-01-06,55500,2155.07,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
3,2020-01-07,55800,2175.54,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
4,2020-01-08,56800,2151.31,2,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...
...,...,...,...,...,...
1223,2024-12-18,54900,2484.43,0,"[삼성전자, CES 2025서 ‘AI 홈’ 탑재한 스크린 가전 대거 공개 - Sam..."
1224,2024-12-19,53100,2435.93,2,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,..."
1225,2024-12-20,53000,2404.15,2,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,..."
1226,2024-12-23,53500,2442.01,1,"[삼성전자, 미국 반도체 보조금 7조원 받는다 - 블로터, SK하이닉스, 내년엔 삼..."


In [129]:
# Checkpoint: write prices + labels + headlines to "<ticker_code>_middle.parquet"
# so the sentiment stage can be resumed from the next cell.
print("\n--- Saving intermediate dataset ---")
df_stock.to_parquet(f"{ticker_code}_middle.parquet")


--- Saving intermediate dataset ---


In [130]:
import pandas as pd

# Reload the checkpoint written by the previous cell.
# NOTE(review): `ticker_code` must still be defined (the config cells have to
# be run first); list-valued columns may come back as arrays -- confirm.
print("\n--- Loading intermediate dataset ---")
df_stock = pd.read_parquet(f"{ticker_code}_middle.parquet")


--- Loading intermediate dataset ---


In [131]:
# Display the reloaded frame to check the parquet round trip
df_stock

Unnamed: 0,date,close,kospi_close,target_label,news
0,2020-01-02,55200,2175.17,1,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
1,2020-01-03,55500,2176.46,1,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams..."
2,2020-01-06,55500,2155.07,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
3,2020-01-07,55800,2175.54,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...
4,2020-01-08,56800,2151.31,2,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...
...,...,...,...,...,...
1223,2024-12-18,54900,2484.43,0,"[삼성전자, CES 2025서 ‘AI 홈’ 탑재한 스크린 가전 대거 공개 - Sam..."
1224,2024-12-19,53100,2435.93,2,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,..."
1225,2024-12-20,53000,2404.15,2,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,..."
1226,2024-12-23,53500,2442.01,1,"[삼성전자, 미국 반도체 보조금 7조원 받는다 - 블로터, SK하이닉스, 내년엔 삼..."


In [132]:
from transformers.pipelines import pipeline

# Feature Engineering
#
# Build a Hugging Face 'sentiment-analysis' pipeline backed by the
# 'tabularisai/multilingual-sentiment-analysis' checkpoint. It is used below
# to score each news headline.
# The sentiment analysis model may take some time to download on the first run.
sentiment_analyzer = pipeline('sentiment-analysis', model='tabularisai/multilingual-sentiment-analysis')

Device set to use xpu:0


In [133]:
# Test the sentiment analysis model on two Korean sentences. Each call prints
# a list holding one {'label': ..., 'score': ...} dict.
print(sentiment_analyzer("안녕하세요, 주식 시장은 오늘도 활발합니다."))
print(sentiment_analyzer("삼성 망했어요~"))

[{'label': 'Very Positive', 'score': 0.5859651565551758}]
[{'label': 'Very Negative', 'score': 0.5397356152534485}]


In [134]:
# Apply sentiment analysis to news data
import numpy as np

import json
import time  # progress throttling; imported here so the cell also runs when resumed from the checkpoint

# Signed weight for each label the sentiment model can emit
sentiment_mapping = {
    'Very Positive': 2,
    'Positive': 1,
    'Neutral': 0,
    'Negative': -1,
    'Very Negative': -2
}

# Score given to a day with no usable headline. Always a float so that the
# 'sentiment' column stays numeric (a list placeholder would make the column
# mixed-type and break the parquet export).
NO_NEWS_SENTIMENT = 0.0

# Reuse cached scores when available; otherwise score every headline
try:
    with open(f"{TARGET_TICKER}_sa.json", 'r', encoding='utf-8') as f:
        sentiments = json.load(f)
    print(f"Existing sentiment data loaded: {len(sentiments)} items")
    if len(sentiments) < len(df_stock):
        raise ValueError(f"Sentiment data count mismatch: {len(sentiments)} < {len(df_stock)}")

    # Look up each trading day's score by its date key. Missing days, and
    # non-numeric placeholders left by older cache files, fall back to the
    # neutral score.
    cached_scores = sentiments
    sentiments = []
    for day in df_stock['date']:
        cached = cached_scores.get(str(day).split()[0], NO_NEWS_SENTIMENT)
        is_number = isinstance(cached, (int, float)) and not isinstance(cached, bool)
        sentiments.append(float(cached) if is_number else NO_NEWS_SENTIMENT)
except FileNotFoundError:
    print("Existing sentiment data not found, collecting new data...")
    sentiments = []
    last = time.time()
    for news in df_stock['news']:
        daily_sentiments = []
        # A day without headlines simply contributes no scores
        for per_news in (news if news is not None else []):
            if not isinstance(per_news, str) or len(per_news) == 0:
                continue

            # Headlines mentioning these sources are excluded from scoring
            if "Samsung Newsroom" in per_news or "딜사이트" in per_news:
                continue

            # Cap the input at 512 characters (a character cap, not a token cap)
            result = sentiment_analyzer(per_news[:512])
            if result is None:
                raise ValueError("Sentiment analysis result is None. Please check if the model is loaded correctly.")
            if not isinstance(result, list) or len(result) == 0:
                raise ValueError("Sentiment analysis result is empty. Please check if the input data is correct.")

            label = result[0]['label']
            score = result[0]['score']

            # Label polarity scaled by the model's confidence; unknown labels count as neutral
            weighted_score = sentiment_mapping.get(label, 0) * score
            daily_sentiments.append(weighted_score)
            if time.time() - last > 1:
                print(f"Processing: {len(sentiments)}/{len(df_stock)} | News content: {per_news[:30]}...", end='\r')
                last = time.time()

        # Daily score = mean over the scored headlines
        sentiments.append(float(np.mean(daily_sentiments)) if daily_sentiments else NO_NEWS_SENTIMENT)

    print(f"Processing complete: {len(sentiments)}/{len(df_stock)}" + " "*30, end='\r')

df_stock['sentiment'] = sentiments
# Save the final sentiment data to json, keyed by date string
sentiment_dict = {
    str(day).split()[0]: score
    for day, score in zip(df_stock['date'], df_stock['sentiment'])
}
with open(f"{TARGET_TICKER}_sa.json", 'w', encoding='utf-8') as f:
    json.dump(sentiment_dict, f, ensure_ascii=False, indent=4)
print("\n--- Final dataset with sentiment analysis ---")

Existing sentiment data loaded: 1228 items

--- Final dataset with sentiment analysis ---


In [135]:
# Show the frame with the per-day sentiment score added as the last column
print("\n--- Final dataset with sentiment ---")
df_stock


--- Final dataset with sentiment ---


Unnamed: 0,date,close,kospi_close,target_label,news,sentiment
0,2020-01-02,55200,2175.17,1,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams...",0.257930
1,2020-01-03,55500,2176.46,1,"[삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Sams...",0.433907
2,2020-01-06,55500,2155.07,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,0.244561
3,2020-01-07,55800,2175.54,2,[삼성전자가 열어갈 미래는? CES 2020 키노트 요약정리 - Samsung Ne...,0.314477
4,2020-01-08,56800,2151.31,2,[“미래에서 온 게이밍 모니터” 삼성 ‘오디세이’ 디자인 스토리 - Samsung ...,0.496105
...,...,...,...,...,...,...
1223,2024-12-18,54900,2484.43,0,"[삼성전자, CES 2025서 ‘AI 홈’ 탑재한 스크린 가전 대거 공개 - Sam...",0.109803
1224,2024-12-19,53100,2435.93,2,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,...",0.171850
1225,2024-12-20,53000,2404.15,2,"[美, 삼성전자 보조금 최종 결정…26% 줄어든 47.5억 달러 지급 - 중앙일보,...",0.383987
1226,2024-12-23,53500,2442.01,1,"[삼성전자, 미국 반도체 보조금 7조원 받는다 - 블로터, SK하이닉스, 내년엔 삼...",0.352443


In [136]:
# Write the finished dataset (prices, KOSPI close, label, headlines, sentiment)
# to "<ticker_code>.parquet".
print("\n--- Saving final dataset ---")
df_stock.to_parquet(f"{ticker_code}.parquet")


--- Saving final dataset ---
