# Init

In [25]:
# Korean stocks available for analysis: company name -> ticker code.
# Entry order is significant because later code picks the first key as the default target.
tickers = {
    "삼성전자": "005930",
    "SK": "034730",
    "한화": "000880",
    "두산": "000150",
    "기아": "000270",
    "현대차": "005380",
    "LG": "003550",
    "NAVER": "035420",
    "카카오": "035720",
    "롯데지주": "004990",
}

# Date range for the stock data (YYYYMMDD strings)
start_date, end_date = "20200101", "20250101"

In [26]:
# Choose the stock to analyse. By default this is the first entry of `tickers`;
# to pin a specific company, assign its name directly instead, e.g.:
# TARGET_TICKER = "삼성전자"
TARGET_TICKER = list(tickers)[0]
ticker_code = tickers[TARGET_TICKER]

# Display the active configuration as the cell output
TARGET_TICKER, ticker_code, start_date, end_date

('삼성전자', '005930', '20200101', '20250101')

In [27]:
import numpy as np
import torch

# Seed the global NumPy and PyTorch random number generators with the same value
# so that repeated runs draw the same random numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1cd21c99a90>

In [28]:
# Load the train, val, and test sets (list of DataFrames) from parquet files
import os
import json
import pandas as pd

def _split_windows(combined, n_windows):
    """Split a stacked parquet frame back into one DataFrame per window.

    Args:
        combined: DataFrame holding all windows of one split, with a
            ``window_id`` column marking which window each row belongs to.
        n_windows: Number of windows to extract; ids ``0 .. n_windows - 1``
            are looked up in order.

    Returns:
        A list of ``n_windows`` DataFrames. Each has the ``window_id`` column
        removed and a fresh 0-based index.
    """
    return [
        combined[combined["window_id"] == window_id]
        .drop("window_id", axis=1)
        .reset_index(drop=True)
        for window_id in range(n_windows)
    ]


dataset_dir = f"{ticker_code}_dataset"
train = []
val = []
test = []

if os.path.exists(dataset_dir):
    print(f"Loading dataset from {dataset_dir}/ directory...")

    # Load metadata
    with open(f"{dataset_dir}/metadata.json", 'r') as f:
        metadata = json.load(f)

    SEQ_LENGTH = metadata["seq_length"]
    print(f"Sequence length: {SEQ_LENGTH}")
    print(f"Ticker: {metadata['target_ticker']} ({metadata['ticker_code']})")

    # Guard against loading a dataset that was built for a different configuration
    assert SEQ_LENGTH > 0, "Sequence length must be greater than 0"
    assert ticker_code == metadata['ticker_code'], "Ticker code mismatch in metadata"
    assert TARGET_TICKER == metadata['target_ticker'], "Target ticker mismatch in metadata"

    # Load each split and reconstruct its list of per-window DataFrames
    train_combined = pd.read_parquet(f"{dataset_dir}/train.parquet")
    train = _split_windows(train_combined, metadata["train_windows"])

    val_combined = pd.read_parquet(f"{dataset_dir}/val.parquet")
    val = _split_windows(val_combined, metadata["val_windows"])

    test_combined = pd.read_parquet(f"{dataset_dir}/test.parquet")
    test = _split_windows(test_combined, metadata["test_windows"])

    print(f"Loaded - Train windows: {len(train)}, Val windows: {len(val)}, Test windows: {len(test)}")
    print("Dataset loaded successfully!")
else:
    # train/val/test stay empty and SEQ_LENGTH is not defined in this case
    print(f"Dataset directory {dataset_dir} not found. Please run the dataset creation cells first.")


Loading dataset from 005930_dataset/ directory...
Sequence length: 10
Ticker: 삼성전자 (005930)
Loaded - Train windows: 727, Val windows: 236, Test windows: 236
Dataset loaded successfully!


In [29]:
# Inspect the "target" column of the first training window
train[0]["target"]

0    55500.0
1    55800.0
2    56800.0
3    58600.0
4    59500.0
5    60000.0
6    60000.0
7    59000.0
8    60700.0
9    61300.0
Name: target, dtype: float64

In [None]:
FORECAST_LENGTH = 1
train_combined = []


def _format_window_rows(window_df):
    """Render a window's feature rows as the text table embedded in the prompt.

    The ``target`` column is dropped. Each remaining row becomes one line:
    the date (text before the first space of its string form), a tab, then
    tab-separated ``key: value`` pairs for every column except ``date``.
    ``int``/``float`` values are rounded to 2 decimals, everything else is
    converted with ``str``; newlines inside a value are removed so each row
    stays on a single line.

    Args:
        window_df: DataFrame with at least ``date`` and ``target`` columns.

    Returns:
        The rows joined by newlines, with surrounding whitespace stripped.
    """
    row_lines = []
    for record in window_df.drop(columns=["target"]).to_dict(orient='records'):
        date_str = str(record["date"]).split(" ")[0]
        fields = []
        for key, value in record.items():
            if key == "date":
                continue
            rendered = round(value, 2) if isinstance(value, (int, float)) else str(value)
            # Strip newlines outside the f-string: a backslash inside an f-string
            # replacement field is a SyntaxError before Python 3.12.
            text = str(rendered).replace("\n", "")
            fields.append(f"{key}: {text}")
        row_lines.append(date_str + "\t" + "\t".join(fields))
    return "\n".join(row_lines).strip()


for train_df in train:
    # The label for a window is the target value of its last row
    last_target = train_df["target"].iloc[-1]

    # Make prompt with the train_df
    datas_str = _format_window_rows(train_df)
    prompt_str = f"""Here is the Stock data of {TARGET_TICKER} ({ticker_code}) for the past {SEQ_LENGTH} days:
I will now give you data for the past {SEQ_LENGTH} recorded dates, and please help me forecast the data for next {FORECAST_LENGTH} recorded dates. The data is as follows:
```
{datas_str}
```
Please give me the close data for the next {FORECAST_LENGTH} recorded dates, remember to give me the close data. 
You must first conduct reasoning inside <think> …</think>. 
When you have the final answer, you can output the answer inside <answer>…</answer> and the reason of the answer inside <reason>…</reason>.

Example output:
```
<think>...</think>
<answer>20XX-XX-XX\tclose: XXXXX
20XX-XX-XX\tclose: XXXXX
... (continue for {FORECAST_LENGTH} days)</answer>
<reason>...</reason>
```
"""

    train_combined.append({
        "prompt": prompt_str,
        "target": int(last_target),
    })

# Convert to DataFrame
train_combined_pd = pd.DataFrame(train_combined)
train_combined_pd

Unnamed: 0,prompt,target
0,Here is the Stock data of 삼성전자 (005930) for th...,61300
1,Here is the Stock data of 삼성전자 (005930) for th...,62400
2,Here is the Stock data of 삼성전자 (005930) for th...,61400
3,Here is the Stock data of 삼성전자 (005930) for th...,62300
4,Here is the Stock data of 삼성전자 (005930) for th...,60800
...,...,...
722,Here is the Stock data of 삼성전자 (005930) for th...,59500
723,Here is the Stock data of 삼성전자 (005930) for th...,58600
724,Here is the Stock data of 삼성전자 (005930) for th...,58000
725,Here is the Stock data of 삼성전자 (005930) for th...,59100


In [None]:
# Save the prompt/target table into the dataset directory (without the DataFrame index)
train_combined_pd.to_parquet(f"{dataset_dir}/train_combined.parquet", index=False)

In [None]:
# Reload the prompt/target table from the dataset directory and display it
train_combined_pd = pd.read_parquet(f"{dataset_dir}/train_combined.parquet")
train_combined_pd

Unnamed: 0,prompt,target
0,Here is the Stock data of 삼성전자 (005930) for th...,61300
1,Here is the Stock data of 삼성전자 (005930) for th...,62400
2,Here is the Stock data of 삼성전자 (005930) for th...,61400
3,Here is the Stock data of 삼성전자 (005930) for th...,62300
4,Here is the Stock data of 삼성전자 (005930) for th...,60800
...,...,...
722,Here is the Stock data of 삼성전자 (005930) for th...,59500
723,Here is the Stock data of 삼성전자 (005930) for th...,58600
724,Here is the Stock data of 삼성전자 (005930) for th...,58000
725,Here is the Stock data of 삼성전자 (005930) for th...,59100


In [None]:
import os

from openai import OpenAI

# Read the OpenRouter API key from the environment instead of hardcoding it in the
# notebook, so the secret is never saved with the file or shown in shared copies.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise RuntimeError("Set the OPENROUTER_API_KEY environment variable before creating the client.")

# OpenAI-compatible client pointed at the OpenRouter endpoint
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

In [45]:
import re 

def _parse_forecast_lines(answer):
    """Parse the text inside ``<answer>...</answer>`` into ``{date: close}``.

    Accepted line shapes:
      * ``<date>\\t<key>: <value>`` (the format requested in the prompt); only
        the first field after the date is read.
      * ``<YYYY-MM-DD> <key>: <value>`` when the model used spaces instead of a
        tab. Lines without a tab whose first token is not a date are skipped.
    A bare number without ``key:`` is accepted, and thousands separators
    (``61,300``) are removed before conversion.

    Raises:
        ValueError: If a recognised forecast line carries a non-numeric value.
    """
    forecast = {}
    for line in answer.split('\n'):
        if '\t' in line:
            parts = line.split('\t')
        else:
            # Fallback for space-separated output; require a leading date so
            # stray prose lines are ignored rather than mis-parsed.
            parts = line.split(None, 1)
            if len(parts) < 2 or not re.match(r'\d{4}-\d{2}-\d{2}$', parts[0]):
                continue

        date = parts[0].strip()
        pieces = parts[1].split(':')
        # "close: 61300" -> text after the first colon; "61300" -> the field itself
        value_text = pieces[1] if len(pieces) >= 2 else pieces[0]
        value_text = value_text.strip().replace(',', '')
        try:
            forecast[date] = float(value_text)
        except ValueError:
            raise ValueError(f"Could not parse a close value from forecast line: {line!r}") from None
    return forecast


def get_forecast(prompt, model="deepseek/deepseek-r1-0528:free"):
    """Send a forecasting prompt to the chat model and parse its tagged reply.

    Args:
        prompt: Full user prompt text to send.
        model: Model identifier passed to the chat completions endpoint.

    Returns:
        A tuple ``(reason_content, forecast)``: the stripped text of the
        ``<reason>`` section and a dict mapping each date string found in the
        ``<answer>`` section to its close value as ``float``. The dict is empty
        when the answer section contains no recognisable forecast line.

    Raises:
        ValueError: If the reply has no ``<reason>`` or no ``<answer>`` section,
            or a forecast line has a non-numeric close value.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    result = str(completion.choices[0].message.content)

    # Extract the <reason>...</reason> content
    reason_match = re.search(r'<reason>(.*?)</reason>', result, re.DOTALL)
    if not reason_match:
        raise ValueError(f"No valid <reason> section found in the response: {result}")
    reason_content = reason_match.group(1).strip()

    # Extract <answer>...</answer> content
    answer_match = re.search(r'<answer>(.*?)</answer>', result, re.DOTALL)
    if not answer_match:
        raise ValueError(f"No valid <answer> section found in the response: {result}")

    forecast = _parse_forecast_lines(answer_match.group(1).strip())
    return reason_content, forecast

In [None]:
# Print the full text of the first prompt (print keeps its newlines and tabs readable)
print(train_combined_pd.iloc[0]["prompt"])

Here is the Stock data of 삼성전자 (005930) for the past 10 days:
I will now give you data for the past 10 recorded dates, and please help me forecast the data for next 1 recorded dates. The data is as follows:
```
2020-01-03	open: 56000	high: 56600	low: 54900	close: 55500	volume: 15422255	BPS: 35342	PER: 8.59	PBR: 1.57	EPS: 6461	DIV: 2.55	DPS: 1416	institution: -123332624300	other_corporation: -5130936300	individual: 66485169500	foreign: 61978391100	institution_volume: -2228329	other_corporation_volume: -91483	individual_volume: 1199681	foreign_volume: 1120131	shorting_volume: 218704	buy_volume: 15422255	shorting_ratio: 1.42	kospi_close: 2176.46	exchange_rate: 1165.15	us_10y_yield: 1.8	news: ['삼성전자, CES2020서 게이밍 모니터 ‘오디세이’ 신모델 첫 공개 - Samsung Newsroom' "삼성전자, 최고 기술 전문가 '삼성명장' 선발 - 전자부품 전문 미디어 디일렉" '삼성전자, 새로운 콘셉트의 생활가전 선보인다 - 인더스트리뉴스' "삼성전자, 'CES 2020'서 큐브 냉장고 등 新라이프스타일 가전 공개 - 서울파이낸스" "삼성 '진짜 베젤 제로'···QLED TV 테두리를 완전히 없앴다 - 중앙일보" '삼성전자, 태블릿노트북 ‘갤럭시북플렉스알파’를 90만 원대 미국 출시 - 비즈니스포스트' "삼성전자 '큐브

: 

In [None]:
# Example usage: request a forecast for the first prompt, then show the parsed
# forecast next to the stored integer target for comparison
reason_content, forecast = get_forecast(train_combined_pd.iloc[0]["prompt"])
forecast, int(train_combined_pd.iloc[0]["target"])

({'2020-01-17': 61200.0}, 61300)

In [None]:
import requests

def get_ticker(company_name, timeout=10):
    """Look up the symbol of the first Yahoo Finance search hit for a company name.

    Args:
        company_name: Free-text company name used as the search query.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``symbol`` field of the first entry in the response's ``quotes`` list.

    Raises:
        IndexError: If the search returned no quotes.
        KeyError: If the response has no ``quotes`` field, or the first quote
            has no ``symbol``.
        requests.HTTPError: If the server answered with an error status.
    """
    search_url = "https://query2.finance.yahoo.com/v1/finance/search"
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    params = {"q": company_name, "quotes_count": 1, "country": "United States"}

    res = requests.get(
        url=search_url,
        params=params,
        headers={'User-Agent': user_agent},
        timeout=timeout,
    )
    # Surface HTTP errors directly instead of failing later on an unexpected body
    res.raise_for_status()

    quotes = res.json()['quotes']
    if not quotes:
        raise IndexError(f"No Yahoo Finance quote found for {company_name!r}")
    return quotes[0]['symbol']

In [None]:
# Example: look up the symbol returned for "Samsung Electronics"
get_ticker("Samsung Electronics")