In [121]:
import pandas as pd
import numpy as np

file_path = "./stooq/A.US.csv"
df = pd.read_csv(file_path)

# 수익률 계산: 종가 기준으로 % 계산
df['Return'] = df['Close'].pct_change() * 100
df = df.dropna(subset=['Return'])
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Return
1,1999-11-19,27.8972,27.9371,25.8613,26.2311,16773580.0,-8.237307
2,1999-11-22,26.837,28.5858,26.0278,28.5858,7242576.0,8.976749
3,1999-11-23,27.6102,28.3377,25.9889,25.9889,6579458.0,-9.08458
4,1999-11-24,26.0637,27.2445,25.9889,26.6745,5332648.0,2.638049
5,1999-11-26,26.5569,26.9585,26.4752,26.7622,1904229.0,0.328778


In [122]:
from scipy.stats.mstats import winsorize

# Winsorization 적용
df['Return_winsorized'] = winsorize(df['Return'], limits=[0.01, 0.01])
df[['Date', 'Return', 'Return_winsorized']].head(10)

Unnamed: 0,Date,Return,Return_winsorized
1,1999-11-19,-8.237307,-7.175549
2,1999-11-22,8.976749,7.946328
3,1999-11-23,-9.08458,-7.175549
4,1999-11-24,2.638049,2.638049
5,1999-11-26,0.328778,0.328778
6,1999-11-29,2.244957,2.244957
7,1999-11-30,0.175419,0.175419
8,1999-12-01,1.77374,1.77374
9,1999-12-02,2.75404,2.75404
10,1999-12-03,0.848407,0.848407


In [None]:
# 최대,최소 몇 % 인지 (논문에서 보통 -100% 안넘는다 함)
print(df['Return'].min(), df['Return'].max())

-27.08045384540241 47.18068071848707


In [127]:
import math
math.ceil(200.01)

201

In [129]:
import pandas as pd
import math

file_path = "./stooq/A.US.csv"
df = pd.read_csv(file_path)

# 수익률 계산
df['Return'] = df['Close'].pct_change()

def discretize_return(r):
    """
    단일 수익률 r (소수 형태, 예: -0.024는 -2.4%)를 토큰 인덱스 (0 ~ 401)로 변환
    
    변환 규칙:
      - r를 10,000배하여 basis point 정수값으로 변환 (r_bp)
      - r_bp <= -10000  → 토큰 0
      - r_bp > 10000    → 토큰 401
      - 그 외: 토큰 = ceil((r_bp + 10000) / 50)
              (이때 토큰은 1부터 400까지 할당)
    """
    r_bp = int(r * 10000)
    if r_bp <= -10000:
        return 0
    elif r_bp > 10000:
        return 401
    else:
        return math.ceil((r_bp + 10000) / 50)

# Return 열의 각 값에 대해 토큰 인덱스 계산 (첫 행은 NaN이므로 처리)
df = df.dropna(subset=['Return'])
df['ReturnToken'] = df['Return'].apply(lambda x: discretize_return(x))

output_file_path = "./data/A.US_with_returns_tokens.csv"
df.to_csv(output_file_path, index=False)

print(df.head(10))


          Date     Open     High      Low    Close        Volume    Return  \
1   1999-11-19  27.8972  27.9371  25.8613  26.2311  1.677358e+07 -0.082373   
2   1999-11-22  26.8370  28.5858  26.0278  28.5858  7.242576e+06  0.089767   
3   1999-11-23  27.6102  28.3377  25.9889  25.9889  6.579458e+06 -0.090846   
4   1999-11-24  26.0637  27.2445  25.9889  26.6745  5.332648e+06  0.026380   
5   1999-11-26  26.5569  26.9585  26.4752  26.7622  1.904229e+06  0.003288   
6   1999-11-29  26.6356  27.5724  26.3506  27.3630  4.486512e+06  0.022450   
7   1999-11-30  27.2844  27.8972  26.5958  27.4110  4.745571e+06  0.001754   
8   1999-12-01  27.4110  28.2221  27.2096  27.8972  3.256172e+06  0.017737   
9   1999-12-02  28.4204  29.2325  28.0596  28.6655  3.380083e+06  0.027540   
10  1999-12-03  29.1957  29.6819  28.7851  28.9087  3.348990e+06  0.008484   

    ReturnToken  
1           184  
2           218  
3           182  
4           206  
5           201  
6           205  
7           201

In [None]:
# 논문에 나온 예시 테스트
sample = [-0.024, 0, 0, 0.05, 0.048]
sample = [discretize_return(x) for x in sample]
sample

[196, 200, 200, 210, 210]

In [145]:
import os
import glob

def process_csv_file(file_path, output_dir):
    """
    파일을 읽어서 일별 수익률과 토큰 인덱스를 계산한 후, 결과를 output_dir에 저장합니다.
    """
    # CSV 파일 로드
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"Skipping file {file_path}: File is empty.")
        return

    # 'Close' 혹은 'close' 열 확인
    if 'Close' in df.columns:
        close_col = 'Close'
    else:
        print(f"Skipping file {file_path} because it does not have a 'Close' column.")
        return  # 해당 파일은 처리하지 않고 건너뜁니다.
    
    # Close 기준 일별 수익률 계산: (오늘의 Close / 어제의 Close) - 1
    df['Return'] = df[close_col].pct_change()
    
    df = df.dropna(subset=['Return'])
    df['ReturnToken'] = df['Return'].apply(lambda x: discretize_return(x))
    
    # 원본 파일 이름에 접미사를 붙여 결과 파일 이름 생성
    base_name = os.path.basename(file_path)
    output_file = os.path.join(output_dir, base_name.replace('.csv', '_with_returns_tokens.csv'))
    
    # 결과 DataFrame을 CSV 파일로 저장 (index는 제외)
    df.to_csv(output_file, index=False)
    print(f"Processed: {file_path} -> {output_file}")

In [146]:
input_dir = "./stooq"
output_dir = "./data"

# 출력 폴더가 없으면 생성
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 입력 폴더 내의 모든 CSV 파일 목록을 가져옴
csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

# 각 파일에 대해 처리 수행
for file_path in csv_files:
    process_csv_file(file_path, output_dir)

Skipping file ./stooq/IAUG.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/GMGI.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/AVA.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/MGRM.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/RDDT.US.csv: File is empty.
Skipping file ./stooq/NTNX.US.csv: File is empty.
Skipping file ./stooq/ARR.US.csv because it does not have a 'Close' column.
Processed: ./stooq/AOTG.US.csv -> ./data/AOTG.US_with_returns_tokens.csv
Skipping file ./stooq/CNNE.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/BYU.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/HBANM.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/ETNB.US.csv because it does not have a 'Close' column.
Skipping file ./stooq/VSCO.US.csv: File is empty.
Skipping file ./stooq/EXG.US.csv because it does not have a 'Close' column.
Skipping f