## 파일 저장

In [4]:
import pandas as pd

# Load the news CSV into a DataFrame and print its column/dtype summary.
df = pd.read_csv(
    filepath_or_buffer="/Users/JooAnLee/final_project/db/news(23-25)_summarized_external.csv",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14087 entries, 0 to 14086
Data columns (total 87 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   news_id                      14087 non-null  object 
 1   wdate                        14087 non-null  object 
 2   title                        14087 non-null  object 
 3   article                      14087 non-null  object 
 4   press                        14087 non-null  object 
 5   url                          14087 non-null  object 
 6   image                        14087 non-null  object 
 7   article_preprocessed         14087 non-null  object 
 8   summary                      14087 non-null  object 
 9   stock_list                   14087 non-null  object 
 10  industry_list                14087 non-null  object 
 11  summary_embedding            14087 non-null  object 
 12  stock_name                   14087 non-null  object 
 13  ticker          

In [8]:
# Count the rows that contain at least one missing value in any column.
rows_with_missing = df.isnull().any(axis="columns")
num_nan_rows = rows_with_missing.sum()
num_nan_rows

np.int64(628)

In [9]:
# Keep only the rows that have no missing value in any column.
df_clean = df.dropna(axis=0, how="any")

In [10]:
# Write the cleaned frame to the working directory without the index column.
df_clean.to_csv(path_or_buf="news(23-25)_summarized_external_clean.csv", index=False)


## 외부변수 함수

In [None]:
# 1. 종목명/티커 추출 함수
import ast
def extract_last_company_name(labels):
    """Return the last entry of a company-name list, or None when there is none.

    Args:
        labels: Either a list/tuple of names, or a string holding a Python
            list literal (for example "['A', 'B']").

    Returns:
        The last element of the list, or None when the input is missing,
        empty, unparsable, or not a list-like value.
    """
    # Dispatch on type first: calling pd.isna() on a list yields an array whose
    # truth value is ambiguous, which made every multi-element list raise.
    if isinstance(labels, str):
        if labels in ('', '[]'):
            return None
        try:
            labels_list = ast.literal_eval(labels)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal.
            return None
        if not isinstance(labels_list, (list, tuple)):
            # A valid literal that is not a sequence of names (e.g. a number).
            return None
    elif isinstance(labels, (list, tuple)):
        labels_list = labels
    else:
        # None, NaN and any other scalar carry no names.
        return None
    if not labels_list:
        return None
    return labels_list[-1]

def get_ticker_from_name(name, ticker_name_map):
    """Look up the ticker mapped to a company name.

    Returns None when the name is missing (None/NaN) or absent from the map.
    """
    return None if pd.isna(name) else ticker_name_map.get(name)

In [None]:
# 2. 거래일, 기준일 생성 함수
import numpy as np
import pandas as pd

def make_trading_days(start_year=2022, end_year=2025):
    """Collect business days month by month for an inclusive range of years.

    Args:
        start_year: First year to query.
        end_year: Last year to query (inclusive).

    Returns:
        The de-duplicated, sorted days converted with pd.to_datetime.
        Months whose lookup fails are skipped.
    """
    from pykrx import stock
    trading_days = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            try:
                days = stock.get_business_days(year, month)
                trading_days.extend(days)
            except Exception as exc:
                # Best effort: skip the month, but say so instead of hiding it.
                # Catching Exception (not a bare except) lets Ctrl-C through.
                print(f"make_trading_days: skipping {year}-{month:02d} ({exc})")
                continue
    return pd.to_datetime(sorted(set(trading_days)))

def find_nearest_trading_day(date, trading_days):
    """Return the first entry of trading_days that is on or after date.

    The first match in trading_days order is returned, so the result is the
    nearest following day only if trading_days is sorted ascending.
    Returns pd.NaT when no entry qualifies.
    """
    candidates = trading_days[trading_days >= date]
    return candidates[0] if len(candidates) > 0 else pd.NaT

def get_trading_day_offsets(d_day, trading_days, offsets):
    """Resolve positional offsets around d_day within trading_days.

    Args:
        d_day: Anchor day; it must be an element of trading_days.
        trading_days: Index of trading days (positions define the offsets).
        offsets: Mapping of output name -> integer number of positions to
            move from the anchor (negative moves backward).

    Returns:
        Dict of output name -> the day at the shifted position, or pd.NaT when
        the anchor is missing/unknown or the shifted position is out of range.
    """
    if pd.isna(d_day) or d_day not in trading_days.values:
        return {name: pd.NaT for name in offsets}

    anchor = np.where(trading_days == d_day)[0][0]
    total = len(trading_days)
    shifted = {}
    for name, step in offsets.items():
        position = anchor + step
        # Guard explicitly: a negative position would otherwise wrap around.
        shifted[name] = trading_days[position] if 0 <= position < total else pd.NaT
    return shifted

def fill_all_trading_dates(row, trading_days, offsets):
    """Write every offset date derived from row['D_day_date'] into the row.

    The row is modified in place (one entry per key of offsets) and returned.
    """
    resolved = get_trading_day_offsets(row['D_day_date'], trading_days, offsets)
    for name in resolved:
        row[name] = resolved[name]
    return row

In [None]:
# 3. PyKRX - OHLCV/투자자 데이터 수집
def get_ohlcv_dict(all_tickers, all_dates):
    """Fetch OHLCV history per ticker over the span covered by all_dates.

    Args:
        all_tickers: Iterable of ticker codes to query.
        all_dates: Iterable of dates; only its minimum and maximum are used,
            as the start and end of the query window.

    Returns:
        Dict of ticker -> result of stock.get_market_ohlcv_by_date. Tickers
        whose request fails are reported and left out. Empty when all_dates
        is empty.
    """
    date_list = list(all_dates)
    if not date_list:
        # Without dates there is no query window; previously every ticker
        # raised (and swallowed) a ValueError from min() to reach this result.
        return {}

    from pykrx import stock

    # The window is the same for every ticker, so compute it once.
    from_date, to_date = min(date_list), max(date_list)
    ohlcv_dict = {}
    for ticker in all_tickers:
        try:
            ohlcv_dict[ticker] = stock.get_market_ohlcv_by_date(
                fromdate=from_date,
                todate=to_date,
                ticker=ticker
            )
        except Exception as exc:
            # Best effort per ticker, but visible; Exception (not a bare
            # except) keeps KeyboardInterrupt/SystemExit working.
            print(f"get_ohlcv_dict: skipping {ticker} ({exc})")
    return ohlcv_dict

def get_trading_dict(all_tickers, all_dates):
    """Fetch trading-value history per ticker over the span of all_dates.

    Args:
        all_tickers: Iterable of ticker codes to query.
        all_dates: Iterable of dates; only its minimum and maximum are used,
            as the start and end of the query window.

    Returns:
        Dict of ticker -> result of stock.get_market_trading_value_by_date.
        Tickers whose request fails are reported and left out. Empty when
        all_dates is empty.
    """
    date_list = list(all_dates)
    if not date_list:
        # Without dates there is no query window; previously every ticker
        # raised (and swallowed) a ValueError from min() to reach this result.
        return {}

    from pykrx import stock

    # The window is the same for every ticker, so compute it once.
    from_date, to_date = min(date_list), max(date_list)
    trading_dict = {}
    for ticker in all_tickers:
        try:
            trading_dict[ticker] = stock.get_market_trading_value_by_date(
                fromdate=from_date,
                todate=to_date,
                ticker=ticker
            )
        except Exception as exc:
            # Best effort per ticker, but visible; Exception (not a bare
            # except) keeps KeyboardInterrupt/SystemExit working.
            print(f"get_trading_dict: skipping {ticker} ({exc})")
    return trading_dict

def get_ohlcv_val(ticker, date, col, ohlcv_dict):
    """Read one OHLCV cell for a ticker on a given date.

    Args:
        ticker: Key into ohlcv_dict.
        date: Date object; it is formatted as YYYYMMDD for the index lookup.
        col: Column label to read from the ticker's frame.
        ohlcv_dict: Mapping of ticker -> frame indexed by date.

    Returns:
        The cell value, or NaN when the ticker/date is missing or not present
        in the data.
    """
    import numpy as np
    if pd.isna(ticker) or pd.isna(date):
        return np.nan
    day_key = date.strftime('%Y%m%d')
    if ticker in ohlcv_dict and day_key in ohlcv_dict[ticker].index:
        return ohlcv_dict[ticker].loc[day_key, col]
    return np.nan

def fast_trading_value(ticker, date, investor, trading_dict):
    """Read one investor-group trading value for a ticker on a given date.

    Args:
        ticker: Key into trading_dict.
        date: Date object; it is formatted as YYYYMMDD for the index lookup.
        investor: One of '외국인', '기관', '개인'; mapped to the frame's column.
        trading_dict: Mapping of ticker -> frame indexed by date.

    Returns:
        The cell value, or NaN when the ticker/date is missing or not present
        in the data.

    Raises:
        KeyError: If the data exists but investor is not a supported label.
    """
    import numpy as np
    if pd.isna(ticker) or pd.isna(date):
        return np.nan
    day_key = date.strftime('%Y%m%d')
    if ticker not in trading_dict or day_key not in trading_dict[ticker].index:
        return np.nan
    # Short investor label -> column name used in the trading-value frame.
    column_by_investor = {'외국인': '외국인합계', '기관': '기관합계', '개인': '개인'}
    return trading_dict[ticker].loc[day_key, column_by_investor[investor]]

### 한국은행 API

In [None]:
import requests
import pandas as pd
from datetime import timedelta

import os

# Bank of Korea ECOS API key.
# NOTE(review): a credential is committed in source here. Supply it through the
# ECOS_API_KEY environment variable instead; the literal remains only as a
# backward-compatible fallback and should be rotated and removed.
api_key = os.environ.get("ECOS_API_KEY", "BFKGNG1KJWZR9DVAIN6K")
# Query window, as YYYYMMDD strings.
start_date = "20221010"
end_date = "20250530"

def get_bond10y(api_key, start_date, end_date, timeout=30):
    """Download the daily 10-year treasury bond series from the ECOS API.

    Args:
        api_key: ECOS API key, embedded in the request URL.
        start_date: First day of the window, as a YYYYMMDD string.
        end_date: Last day of the window, as a YYYYMMDD string.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        DataFrame with columns 'date' (datetime) and 'bond10y' (numeric;
        unparsable values become NaN). When the response carries no rows, a
        message is printed and an empty frame with the same columns is
        returned.
    """
    stat_code = "817Y002"
    item_code = "010200000"  # 10-year treasury bond
    # NOTE(review): the "/1/1000/" segment presumably limits the response to
    # rows 1-1000, so longer windows may be truncated - confirm.
    url = f"https://ecos.bok.or.kr/api/StatisticSearch/{api_key}/json/kr/1/1000/{stat_code}/D/{start_date}/{end_date}/{item_code}/"
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    data = response.json()
    if "StatisticSearch" not in data or "row" not in data["StatisticSearch"]:
        print("국고채 10년 데이터 없음:", data)
        # Keep the normal columns so callers can still sort/select on them.
        return pd.DataFrame({
            'date': pd.Series(dtype='datetime64[ns]'),
            'bond10y': pd.Series(dtype='float64'),
        })
    rows = data["StatisticSearch"]["row"]
    df = pd.DataFrame(rows)
    df['date'] = pd.to_datetime(df['TIME'], format='%Y%m%d')
    df['bond10y'] = pd.to_numeric(df['DATA_VALUE'], errors='coerce')
    return df[['date', 'bond10y']]

def get_fx(api_key, start_date, end_date, item_code="0000001", timeout=30):
    """Download a daily exchange-rate series from the ECOS API.

    Args:
        api_key: ECOS API key, embedded in the request URL.
        start_date: First day of the window, as a YYYYMMDD string.
        end_date: Last day of the window, as a YYYYMMDD string.
        item_code: ECOS item code of the currency series. The default
            "0000001" is what this file uses for the US dollar.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        DataFrame with columns 'date' (datetime) and 'usdkrw' (numeric;
        unparsable values become NaN). When the response carries no rows, a
        message is printed and an empty frame with the same columns is
        returned.
    """
    # Statistic table: won exchange rates of major currencies.
    stat_code = "731Y001"
    # NOTE(review): the "/1/1000/" segment presumably limits the response to
    # rows 1-1000, so longer windows may be truncated - confirm.
    url = f"https://ecos.bok.or.kr/api/StatisticSearch/{api_key}/json/kr/1/1000/{stat_code}/D/{start_date}/{end_date}/{item_code}/"
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    data = response.json()
    if "StatisticSearch" not in data or "row" not in data["StatisticSearch"]:
        print("환율 데이터 없음:", data)
        # Keep the normal columns so callers can still sort/select on them.
        return pd.DataFrame({
            'date': pd.Series(dtype='datetime64[ns]'),
            'usdkrw': pd.Series(dtype='float64'),
        })
    rows = data["StatisticSearch"]["row"]
    df = pd.DataFrame(rows)
    df['date'] = pd.to_datetime(df['TIME'], format='%Y%m%d')
    df['usdkrw'] = pd.to_numeric(df['DATA_VALUE'], errors='coerce')
    return df[['date', 'usdkrw']]

# Fetch both daily series for the configured window.
df_bond = get_bond10y(api_key=api_key, start_date=start_date, end_date=end_date)
df_fx = get_fx(api_key=api_key, start_date=start_date, end_date=end_date, item_code="0000001")  # US dollar

In [None]:
# 4. 외부변수 병합 (환율, 국채10년수익률, 기준금리)
# 환율, 국채10년수익률 - 한국은행 API
# 기준금리 - korea_base_rate_daily.csv
def merge_external_variables(df, df_fx, df_bond, rate_csv_path):
    """Attach exchange rate, 10-year bond yield and base rate to each news row.

    Each source is joined with a backward as-of merge on 'news_date', so a row
    receives the most recent value dated on or before its news date.

    Args:
        df: News frame with a datetime 'news_date' column.
        df_fx: Frame with 'date' and 'usdkrw'; the value lands in column 'fx'.
        df_bond: Frame with 'date' and 'bond10y'.
        rate_csv_path: CSV (path or buffer) with 'date' and 'rate' columns;
            the value lands in column 'base_rate'.

    Returns:
        A new frame ordered by 'news_date' with the extra columns. A source
        frame that is empty contributes an all-NaN column instead of failing.
    """
    df = df.sort_values('news_date')

    daily_sources = (
        (df_fx.rename(columns={'date': 'news_date', 'usdkrw': 'fx'}), 'fx'),
        (df_bond.rename(columns={'date': 'news_date'}), 'bond10y'),
    )
    for right, value_col in daily_sources:
        if right.empty or 'news_date' not in right.columns:
            # The API getters return an empty frame on failure; sorting or
            # merging on its missing date column used to raise KeyError.
            df[value_col] = float('nan')
            continue
        df = pd.merge_asof(df, right.sort_values('news_date'), on='news_date', direction='backward')

    # Base rate comes from a local CSV rather than the API.
    rate_df = pd.read_csv(rate_csv_path)
    rate_df['date'] = pd.to_datetime(rate_df['date'])
    rate_df = rate_df.sort_values('date')
    df = pd.merge_asof(df, rate_df.rename(columns={'date': 'news_date', 'rate': 'base_rate'}), on='news_date', direction='backward')
    return df

In [None]:
# 5. 전체 실시간 처리 파이프라인
def process_news_row(row, trading_days, offsets, ohlcv_dict, trading_dict):
    """Fill one news row with its offset dates and the market data on each.

    For every key in offsets the row gains '<key>_close', '<key>_volume',
    '<key>_foreign', '<key>_institution' and '<key>_individual', looked up for
    row['ticker'] on the date stored under that key.
    """
    # Offset date columns must exist before they can be used as lookup dates.
    row = fill_all_trading_dates(row, trading_days, offsets)

    price_fields = (('close', '종가'), ('volume', '거래량'))
    investor_fields = (('foreign', '외국인'), ('institution', '기관'), ('individual', '개인'))
    ticker = row['ticker']
    for col in offsets:
        day = row[col]
        for suffix, ohlcv_col in price_fields:
            row[f'{col}_{suffix}'] = get_ohlcv_val(ticker, day, ohlcv_col, ohlcv_dict)
        for suffix, investor in investor_fields:
            row[f'{col}_{suffix}'] = fast_trading_value(ticker, day, investor, trading_dict)
    # Derived metrics (e.g. rate of change) can be added here.
    return row

In [None]:
# 6. 실시간 뉴스 데이터 처리 전체 예시
def process_news_df(df, ticker_name_map, trading_days, offsets, ohlcv_dict, trading_dict, df_fx, df_bond, rate_csv_path):
    """Run the whole enrichment pipeline over a news DataFrame.

    Steps: derive stock name and ticker from 'stock_list', parse 'news_date',
    pick the D-day trading date, fill offset dates and market data per row,
    then merge the external variables.

    Args:
        df: News frame with 'stock_list' and 'news_date' columns.
        ticker_name_map: Mapping of company name -> ticker.
        trading_days: Index of trading days.
        offsets: Mapping of output column name -> trading-day offset.
        ohlcv_dict: Mapping of ticker -> OHLCV frame.
        trading_dict: Mapping of ticker -> trading-value frame.
        df_fx: Exchange-rate frame passed to merge_external_variables.
        df_bond: Bond-yield frame passed to merge_external_variables.
        rate_csv_path: Base-rate CSV passed to merge_external_variables.

    Returns:
        The enriched frame. The input frame is left untouched.
    """
    # Work on a copy so the column assignments below do not leak into the
    # caller's frame.
    df = df.copy()
    # Stock name / ticker extraction.
    df['stock_name'] = df['stock_list'].apply(extract_last_company_name)
    df['ticker'] = df['stock_name'].apply(lambda name: get_ticker_from_name(name, ticker_name_map))
    df['news_date'] = pd.to_datetime(df['news_date'])
    # D-day: first trading day on or after the news date.
    df['D_day_date'] = df['news_date'].apply(lambda day: find_nearest_trading_day(day, trading_days))
    # process_news_row fills the offset date columns itself before reading
    # them, so a separate row-wise fill_all_trading_dates pass is redundant.
    df = df.apply(lambda row: process_news_row(row, trading_days, offsets, ohlcv_dict, trading_dict), axis=1)
    # External variables (exchange rate, 10-year bond yield, base rate).
    df = merge_external_variables(df, df_fx, df_bond, rate_csv_path)
    return df
