In [1]:
import pandas as pd

# Location of the summarized news CSV; change this one constant when running elsewhere.
NEWS_CSV_PATH = '/Users/JooAnLee/final_project/db/news_2023_2025_summarized.csv'

# Load the dataset, then print its column schema and non-null counts.
df = pd.read_csv(NEWS_CSV_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14087 entries, 0 to 14086
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   news_id               14087 non-null  object
 1   wdate                 14087 non-null  object
 2   title                 14087 non-null  object
 3   article               14087 non-null  object
 4   press                 14087 non-null  object
 5   url                   14087 non-null  object
 6   image                 14087 non-null  object
 7   article_preprocessed  14087 non-null  object
 8   summary               14087 non-null  object
 9   stock_list            14087 non-null  object
 10  industry_list         14087 non-null  object
 11  summary_embedding     14087 non-null  object
dtypes: object(12)
memory usage: 1.3+ MB


In [2]:
pip install pykrx

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade pykrx

Note: you may need to restart the kernel to use updated packages.


In [4]:
from pykrx import stock
from pykrx import bond
import datetime

In [5]:
import ast

def extract_last_company_name(labels):
    """Return the last element of a list-like ``stock_list`` cell, or None.

    Args:
        labels: A list/tuple, or the textual form of a Python list literal
            (e.g. "['A', 'B']") as it comes back from a CSV column. Missing
            values (None, NaN) and any other type are treated as "no label".

    Returns:
        The last element of the sequence, or None when the value is missing,
        empty, cannot be parsed, or does not evaluate to a list/tuple.
    """
    if isinstance(labels, str):
        # CSV cells hold the repr of a list; literal_eval parses it without
        # executing arbitrary code. '' and malformed text end up in except.
        try:
            parsed = ast.literal_eval(labels)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    elif isinstance(labels, (list, tuple)):
        # The type is checked before any NaN test on purpose: pd.isna() on a
        # list returns an array, and using that array in an `if` raises
        # "truth value of an array is ambiguous".
        parsed = labels
    else:
        # None, NaN and every other scalar carry no labels.
        return None
    # A literal such as "'abc'" or "{'a': 1}" parses fine but is not a list of
    # labels; indexing it would return a character or raise.
    if not isinstance(parsed, (list, tuple)) or not parsed:
        return None
    return parsed[-1]

# Derive a 'stock_name' column holding the last entry of each row's stock_list.
df['stock_name'] = df['stock_list'].map(extract_last_company_name)

In [6]:
# 2. Draw a random sample of SAMPLE_SIZE articles (fixed seed keeps the draw reproducible).
#    The earlier comment said 100 rows, but the code has always sampled 10.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42
df_sample = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED).reset_index(drop=True)

In [7]:
# Print the sample's column schema and non-null counts.
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   news_id               10 non-null     object
 1   wdate                 10 non-null     object
 2   title                 10 non-null     object
 3   article               10 non-null     object
 4   press                 10 non-null     object
 5   url                   10 non-null     object
 6   image                 10 non-null     object
 7   article_preprocessed  10 non-null     object
 8   summary               10 non-null     object
 9   stock_list            10 non-null     object
 10  industry_list         10 non-null     object
 11  summary_embedding     10 non-null     object
 12  stock_name            10 non-null     object
dtypes: object(13)
memory usage: 1.1+ KB


In [8]:
import pandas as pd
import ast
from pykrx import stock
from datetime import timedelta
import bisect

# 1. Extract the last entry of stock_list as the ticker
def extract_last_ticker(labels):
    """Return the last element of a list-like ``stock_list`` cell, or None.

    NOTE(review): the result is later passed to pykrx as a ticker; whether
    stock_list actually stores stock codes is not visible here - confirm.

    Args:
        labels: A list/tuple, or the textual form of a Python list literal
            (e.g. "['005930']") as it comes back from a CSV column. Missing
            values (None, NaN) and any other type are treated as "no ticker".

    Returns:
        The last element of the sequence, or None when the value is missing,
        empty, cannot be parsed, or does not evaluate to a list/tuple.
    """
    if isinstance(labels, str):
        # CSV cells hold the repr of a list; literal_eval parses it without
        # executing arbitrary code. '' and malformed text end up in except.
        try:
            candidates = ast.literal_eval(labels)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    elif isinstance(labels, (list, tuple)):
        # The type is checked before any NaN test on purpose: pd.isna() on a
        # list returns an array, and using that array in an `if` raises
        # "truth value of an array is ambiguous".
        candidates = labels
    else:
        # None, NaN and every other scalar carry no ticker.
        return None
    # A literal such as "'abc'" parses fine but is not a list; indexing it
    # would hand back a single character instead of a ticker.
    if not isinstance(candidates, (list, tuple)) or not candidates:
        return None
    return candidates[-1]

# Store the last stock_list entry of each sampled article in a 'ticker' column.
df_sample['ticker'] = df_sample['stock_list'].map(extract_last_ticker)

# 2. Build the list of trading days
def get_trading_days(start_date, end_date, reference_ticker="005930"):
    """Return the trading days between two dates as sorted 'YYYYMMDD' strings.

    The calendar is taken from the index of the OHLCV table pykrx returns for
    ``reference_ticker``, i.e. the days on which that ticker has a price row.

    Args:
        start_date: First day of the range, passed through to pykrx.
        end_date: Last day of the range, passed through to pykrx.
        reference_ticker: Ticker whose price history serves as the calendar.
            Defaults to "005930", the code that used to be hard-coded here.

    Returns:
        A list of 'YYYYMMDD' strings in ascending order; empty when pykrx
        returns no rows for the range.
    """
    ohlcv = stock.get_market_ohlcv_by_date(start_date, end_date, reference_ticker)
    if ohlcv.empty:
        # Nothing to format; also avoids calling strftime on a non-date index.
        return []
    # Callers locate dates with bisect, which requires ascending order.
    return sorted(ohlcv.index.strftime('%Y%m%d'))

# 3. Map target_date to itself when it is a trading day, otherwise to the next one
def get_next_trading_day(target_date, trading_days):
    """Return the first entry of ``trading_days`` that is >= ``target_date``.

    Args:
        target_date: Date to look up, comparable with the list entries.
        trading_days: Ascending list of trading days.

    Returns:
        The matching entry, or None when every entry is earlier than
        ``target_date`` (including when the list is empty).
    """
    position = bisect.bisect_left(trading_days, target_date)
    return trading_days[position] if position < len(trading_days) else None

# 4. Fetch one day's price, volume and investor net-buy figures from pykrx
def get_stock_data(ticker, date):
    """Collect close, volume and per-investor net purchases for one ticker on one day.

    Args:
        ticker: Stock code passed through to pykrx.
        date: Day to query; used as both the start and the end of the range.

    Returns:
        A dict with the keys 'close', 'volume', 'foreign', 'institution' and
        'individual', or None when either pykrx query comes back empty. An
        investor entry is None when its row label is missing from the table.
    """
    price_frame = stock.get_market_ohlcv_by_date(date, date, ticker)
    if price_frame.empty:
        return None
    result = {
        'close': price_frame['종가'].iloc[0],
        'volume': price_frame['거래량'].iloc[0],
    }

    investor_frame = stock.get_market_trading_volume_by_investor(date, date, ticker)
    if investor_frame.empty:
        return None

    # Output key -> row label in the investor table; the '순매수' column is read.
    # NOTE(review): confirm '기관합계' is a row label in the installed pykrx
    # version; if it is not, 'institution' silently stays None for every row.
    investor_rows = {
        'foreign': '외국인',
        'institution': '기관합계',
        'individual': '개인',
    }
    for key, row_label in investor_rows.items():
        try:
            result[key] = investor_frame.loc[row_label, '순매수']
        except KeyError:
            result[key] = None
    return result

# 5. Intraday change rate for a single day
def get_price_change(ticker, date):
    """Return the open-to-close change of ``ticker`` on ``date`` in percent.

    Args:
        ticker: Stock code passed through to pykrx.
        date: Day to query; used as both the start and the end of the range.

    Returns:
        ``(close / open - 1) * 100``, or None when pykrx returns no row for
        the day or the opening price is 0.
    """
    day_frame = stock.get_market_ohlcv_by_date(date, date, ticker)
    if day_frame.empty:
        return None
    opening = day_frame['시가'].iloc[0]
    closing = day_frame['종가'].iloc[0]
    # A zero open would make the ratio undefined.
    return None if opening == 0 else ((closing / opening) - 1) * 100

# 6. Calendar-day offsets around the publication date and the fields collected per offset
date_offsets = [-14, -7, -3, -2, -1, 0, 1, 2, 3, 7, 14]
fields = ['close', 'volume', 'foreign', 'institution', 'individual']

# Pre-create one empty 'D<offset>_<field>' column per combination. The offset is
# formatted with an explicit sign, so offset 0 yields names like 'D+0_close'.
offset_columns = [
    f'D{offset:+}_{field}'
    for offset in date_offsets
    for field in fields
]
for column in offset_columns:
    df_sample[column] = None
df_sample['D-day_change'] = None

# 7. Build the trading-day calendar once, padded by 20 days on both sides of the
#    sample's publication dates (the largest offset used below is 14 days).
all_dates = pd.to_datetime(df_sample['wdate'])
calendar_padding = pd.Timedelta(days=20)
earliest_needed = all_dates.min() - calendar_padding
latest_needed = all_dates.max() + calendar_padding
min_date = earliest_needed.strftime('%Y%m%d')
max_date = latest_needed.strftime('%Y%m%d')
trading_days = get_trading_days(min_date, max_date)

# 8. Collect market data for every sampled article
def _first_available(fetch, ticker, start_day, trading_days, cache):
    """Return the first non-None ``fetch(ticker, day)`` at or after ``start_day``.

    Walks ``trading_days`` (ascending 'YYYYMMDD' strings) forward from the
    first entry >= ``start_day`` until ``fetch`` yields a value. Every lookup
    is memoised in ``cache`` under ``(ticker, day)``, so offsets that resolve
    to the same day, repeated tickers and repeated forward scans do not query
    pykrx again. Returns None when no remaining trading day yields data.
    """
    start_pos = bisect.bisect_left(trading_days, start_day)
    for pos in range(start_pos, len(trading_days)):
        key = (ticker, trading_days[pos])
        if key not in cache:
            cache[key] = fetch(ticker, trading_days[pos])
        # Compare against None explicitly: a change rate of 0.0 is valid data.
        if cache[key] is not None:
            return cache[key]
    return None


# Separate caches because the two fetchers return different things per day.
stock_data_cache = {}
price_change_cache = {}

for idx, row in df_sample.iterrows():
    wdate = pd.to_datetime(row['wdate'])
    ticker = row['ticker']
    if pd.isna(ticker) or pd.isna(wdate):
        continue
    # NOTE(review): offsets are calendar days and every target, including the
    # negative ones, is moved forward to the next day with data; the forward
    # scan is unbounded, so a ticker with no data walks the whole calendar.
    for d in date_offsets:
        target_day = (wdate + pd.Timedelta(days=d)).strftime('%Y%m%d')
        data = _first_available(
            get_stock_data, ticker, target_day, trading_days, stock_data_cache
        )
        if data is None:
            continue
        for f in fields:
            df_sample.at[idx, f'D{d:+}_{f}'] = data[f]
    # D-day change rate, using the same "next day with data" rule.
    df_sample.at[idx, 'D-day_change'] = _first_available(
        get_price_change, ticker, wdate.strftime('%Y%m%d'), trading_days, price_change_cache
    )

# 9. Inspect the result: a bare last expression renders as a rich notebook table
#    instead of the wrapped plain-text dump that print() produces.
df_sample.head()


         news_id                wdate  \
0  20250325_0180  2025-03-25 05:40:00   
1  20240802_0119  2024-08-02 10:00:00   
2  20240708_0187  2024-07-08 09:21:00   
3  20240229_0285  2024-02-29 08:17:00   
4  20250423_0040  2025-04-23 14:52:00   

                                       title  \
0  이른 더위에 비빔면 마케팅 후끈.. 1위 팔도 쫓는 농심·오뚜기 추격 거세   
1        대명소노시즌, 티웨이항공 지분 매입에 ‘上’…경영권 분쟁 가능성   
2          "12만전자 간다"…삼성전자 장중 8.8만원 '신고가' 터치   
3       유니켐, 지난해 매출액 1083억 달성…"자동차용 시트사업 호조"   
4              유안타증권 'EDC KOREA 2025' 스폰서 참여   

                                             article  press  \
0  /사진제공=팔도\n이달부터 이른 더위가 시작되면서 라면업계의 비빔면 경쟁에도 불이 ...  머니투데이   
1  대명소노시즌이 상한가로 직행했다. 대명소노그룹이 티웨이항공 지분을 추가 확보하면서 ...   매일경제   
2  [특징주]\n서울 서초구 삼성전자 서초사옥에 로고가 보이고 있다./사진=뉴시스.\n...  머니투데이   
3  피혁 전문기업 유니켐은 지난해 연결 기준 매출액 1083억원, 영업이익 26억원을 ...  아시아경제   
4  사진=유안타증권\n유안타증권은 오는 25~26일 이틀간 영종도 인스파이어 리조트에서...   한국경제   

                                                 url  \
0  https://n.news.naver.com/m