In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from pandas.tseries.offsets import MonthEnd

In [3]:
# Metadata sheet of the workbook, indexed by its 'Data ID' column.
meta = pd.read_excel(
    'ews_nb_0626.xlsx',
    sheet_name='meta',
    index_col='Data ID',
)

In [4]:
# Load the latest data vintage; the first CSV column becomes the index and is
# parsed as dates.
latest = pd.read_csv('latest_vintage.csv', index_col=0)
latest.index = pd.to_datetime(latest.index)
# Snap every date forward to its month end (dates already on a month end are
# left unchanged).
latest.index = latest.index + MonthEnd(0)
# Declare the month-end frequency. The offset object is used instead of the
# string alias 'M', which pandas deprecated in 2.2 in favour of 'ME', so this
# line works on both older and newer pandas versions.
# Assigning a freq also validates the index: pandas raises ValueError when the
# dates do not form a regular month-end sequence.
latest.index.freq = MonthEnd()

In [5]:
# Keep only the metadata rows whose 'Feature' column equals 'Y'.
features = meta.loc[meta['Feature'].eq('Y')]

In [6]:
# Vintage dates: every Monday from 2002-01-01 through 2025-06-23 (inclusive).
mondays = pd.date_range('2002-01-01', '2025-06-23', freq='W-MON')

In [7]:
# Helper: date of the n-th given weekday (Monday by default) of a month
def get_nth_weekday_of_month(year, month, n, weekday=0):
    """Return the n-th occurrence of ``weekday`` in ``year``-``month``.

    Args:
        year: Calendar year.
        month: Calendar month (1-12).
        n: 1-based occurrence number; coerced with ``int()``.
        weekday: Target weekday in ``datetime.weekday()`` numbering
            (0 = Monday ... 6 = Sunday).

    Returns:
        A ``datetime`` at midnight. ``n`` is not range-checked, so a value
        larger than the number of such weekdays in the month yields a date in
        a following month (and ``n < 1`` a date before the first occurrence).
    """
    occurrence = int(n)
    month_start = datetime(year, month, 1)
    # Days from the 1st of the month to the first matching weekday (0..6).
    gap = (weekday - month_start.weekday()) % 7
    return month_start + timedelta(days=gap + 7 * (occurrence - 1))

In [8]:
# Directory for the generated files; created if missing, reused if present.
output_dir = 'vintages'
os.makedirs(name=output_dir, exist_ok=True)

In [10]:
# Build one real-time ("vintage") snapshot of the feature panel for every
# Monday in `mondays` and save each as <output_dir>/<YYYY-MM-DD>.csv.
#
# A value for month `pe` is visible in the vintage dated `today` when
#   * Week == 0: always, as soon as `latest` holds that month end.
#     NOTE(review): this includes the current, unfinished month, so a vintage
#     dated mid-month can contain that month's month-end value - confirm this
#     look-ahead is intended.
#   * Week == k (k != 0): `today` is on or after the k-th Monday of month
#     `pe + Delay`.
# Everything else, including months missing from `latest`, is NaN.
#
# Release dates depend only on (variable, month), never on the vintage date,
# so they are computed once here instead of once per Monday per cell, and each
# vintage is then produced with a single vectorised mask.
# Values are unchanged; the only formatting difference versus filling an
# object-dtype frame cell by cell is that an integer column containing gaps is
# written as floats (e.g. 5.0).

feature_ids = list(features.index)

# All months from the first observation in `latest` to the month of the last
# vintage; every vintage uses a leading slice of this range.
last_vintage = mondays.max() if len(mondays) else latest.index[0]
all_periods = pd.period_range(latest.index[0], last_vintage.to_period('M'), freq='M')
vintage_index = all_periods.to_timestamp('M')  # month-end timestamps, matching latest.index

release_by_var = {}
for var in feature_ids:
    lag = int(features.at[var, 'Delay'])
    week = int(features.at[var, 'Week'])
    if week == 0:
        # Timestamp.min sorts before every vintage date -> always released.
        release_by_var[var] = [pd.Timestamp.min] * len(all_periods)
    else:
        release_dates_for_var = []
        for pe in all_periods:  # pe: Period('YYYY-MM')
            pub_month = pe + lag
            pub_date = get_nth_weekday_of_month(pub_month.year, pub_month.month, week)
            release_dates_for_var.append(pd.Timestamp(pub_date))
        release_by_var[var] = release_dates_for_var
release_dates = pd.DataFrame(release_by_var, index=vintage_index, columns=feature_ids)

# Feature columns of `latest` on the full month-end grid. Selecting by label
# raises KeyError if a feature is missing from `latest`; months absent from
# `latest` become all-NaN rows.
latest_features = latest[feature_ids].reindex(vintage_index)

for today in mondays:
    # Rows run through the month that contains `today`.
    month_end = today + MonthEnd(0)
    released = release_dates.loc[:month_end] <= today
    df = latest_features.loc[:month_end].where(released)

    # Save to file (e.g. vintages/2008-04-07.csv)
    fname = os.path.join(output_dir, today.strftime('%Y-%m-%d') + '.csv')
    df.to_csv(fname)