In [1]:
import pickle
import pandas as pd
import numpy as np
from yahooquery import Ticker
from dateutil.relativedelta import relativedelta

In [3]:
# Load the filings table that the rest of the notebook enriches with prices.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# produced by a trusted step of this project.
df = pd.read_pickle("Data/10Q_tokenized.pkl")

In [4]:
# Placeholder price columns; the price-lookup loop fills them in per filing.
df['price_today'] = None
df['price_3mo'] = None

# A filing may carry a list of symbols: keep the first one. An empty list
# becomes None instead of raising IndexError on x[0].
df['tickers'] = df['tickers'].apply(
    lambda x: (x[0] if x else None) if isinstance(x, list) else x
)

# Parse the dates *before* filtering so the cutoff is a real date comparison
# rather than a string comparison that only works for ISO-formatted text.
df['filingDate'] = pd.to_datetime(df['filingDate'])

# Rows without a ticker cannot be priced; dropping them here also stops
# astype(str) below from turning a missing value into the symbol "NONE".
has_ticker = df['tickers'].notna()
before_cutoff = df['filingDate'] < pd.Timestamp("2025-08-01")

# .copy() gives an independent frame, so later column assignments do not
# write to a slice of the original (no SettingWithCopyWarning).
df = df[has_ticker & before_cutoff].copy()

# Normalise symbols so grouping and lookups are case/whitespace-insensitive.
df['tickers'] = df['tickers'].astype(str).str.strip().str.upper()
df.head()

Unnamed: 0,tickers,companyName,accessionNumber,document,filingDate,url,content,price_today,price_3mo
1,MMM,3M CO,0000066740-25-000063,mmm-20250630.htm,2025-07-18,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",,
2,MMM,3M CO,0000066740-25-000039,mmm-20250331.htm,2025-04-22,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",,
3,MMM,3M CO,0000066740-24-000101,mmm-20240930.htm,2024-10-22,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",,
4,MMM,3M CO,0000066740-24-000080,mmm-20240630.htm,2024-07-26,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",,
5,MMM,3M CO,0000066740-24-000053,mmm-20240331.htm,2024-04-30,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",,


In [6]:
def get_nearest_date(target_date, available_dates):
    """Return the entry of ``available_dates`` that lies closest to ``target_date``.

    Closeness is the absolute difference between each entry and the target.
    ``available_dates`` must support subtraction of ``target_date`` and
    positional indexing (e.g. a pandas DatetimeIndex or a NumPy array).
    """
    distances = np.abs(available_dates - target_date)
    closest_position = np.argmin(distances)
    return available_dates[closest_position]

In [7]:
# Attach two closing prices to every filing: one at the filing date and one
# three calendar months later. History is downloaded once per ticker.

# If the closest trading day is further than this from the date we want, the
# price is left empty instead of silently using a stale quote (e.g. when the
# 3-month target lies beyond the last day of downloaded history).
MAX_DATE_GAP = pd.Timedelta(days=7)
PRICE_COLUMNS = ['price_today', 'price_3mo']

for ticker, group in df.groupby('tickers'):
    # Pad the window by a few days so weekends/holidays at either end still
    # have a nearby trading day.
    start_date = group['filingDate'].min() - pd.Timedelta(days=5)
    end_date = group['filingDate'].max() + relativedelta(months=3) + pd.Timedelta(days=5)

    hist = Ticker(ticker).history(start=start_date, end=end_date)

    # NOTE(review): yahooquery appears to hand back a non-DataFrame (an error
    # mapping) for symbols it cannot resolve - confirm against its docs.
    # Either way, anything without usable close prices leaves the group unpriced.
    if not isinstance(hist, pd.DataFrame) or hist.empty or 'close' not in hist.columns:
        df.loc[group.index, PRICE_COLUMNS] = None
        continue

    # Handle multi-index (typical with yahooquery): keep this symbol's rows.
    if isinstance(hist.index, pd.MultiIndex):
        hist = hist.loc[ticker]

    # Ensure the index is datetime, and timezone-naive so it can be subtracted
    # from the (naive) filing dates.
    hist.index = pd.to_datetime(hist.index)
    if getattr(hist.index, 'tz', None) is not None:
        hist.index = hist.index.tz_localize(None)

    # One close per date, so a label lookup always yields a scalar.
    close = hist.loc[~hist.index.duplicated(keep='last'), 'close']

    for idx, filing_date in group['filingDate'].items():
        targets = {
            'price_today': filing_date,
            'price_3mo': filing_date + relativedelta(months=3),
        }
        for column, target in targets.items():
            nearest = get_nearest_date(target, close.index)
            within_reach = abs(nearest - target) <= MAX_DATE_GAP
            df.at[idx, column] = close.loc[nearest] if within_reach else None

df.head()

Unnamed: 0,tickers,companyName,accessionNumber,document,filingDate,url,content,price_today,price_3mo
1,MMM,3M CO,0000066740-25-000063,mmm-20250630.htm,2025-07-18,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",153.229996,152.639999
2,MMM,3M CO,0000066740-25-000039,mmm-20250331.htm,2025-04-22,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",136.330002,151.199997
3,MMM,3M CO,0000066740-24-000101,mmm-20240930.htm,2024-10-22,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",131.729996,149.119995
4,MMM,3M CO,0000066740-24-000080,mmm-20240630.htm,2024-07-26,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",127.160004,124.75
5,MMM,3M CO,0000066740-24-000053,mmm-20240331.htm,2024-04-30,https://www.sec.gov/Archives/edgar/data/000006...,"[item, managements, discussion, analysis, fina...",96.510002,126.75


In [23]:
# The price columns were initialised with None and filled row by row, so they
# are object-dtype; coerce to float so unpriced filings become NaN.
price_today = pd.to_numeric(df['price_today'], errors='coerce')
price_3mo = pd.to_numeric(df['price_3mo'], errors='coerce')

# Relative price move over the three months following the filing.
df['price_change'] = (price_3mo - price_today) / price_today

# Keep only rows with a finite, non-zero move. Without the finiteness check a
# filing with a missing price (NaN) or a zero starting price (inf) would still
# receive a label below - NaN > 0 is False, i.e. it would be tagged "down".
usable = np.isfinite(df['price_change']) & (df['price_change'] != 0)

# .copy() so the new column is added to an independent frame, not a slice.
df = df[usable].copy()

# 1 = price rose over the window, 0 = price fell.
df['direction'] = (df['price_change'] > 0).astype(int)

In [21]:
# Persist the price-enriched filings table for downstream use.
# NOTE(review): in the saved notebook this cell's execution count (21) is lower
# than that of the preceding cell (23), so the pickle on disk may predate the
# price_change / direction columns - restart the kernel and run all cells in
# order before relying on the file.
df.to_pickle("Data/10Q_tokenized_prices.pkl")