In [16]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import requests

In [17]:
def fetch_market_indicators(start_date: datetime, end_date=None) -> pd.DataFrame:
    """
    Fetch market relationship indicators from Yahoo Finance.
    
    Parameters:
    start_date (str): Start date in 'YYYY-MM-DD' format
    end_date (str): End date in 'YYYY-MM-DD' format, defaults to today if None
    
    Returns:
    pandas.DataFrame: DataFrame containing market indicators
    """
    # If no end date specified, use today
    if end_date is None:
        end_date = datetime.today().strftime('%Y-%m-%d')
    
    # Define the symbols to fetch
    symbols = {
        # Original symbols
        'GC=F': 'Gold',
        'JPY=X': 'USD/JPY',
        '^FVX': '5-Year Treasury',
        '^GSPC': 'S&P 500',
        '^TYX': '30-Year Treasury',
        '^VIX': 'VIX',
        
        # New additions
        '^AXJO': 'ASX 200 (Australia)',
        '^FTSE': 'FTSE 100 (UK)',
        '^GDAXI': 'DAX (Germany)',
        'EUR=X': 'EUR/USD',
        'CL=F': 'Crude Oil',
        '^TNX': '10-Year Treasury Yield',
        '^IRX': '13-Week Treasury Yield',
        'SPY': 'S&P 500 ETF',
        '^NYHILO': 'NYSE New High/Low Index',
        
        # Additional bond ETFs
        'TLT': '20+ Year Treasury Bond ETF',
        # 'IEF': '7-10 Year Treasury Bond ETF',
        # 'SHY': '1-3 Year Treasury Bond ETF',
        'LQD': 'Investment Grade Corporate Bond ETF',
        # 'HYG': 'High Yield Corporate Bond ETF',
        
        # Additional currency pairs
        # 'GBPUSD=X': 'GBP/USD',
        # 'AUDUSD=X': 'AUD/USD',
        # 'CADUSD=X': 'CAD/USD',
        
        # Additional commodities
        # 'SI=F': 'Silver',
        'HG=F': 'Copper',
        # 'NG=F': 'Natural Gas'
    }
    
    # Initialize an empty dictionary to store the data
    data_dict = {}
    
    # Fetch data for each symbol
    for symbol, description in symbols.items():
        try:
            ticker = yf.Ticker(symbol)
            df = ticker.history(start=start_date, end=end_date)
            
            if not df.empty:
                # We'll use the adjusted closing price
                data_dict[symbol] = df['Close']
                print(f"Successfully fetched data for {symbols[symbol]} ({symbol})")
            else:
                print(f"No data available for {symbols[symbol]} ({symbol})")
        except Exception as e:
            print(f"Error fetching data for {symbol}: {str(e)}")
    
    # Combine all series into a single DataFrame
    combined_df = pd.DataFrame(data_dict)
    
    return combined_df

In [18]:
def generate_summary_stats(df):
    """
    Generate summary statistics for the dataset.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame
    
    Returns:
    pandas.DataFrame: Summary statistics
    """
    stats = pd.DataFrame({
        'start_date': df.index.min(),
        'end_date': df.index.max(),
        'data_points': df.count(),
        'missing_pct': (df.isna().sum() / len(df) * 100).round(2),
        'mean': df.mean().round(4),
        'std': df.std().round(4),
        'min': df.min().round(4),
        'max': df.max().round(4)
    }).T
    
    return stats

In [19]:
def fill_missing_data(df):
    """
    Fill missing data in financial time series using appropriate methods
    for different types of instruments.
    
    Parameters:
    df (pd.DataFrame): DataFrame with financial instruments and timestamp column 'Date'
    
    Returns:
    pd.DataFrame: Filled DataFrame
    """
    # Create a copy to avoid modifying the original
    df_filled = df.copy()
    
    # Convert timestamp to datetime
    df_filled['datetime'] = pd.to_datetime(df_filled['Date'], unit='ms')
    df_filled = df_filled.set_index('datetime')
    
    # Group columns by type
    market_indices = ['^GSPC', '^FTSE', '^GDAXI', '^AXJO']
    currencies = ['JPY=X', 'EUR=X']
    commodities = ['GC=F', 'CL=F', 'HG=F']
    rates = ['^FVX', '^TYX', '^TNX', '^IRX']
    etfs = ['SPY', 'TLT', 'LQD']
    
    # Fill market indices during their trading hours
    for idx in market_indices:
        df_filled[idx] = df_filled[idx].fillna(method='ffill', limit=8)
    
    # Interpolate currencies with time-weighted values
    for curr in currencies:
        df_filled[curr] = df_filled[curr].interpolate(method='time', limit=4)
    
    # Forward fill commodities but reset at day boundaries
    for comm in commodities:
        df_filled[comm] = df_filled.groupby(df_filled.index.date)[comm].fillna(method='ffill')
    
    # Interpolate rates linearly within same trading day
    for rate in rates:
        df_filled[rate] = df_filled.groupby(df_filled.index.date)[rate].apply(
            lambda x: x.interpolate(method='linear', limit=4)
        )
    
    # Forward fill ETFs similar to their underlying indices
    for etf in etfs:
        df_filled[etf] = df_filled[etf].fillna(method='ffill', limit=8)
    
    # Special handling for VIX - use ffill with shorter window
    df_filled['^VIX'] = df_filled['^VIX'].fillna(method='ffill', limit=4)
    
    # Keep the original Date column
    df_filled['Date'] = df['Date']
    
    return df_filled

In [20]:
# Fetch the data
market_indicators_data = fetch_market_indicators(datetime.now() -  timedelta(days=1 * 365))
# fs = FeatureStorage('./Data/market_indicators.parquet')
# fs.save_features(market_indicators_data)


Successfully fetched data for Gold (GC=F)
Successfully fetched data for USD/JPY (JPY=X)
Successfully fetched data for 5-Year Treasury (^FVX)
Successfully fetched data for S&P 500 (^GSPC)
Successfully fetched data for 30-Year Treasury (^TYX)
Successfully fetched data for VIX (^VIX)
Successfully fetched data for ASX 200 (Australia) (^AXJO)
Successfully fetched data for FTSE 100 (UK) (^FTSE)
Successfully fetched data for DAX (Germany) (^GDAXI)
Successfully fetched data for EUR/USD (EUR=X)
Successfully fetched data for Crude Oil (CL=F)
Successfully fetched data for 10-Year Treasury Yield (^TNX)


$^NYHILO: possibly delisted; no timezone found


Successfully fetched data for 13-Week Treasury Yield (^IRX)
Successfully fetched data for S&P 500 ETF (SPY)
No data available for NYSE New High/Low Index (^NYHILO)
Successfully fetched data for 20+ Year Treasury Bond ETF (TLT)
Successfully fetched data for Investment Grade Corporate Bond ETF (LQD)
Successfully fetched data for Copper (HG=F)


In [21]:
market_indicators_data

Unnamed: 0_level_0,GC=F,JPY=X,^FVX,^GSPC,^TYX,^VIX,^AXJO,^FTSE,^GDAXI,EUR=X,CL=F,^TNX,^IRX,SPY,TLT,LQD,HG=F
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-12-25 00:00:00+00:00,,142.341995,,,,,,,,0.90690,,,,,,,
2023-12-26 00:00:00+00:00,,142.229996,,,,,,,,0.90742,,,,,,,
2023-12-26 05:00:00+00:00,2058.199951,,,4774.750000,,,,,,,75.570000,,,469.625946,94.863472,105.614861,3.8955
2023-12-26 06:00:00+00:00,,,3.875,,4.043,12.990000,,,,,,3.886,5.203,,,,
2023-12-26 13:00:00+00:00,,,,,,,7561.200195,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-19 06:00:00+00:00,,,4.425,,4.740,24.090000,,,,,,4.570,4.220,,,,
2024-12-19 23:00:00+00:00,,,,,,,,,19884.75,,,,,,,,
2024-12-20 00:00:00+00:00,,157.643997,,,,,,8084.600098,,0.96479,,,,,,,
2024-12-20 05:00:00+00:00,2640.500000,,,5930.850098,,,,,,,69.580002,,,591.150024,88.309998,106.980003,4.0985


In [22]:
# Generate and print summary statistics
summary_stats = generate_summary_stats(market_indicators_data)
print("\nSummary Statistics:")
print(summary_stats)
    
print("\nColumns in market indicators dataset:")
for col in market_indicators_data.columns:
    print(f"- {col}")


Summary Statistics:
                                  GC=F                      JPY=X  \
start_date   2023-12-25 00:00:00+00:00  2023-12-25 00:00:00+00:00   
end_date     2024-12-20 06:00:00+00:00  2024-12-20 06:00:00+00:00   
data_points                        250                        260   
missing_pct                      80.24                      79.45   
mean                         2379.3848                   151.0693   
std                           225.3726                     5.1156   
min                             1990.3                     140.79   
max                             2788.5                    161.621   

                                  ^FVX                      ^GSPC  \
start_date   2023-12-25 00:00:00+00:00  2023-12-25 00:00:00+00:00   
end_date     2024-12-20 06:00:00+00:00  2024-12-20 06:00:00+00:00   
data_points                        250                        250   
missing_pct                      80.24                      80.24   
mean        

In [23]:
market_indicators_data = fill_missing_data(market_indicators_data)

# Generate and print summary statistics
summary_stats = generate_summary_stats(market_indicators_data)
print("\nSummary Statistics:")
print(summary_stats)
    
print("\nColumns in market indicators dataset:")
for col in market_indicators_data.columns:
    print(f"- {col}")

KeyError: 'Date'