# RLIC Enhancement: Data Exploration

This notebook fetches and explores data needed for Investment Clock analysis:
1. Price data from Yahoo Finance (equities, bonds, commodities)
2. Economic data from FRED (GDP, CPI, etc.)
3. Exploratory analysis and visualization

In [None]:
import os
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf
# Suppress every warning for the rest of the session (keeps cell output short,
# but also hides deprecation notices from pandas / yfinance).
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour cycle
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Project layout.
# NOTE(review): `Path('.').parent` is still `Path('.')`, so the root is the
# current working directory, not its parent -- confirm that this is intended.
PROJECT_ROOT = Path('.').parent
DATA_DIR, CACHE_DIR = (PROJECT_ROOT / subdir for subdir in ('data', 'cache'))

# Create the output folders on first run; an existing folder is left alone
for folder in (DATA_DIR, CACHE_DIR):
    folder.mkdir(exist_ok=True)

print(f"Project root: {PROJECT_ROOT.absolute()}")

## 1. Fetch Price Data from Yahoo Finance

In [None]:
# Yahoo Finance symbols used in the Investment Clock analysis, keyed by the
# friendly column name used throughout the notebook. The three groups are
# merged in order, so the key order is: core assets, ETF proxies, extra indices.
_CORE_ASSETS = dict(
    sp500='^GSPC',           # US Stocks
    treasury_10y='^TNX',     # 10Y Treasury Yield
    gold='GC=F',             # Gold Futures
    crude_oil='CL=F',        # Crude Oil Futures
)

# ETF proxies (longer history for some)
_ETF_PROXIES = dict(
    spy='SPY',               # S&P 500 ETF
    tlt='TLT',               # 20+ Year Treasury ETF
    gld='GLD',               # Gold ETF
    dbc='DBC',               # Commodity Index ETF
)

_EXTRA_INDICES = dict(
    nasdaq='^IXIC',          # NASDAQ
    russell2000='^RUT',      # Small Cap
    vix='^VIX',              # Volatility Index
    dxy='DX-Y.NYB',          # US Dollar Index
)

PRICE_TICKERS = {**_CORE_ASSETS, **_ETF_PROXIES, **_EXTRA_INDICES}

# Requested history window: fixed start, today's date as the end (YYYY-MM-DD)
START_DATE = '1990-01-01'
END_DATE = format(datetime.now(), '%Y-%m-%d')

print(f"Fetching data from {START_DATE} to {END_DATE}")

In [None]:
# Download the full history for every configured symbol in one request
ticker_symbols = [*PRICE_TICKERS.values()]
print(f"Downloading {len(ticker_symbols)} tickers...")

raw_data = yf.download(
    ticker_symbols,
    start=START_DATE,
    end=END_DATE,
    progress=True,
)
print(f"\nDownloaded data shape: {raw_data.shape}")

In [None]:
# Extract Close prices and rename columns.
# When the download has two column levels, the 'Close' key selects one column
# per ticker; otherwise the frame is copied unchanged.
if isinstance(raw_data.columns, pd.MultiIndex):
    prices = raw_data['Close'].copy()
else:
    prices = raw_data.copy()

# Rename Yahoo symbols to the friendly names from PRICE_TICKERS; columns that
# have no entry in the mapping keep their original label.
reverse_map = {v: k for k, v in PRICE_TICKERS.items()}
prices.columns = [reverse_map.get(c, c) for c in prices.columns]

print("Price data summary:")
# DataFrame.info() writes to stdout itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
prices.info()
print("\nFirst available dates:")
print(prices.apply(lambda x: x.first_valid_index()))

In [None]:
# Persist the close-price table next to the other notebook outputs
prices_path = DATA_DIR / 'prices.parquet'
prices.to_parquet(prices_path)
print(f"Saved prices to {prices_path}")

In [None]:
# Plot price histories: one panel per core asset on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (column in `prices`, panel title, line colour, y-axis label), listed in
# row-major grid order: top-left, top-right, bottom-left, bottom-right
price_panels = [
    ('sp500', 'S&P 500', 'blue', 'Price'),
    ('treasury_10y', '10Y Treasury Yield', 'green', 'Yield (%)'),
    ('gold', 'Gold', 'gold', 'Price ($/oz)'),
    ('crude_oil', 'Crude Oil', 'brown', 'Price ($/barrel)'),
]

for ax, (column, panel_title, line_color, y_label) in zip(axes.flat, price_panels):
    # Drop missing rows so each line starts at its first available quote
    prices[column].dropna().plot(ax=ax, title=panel_title, color=line_color)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.savefig(DATA_DIR / 'price_histories.png', dpi=150)
plt.show()

## 2. Fetch Economic Data from FRED

Note: FRED API requires a free API key. Get one at: https://fred.stlouisfed.org/docs/api/api_key.html

This notebook fetches the series with pandas-datareader, which does not need an API key, so you can run it without one.

In [None]:
# FRED configuration.
# The key is read from the FRED_API_KEY environment variable so that it is
# never stored in (or committed with) the notebook; it is None when unset.
# In this notebook as written the key is not used by any later cell.
FRED_API_KEY = os.environ.get('FRED_API_KEY')

# Key FRED series for Investment Clock, keyed by the friendly column name.
# The trailing comments record each series' publication frequency.
FRED_SERIES = {
    # Growth indicators
    'gdp_real': 'GDPC1',              # Real GDP (Quarterly)
    'industrial_prod': 'INDPRO',       # Industrial Production (Monthly)
    'capacity_util': 'TCU',            # Capacity Utilization (Monthly)
    
    # Inflation indicators
    'cpi_all': 'CPIAUCSL',             # CPI All Urban (Monthly)
    'cpi_core': 'CPILFESL',            # Core CPI (Monthly)
    'pce_price': 'PCEPI',              # PCE Price Index (Monthly)
    
    # Labor market
    'unemployment': 'UNRATE',           # Unemployment Rate (Monthly)
    'nonfarm_payrolls': 'PAYEMS',      # Nonfarm Payrolls (Monthly)
    
    # Interest rates
    'fed_funds': 'FEDFUNDS',           # Fed Funds Rate (Monthly)
    'treasury_10y_rate': 'GS10',       # 10-Year Treasury (Monthly)
    'treasury_2y_rate': 'GS2',         # 2-Year Treasury (Monthly)
    'spread_10y2y': 'T10Y2Y',          # 10Y-2Y Spread (Daily)
    
    # Money supply & sentiment
    'm2': 'M2SL',                       # M2 Money Supply (Monthly)
    'consumer_sentiment': 'UMCSENT',   # Consumer Sentiment (Monthly)
    'leading_index': 'USSLIND',        # Leading Index (Monthly)
}

print(f"Will fetch {len(FRED_SERIES)} economic series")

In [None]:
# Use pandas-datareader to fetch FRED data (no API key needed for basic access)
import pandas_datareader as pdr

def fetch_fred_series(series_dict, start_date, end_date):
    """Download several FRED series and combine them into one DataFrame.

    Args:
        series_dict: Mapping of output column name -> FRED series id.
        start_date: First date to request, passed through to pandas-datareader.
        end_date: Last date to request, passed through to pandas-datareader.

    Returns:
        A DataFrame with one column per series that downloaded successfully.
        A series that fails for any reason is reported on stdout and left
        out, so the result can have fewer columns than `series_dict` has keys.
    """
    columns = {}

    for column_name, fred_id in series_dict.items():
        try:
            frame = pdr.get_data_fred(fred_id, start=start_date, end=end_date)
            columns[column_name] = frame[fred_id]
            print(f"  ✓ {column_name} ({fred_id}): {len(frame)} observations")
        except Exception as err:
            # Best effort: one unavailable series must not abort the whole fetch
            print(f"  ✗ {column_name} ({fred_id}): {err}")

    return pd.DataFrame(columns)

print("Fetching FRED data...")
fred_data = fetch_fred_series(
    series_dict=FRED_SERIES,
    start_date=START_DATE,
    end_date=END_DATE,
)
# Report how many series actually came back (failed ones are dropped)
print(f"\nFetched {fred_data.shape[1]} series")

In [None]:
# Summary of FRED data
print("FRED data summary:")
# DataFrame.info() prints its report directly and returns None; calling it
# inside print() would add a stray "None" line.
fred_data.info()

# First and last non-missing observation for every series
print("\nDate ranges:")
for col in fred_data.columns:
    first_obs = fred_data[col].first_valid_index()
    last_obs = fred_data[col].last_valid_index()
    # Both are None when the column has no valid value at all; skip those
    if first_obs is not None:
        print(f"  {col}: {first_obs.date()} to {last_obs.date()}")

In [None]:
# Persist the combined FRED table next to the other notebook outputs
fred_path = DATA_DIR / 'fred_data.parquet'
fred_data.to_parquet(fred_path)
print(f"Saved FRED data to {fred_path}")

In [None]:
# Plot key economic indicators on a 3x2 grid.
# `fred_data` combines series of different frequencies (quarterly, monthly and
# daily, per the FRED_SERIES comments) on one shared index, so a column can
# have NaN rows between its real observations. Every series is therefore
# reduced to its own observations with dropna() BEFORE any period-based
# calculation; otherwise "12 periods back" would mean 12 index rows, not 12 months.
fig, axes = plt.subplots(3, 2, figsize=(14, 12))

# Real GDP
ax = axes[0, 0]
if 'gdp_real' in fred_data.columns:
    fred_data['gdp_real'].dropna().plot(ax=ax, title='Real GDP', color='blue')
    ax.set_ylabel('Billions $')

# CPI YoY
ax = axes[0, 1]
if 'cpi_all' in fred_data.columns:
    # Monthly observations only, so a 12-period change is a year-over-year change
    cpi_yoy = fred_data['cpi_all'].dropna().pct_change(12) * 100
    cpi_yoy.dropna().plot(ax=ax, title='CPI YoY %', color='red')
    ax.axhline(y=2, color='gray', linestyle='--', alpha=0.5, label='2% target')
    ax.set_ylabel('YoY %')
    ax.legend()

# Unemployment
ax = axes[1, 0]
if 'unemployment' in fred_data.columns:
    fred_data['unemployment'].dropna().plot(ax=ax, title='Unemployment Rate', color='orange')
    ax.set_ylabel('%')

# Fed Funds Rate
ax = axes[1, 1]
if 'fed_funds' in fred_data.columns:
    fred_data['fed_funds'].dropna().plot(ax=ax, title='Fed Funds Rate', color='purple')
    ax.set_ylabel('%')

# 10Y-2Y Spread
ax = axes[2, 0]
if 'spread_10y2y' in fred_data.columns:
    fred_data['spread_10y2y'].dropna().plot(ax=ax, title='10Y-2Y Treasury Spread', color='green')
    # Values below zero mean the curve is inverted
    ax.axhline(y=0, color='red', linestyle='--', alpha=0.7, label='Inversion')
    ax.set_ylabel('%')
    ax.legend()

# Industrial Production YoY
ax = axes[2, 1]
if 'industrial_prod' in fred_data.columns:
    # Same treatment as CPI: drop gap rows first, then take the 12-month change
    ip_yoy = fred_data['industrial_prod'].dropna().pct_change(12) * 100
    ip_yoy.dropna().plot(ax=ax, title='Industrial Production YoY %', color='brown')
    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    ax.set_ylabel('YoY %')

plt.tight_layout()
plt.savefig(DATA_DIR / 'economic_indicators.png', dpi=150)
plt.show()

## 3. Compute Investment Clock Indicators

Key indicators for phase classification:
- **Growth**: Industrial Production YoY, GDP growth
- **Inflation**: CPI YoY

In [None]:
# Create monthly dataset for Investment Clock analysis.
# The index is month-end dates from START_DATE up to END_DATE; every column
# assigned below is aligned to it by date.
# NOTE: pandas >= 2.2 deprecates the 'M' alias in favour of 'ME'. 'M' is kept
# here because 'ME' is not accepted by older pandas versions.
monthly = pd.DataFrame(index=pd.date_range(start=START_DATE, end=END_DATE, freq='M'))

# Resample FRED data to month-end; last() takes the latest non-missing value
# within each month.
fred_monthly = fred_data.resample('M').last()

# Calculate key indicators (12-month percentage change of monthly series)
if 'cpi_all' in fred_monthly.columns:
    monthly['cpi_yoy'] = fred_monthly['cpi_all'].pct_change(12) * 100

if 'industrial_prod' in fred_monthly.columns:
    monthly['ip_yoy'] = fred_monthly['industrial_prod'].pct_change(12) * 100

if 'gdp_real' in fred_monthly.columns:
    # GDP is quarterly, so after the monthly resample two months out of three
    # are empty. Compute the year-over-year change on the quarterly
    # observations (4 quarters back), then forward fill each quarter's value
    # into its two following months. A 4-period change taken directly on the
    # month-end series would span 4 months, not one year.
    gdp_quarterly = fred_monthly['gdp_real'].dropna()
    gdp_yoy_quarterly = gdp_quarterly.pct_change(4) * 100
    monthly['gdp_yoy'] = gdp_yoy_quarterly.reindex(fred_monthly.index).ffill(limit=2)

# Add other indicators as month-end levels
for col in ['unemployment', 'fed_funds', 'spread_10y2y', 'consumer_sentiment']:
    if col in fred_monthly.columns:
        monthly[col] = fred_monthly[col]

# Resample prices to monthly (last close of each month)
prices_monthly = prices.resample('M').last()

# Calculate monthly returns, in percent
monthly['sp500_ret'] = prices_monthly['sp500'].pct_change() * 100
monthly['gold_ret'] = prices_monthly['gold'].pct_change() * 100
monthly['oil_ret'] = prices_monthly['crude_oil'].pct_change() * 100

# For bonds, use inverse of yield change as return proxy (simplified):
# a falling yield gives a positive value. This is a change in the quoted
# yield, not a percentage return like the three columns above.
if 'treasury_10y' in prices_monthly.columns:
    monthly['bond_ret'] = -prices_monthly['treasury_10y'].diff()  # Simplified proxy

print("Monthly dataset shape:", monthly.shape)
monthly.dropna(how='all').tail(10)

In [None]:
# Persist the monthly indicator / return table
monthly_path = DATA_DIR / 'monthly_dataset.parquet'
monthly.to_parquet(monthly_path)
print(f"Saved monthly dataset to {monthly_path}")

## 4. Investment Clock Phase Classification

Traditional approach:
- **Growth**: Use momentum of Industrial Production YoY (above/below 6M and 12M averages)
- **Inflation**: Use momentum of CPI YoY (above/below 6M and 12M averages)

| Phase | Growth | Inflation |
|-------|--------|----------|
| Reflation | Falling | Falling |
| Recovery | Rising | Falling |
| Overheat | Rising | Rising |
| Stagflation | Falling | Rising |

In [None]:
def compute_momentum_signal(series, short_window=6, long_window=12):
    """
    Turn a series into a three-state momentum signal using two moving averages.

    Args:
        series: Numeric pandas Series to evaluate.
        short_window: Length of the short rolling mean (default 6).
        long_window: Length of the long rolling mean (default 12).

    Returns:
        Integer Series on the same index as `series`: 1 where the value is
        above both rolling means (rising), -1 where it is below both
        (falling), 0 otherwise. Rows where either rolling mean is not yet
        defined also get 0, because comparisons against NaN are False.
    """
    ma_short = series.rolling(short_window).mean()
    ma_long = series.rolling(long_window).mean()

    is_rising = series.gt(ma_short) & series.gt(ma_long)
    is_falling = series.lt(ma_short) & series.lt(ma_long)

    # Start neutral everywhere, then stamp the two directional states
    signal = pd.Series(0, index=series.index)
    signal[is_rising] = 1
    signal[is_falling] = -1

    return signal


def classify_investment_clock_phase(growth_signal, inflation_signal):
    """
    Map paired growth / inflation signals to an Investment Clock phase label.

    Args:
        growth_signal: Series where 1 = rising and -1 = falling.
        inflation_signal: Series where 1 = rising and -1 = falling.

    Returns:
        Series of phase names on `growth_signal`'s index. Any combination
        other than the four below (for example a signal of 0) is 'Unknown'.
    """
    # (growth, inflation) -> phase
    phase_by_signals = {
        (-1, -1): 'Reflation',    # Growth falling, Inflation falling
        (1, -1): 'Recovery',      # Growth rising, Inflation falling
        (1, 1): 'Overheat',       # Growth rising, Inflation rising
        (-1, 1): 'Stagflation',   # Growth falling, Inflation rising
    }

    phase = pd.Series('Unknown', index=growth_signal.index)
    for (growth_state, inflation_state), label in phase_by_signals.items():
        matches = (growth_signal == growth_state) & (inflation_signal == inflation_state)
        phase[matches] = label

    return phase

In [None]:
# Momentum signals for the growth (IP YoY) and inflation (CPI YoY) indicators,
# each computed on its own non-missing history
ip_signal_full = compute_momentum_signal(monthly['ip_yoy'].dropna())
cpi_signal_full = compute_momentum_signal(monthly['cpi_yoy'].dropna())

# Keep only the months for which both signals exist
common_idx = ip_signal_full.index.intersection(cpi_signal_full.index)
growth_signal = ip_signal_full.loc[common_idx]
inflation_signal = cpi_signal_full.loc[common_idx]

# One phase label per common month
phases = classify_investment_clock_phase(growth_signal, inflation_signal)

# Attach the results to the monthly dataset; months outside `common_idx`
# are left as NaN by the index alignment
for column_name, values in (
    ('growth_signal', growth_signal),
    ('inflation_signal', inflation_signal),
    ('phase', phases),
):
    monthly[column_name] = values

print("Phase distribution:")
print(phases.value_counts())

In [None]:
# Visualize phases over time: four stacked panels sharing the date axis
# (growth indicator, inflation indicator, phase band, S&P 500).
fig, axes = plt.subplots(4, 1, figsize=(14, 12), sharex=True)

# Define phase colors
phase_colors = {
    'Reflation': 'blue',
    'Recovery': 'green',
    'Overheat': 'red',
    'Stagflation': 'orange',
    'Unknown': 'gray'
}

# Plot 1: Growth indicator with signal
# The area between zero and the line is shaded by the momentum signal.
ax = axes[0]
ax.plot(monthly.index, monthly['ip_yoy'], 'b-', label='IP YoY %', alpha=0.7)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax.fill_between(monthly.index, 0, monthly['ip_yoy'], 
                where=monthly['growth_signal']==1, color='green', alpha=0.3, label='Rising')
ax.fill_between(monthly.index, 0, monthly['ip_yoy'], 
                where=monthly['growth_signal']==-1, color='red', alpha=0.3, label='Falling')
ax.set_ylabel('IP YoY %')
ax.set_title('Growth Indicator (Industrial Production YoY)')
ax.legend(loc='upper right')

# Plot 2: Inflation indicator with signal
ax = axes[1]
ax.plot(monthly.index, monthly['cpi_yoy'], 'r-', label='CPI YoY %', alpha=0.7)
ax.axhline(y=2, color='gray', linestyle='--', alpha=0.5, label='2% target')
ax.fill_between(monthly.index, 0, monthly['cpi_yoy'], 
                where=monthly['inflation_signal']==1, color='red', alpha=0.3, label='Rising')
ax.fill_between(monthly.index, 0, monthly['cpi_yoy'], 
                where=monthly['inflation_signal']==-1, color='blue', alpha=0.3, label='Falling')
ax.set_ylabel('CPI YoY %')
ax.set_title('Inflation Indicator (CPI YoY)')
ax.legend(loc='upper right')

# Plot 3: Investment Clock phase
# A full-height band coloured by phase; phases that never occur are skipped
# so they do not show up in the legend.
ax = axes[2]
for phase_name, color in phase_colors.items():
    mask = phases == phase_name
    if mask.any():
        ax.fill_between(phases.index, 0, 1, where=mask, color=color, alpha=0.6, label=phase_name)
ax.set_ylabel('Phase')
ax.set_title('Investment Clock Phase')
ax.legend(loc='upper right', ncol=5)
ax.set_yticks([])  # the band carries no numeric scale

# Plot 4: S&P 500, restricted to the date span covered by the phases
ax = axes[3]
sp500_aligned = prices_monthly['sp500'].loc[phases.index[0]:phases.index[-1]]
ax.plot(sp500_aligned.index, sp500_aligned, 'k-', label='S&P 500')
ax.set_ylabel('S&P 500')
ax.set_title('S&P 500 Index')
ax.set_xlabel('Date')

plt.tight_layout()
plt.savefig(DATA_DIR / 'investment_clock_phases.png', dpi=150)
plt.show()

## 5. Asset Performance by Phase

In [None]:
# Return statistics per phase.
# Keep only months that have a phase label, and drop the 'Unknown' label.
return_cols = ['sp500_ret', 'gold_ret', 'oil_ret', 'bond_ret']
labelled = monthly[['phase'] + return_cols].dropna(subset=['phase'])
valid_data = labelled[labelled['phase'] != 'Unknown']

# Mean, standard deviation and observation count of every return column
phase_stats = valid_data.groupby('phase').agg(
    {col: ['mean', 'std', 'count'] for col in return_cols}
)

print("Average Monthly Returns by Phase (%):\n")
print(phase_stats.round(2))

In [None]:
# Mean return per phase, with display names for the asset columns
asset_labels = {
    'sp500_ret': 'Stocks',
    'gold_ret': 'Gold',
    'oil_ret': 'Oil',
    'bond_ret': 'Bonds',
}

# Rows follow the clock order; a phase absent from the data becomes a NaN row
phase_order = ['Reflation', 'Recovery', 'Overheat', 'Stagflation']
mean_returns = (
    valid_data.groupby('phase')[list(asset_labels)]
    .mean()
    .rename(columns=asset_labels)
    .reindex(phase_order)
)

print("Average Monthly Returns by Phase (%):\n")
print(mean_returns.round(3))

# Expected best assets (from theory)
print("\nExpected Best Asset per Phase:")
for theory_line in (
    "  Reflation: Bonds",
    "  Recovery: Stocks",
    "  Overheat: Commodities (Oil/Gold)",
    "  Stagflation: Cash",
):
    print(theory_line)

In [None]:
# Visualize returns by phase: grouped bars, one group per phase and one bar
# per asset class.
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(phase_order))
width = 0.2

# (column in mean_returns, horizontal offset in bar widths, bar colour).
# The offsets centre the four bars of a group on the phase's tick.
bar_specs = [
    ('Stocks', -1.5, 'blue'),
    ('Bonds', -0.5, 'green'),
    ('Gold', 0.5, 'gold'),
    ('Oil', 1.5, 'brown'),
]
bar_groups = [
    ax.bar(x + offset * width, mean_returns[asset], width, label=asset, color=color)
    for asset, offset, color in bar_specs
]

ax.set_ylabel('Average Monthly Return (%)')
ax.set_title('Asset Class Performance by Investment Clock Phase')
ax.set_xticks(x)
ax.set_xticklabels(phase_order)
ax.legend()
ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)

# Add value labels on bars: above the bar for gains, below it for losses
for bars in bar_groups:
    for bar in bars:
        height = bar.get_height()
        # A phase that never occurred has a NaN mean; there is no bar to
        # label, and annotating at a non-finite position is not meaningful.
        if np.isnan(height):
            continue
        ax.annotate(f'{height:.2f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3 if height >= 0 else -10),
                    textcoords="offset points",
                    ha='center', va='bottom' if height >= 0 else 'top',
                    fontsize=8)

plt.tight_layout()
plt.savefig(DATA_DIR / 'returns_by_phase.png', dpi=150)
plt.show()

In [None]:
# For every phase, list the asset classes from highest to lowest mean return
print("Asset Ranking by Phase (best to worst):\n")
for phase in phase_order:
    best_to_worst = mean_returns.loc[phase].sort_values(ascending=False).index
    print(f"{phase}: {' > '.join(best_to_worst)}")

## 6. Summary Statistics

In [None]:
# Phase duration analysis.
# A "run" starts at every month whose phase differs from the previous month's
# (the first month always counts, since its shifted predecessor is missing).
phase_changes = phases[phases != phases.shift(1)]
run_starts = phase_changes.index

# Each run lasts until the next run starts. The final run has no end yet and
# is left out. Durations are whole calendar months between the two stamps,
# which is exact for a monthly index (dividing a day count by 30 is not).
phase_durations = [
    {
        'phase': phase_changes.iloc[i],
        'start': start,
        'end': end,
        'duration_months': (end.year - start.year) * 12 + (end.month - start.month),
    }
    for i, (start, end) in enumerate(zip(run_starts[:-1], run_starts[1:]))
]

# Explicit columns keep the frame well-formed even when there are no
# completed runs (otherwise the 'phase' lookup below would raise KeyError).
duration_df = pd.DataFrame(
    phase_durations,
    columns=['phase', 'start', 'end', 'duration_months'],
)
duration_df = duration_df[duration_df['phase'] != 'Unknown']

print("Average Phase Duration (months):")
if duration_df.empty:
    print("  (no completed phases)")
else:
    print(duration_df.groupby('phase')['duration_months'].agg(['mean', 'std', 'min', 'max', 'count']).round(1))

In [None]:
# Write the phase-annotated monthly table and the phase-duration table
monthly_phases_path = DATA_DIR / 'monthly_with_phases.parquet'
durations_path = DATA_DIR / 'phase_durations.csv'

monthly.to_parquet(monthly_phases_path)
duration_df.to_csv(durations_path, index=False)

print("Saved:")
for saved_path in (monthly_phases_path, durations_path):
    print(f"  - {saved_path}")

## Summary

### Data Collected
- **Price data**: S&P 500, Treasury yields, Gold, Oil (from Yahoo Finance)
- **Economic data**: GDP, CPI, Industrial Production, Unemployment, Fed Funds, etc. (from FRED)

### Investment Clock Implementation
- Used Industrial Production YoY for growth signal
- Used CPI YoY for inflation signal
- Classified into 4 phases: Reflation, Recovery, Overheat, Stagflation

### Key Findings
- Phase distribution and duration statistics computed
- Asset performance by phase analyzed
- Comparison with theoretical expectations

### Next Steps (for ML Enhancement)
1. Add more indicators for phase classification
2. Implement regime detection using ML (HMM, clustering)
3. Build backtesting framework
4. Compare ML approach vs traditional approach