# WRDS OptionMetrics Data Exploration

This notebook demonstrates how to:
1. Connect to WRDS and fetch SPY option chain data
2. Clean and filter the data
3. Explore data quality and distributions
4. Save processed data for calibration

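**Prerequisites:**
- Active WRDS subscription
- WRDS credentials configured: `WRDS_USERNAME` set in the environment or in a `.env` file (the password is expected to come from `~/.pgpass`)
- `wrds` and `python-dotenv` Python packages installed: `pip install wrds python-dotenv`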

In [1]:
# --- Standard library ---
import sys
import warnings
from datetime import datetime

# --- Third-party ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning for the rest of the session to keep cell output
# readable. The filter is installed only after the third-party imports above,
# so anything they warn about at import time is still shown.
warnings.filterwarnings('ignore')

# Put the parent of the working directory first on the import path so the
# local `ivpinn` package below can be resolved (assumes the notebook is run
# from a sub-folder of the project — TODO confirm).
sys.path.insert(0, '..')

# --- Project helpers (must come after the sys.path tweak) ---
from ivpinn.data import (
    fetch_optionmetrics_wrds,
    fetch_security_price_wrds,
    fetch_zero_curve_wrds,
    clean_option_chain,
    compute_mid_price,
    print_data_summary,
    get_rate_interpolator,
)

# Plot appearance: the matplotlib style sheet is applied first, then the
# seaborn colour palette on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette='husl')

print("Imports successful!")

Imports successful!


## 1. Configuration

In [2]:
# Run parameters shared by the cells below
DATE = '2023-01-03'  # trading date, written as YYYY-MM-DD
SYMBOL = 'SPY'       # ticker symbol

# Echo the configuration, one setting per line
print(f"Date: {DATE}", f"Symbol: {SYMBOL}", sep="\n")

Date: 2023-01-03
Symbol: SPY


## 2. Connect to WRDS

In [3]:
import os

import wrds
from dotenv import find_dotenv, load_dotenv

# Open the WRDS session that the fetch cells below use as `db`
print("Connecting to WRDS...")

# Load the .env file located by find_dotenv() into the process environment,
# then read the account name from the WRDS_USERNAME variable.
load_dotenv(find_dotenv())
wrds_user = os.getenv("WRDS_USERNAME")

# Only the username is passed explicitly; per the original note the password
# is expected to come from ~/.pgpass.
db = wrds.Connection(wrds_username=wrds_user)

print("✓ Connected successfully!")

Connecting to WRDS...
Loading library list...
Done
✓ Connected successfully!


## 3. Fetch Option Chain Data

In [4]:
# Fetch option chain
print(f"Fetching option chain for {SYMBOL} on {DATE}...")

# Bind the ticker as a query parameter instead of concatenating it into the
# SQL text, so quoting/escaping is handled by the database driver.
names_query = "SELECT * FROM optionm.optionmnames WHERE ticker = %(ticker)s"
df_options = db.raw_sql(names_query, params={"ticker": SYMBOL})
print(f"✓ Fetched {len(df_options)} option contracts")

# NOTE(review): this query reads the option *names* table and does not filter
# on DATE, even though the message above mentions the date. Later cells index
# df_options['K'], ['T'] and ['mid'], which are not among the columns shown in
# the recorded output of this cell. The imported `fetch_optionmetrics_wrds`
# helper is presumably the intended loader — confirm its signature before
# switching to it.

# Display first few rows
df_options.head()

Fetching option chain for SPY on 2023-01-03...
✓ Fetched 525862 option contracts


Unnamed: 0,secid,symbol,optionid,root,suffix,effect_date,cusip,ticker,class,issuer,issue
0,7571.0,,,,,1996-10-28,81750M10,SPY,,SERENPET INC,
1,100155.0,,,,,2009-11-02,32299W10,SPY,I,SPDR TRUST SERIES 1,INTRADAY
2,109820.0,SFB.AA,31605683.0,SFB,AA,1996-01-02,78462F10,SPY,,SPDR TR,UNIT SER 1
3,109820.0,SFB.AB,31605685.0,SFB,AB,2000-11-28,78462F10,SPY,,SPDR TR,UNIT SER 1
4,109820.0,SFB.AC,31605687.0,SFB,AC,2010-01-28,78462F10,SPY,,SPDR S&P 500 ETF TR,UNIT SER 1 S&P


In [6]:
# Fetch spot price
# The helper's result is indexed with the 'S0' key; S0 is then reused by the
# summary, plotting, cleaning and export cells further down.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: No price data found for SPY on 2023-01-03" — confirm that the
# helper's data source covers DATE before relying on S0.
print(f"Fetching security price for {SYMBOL}...")
price_info = fetch_security_price_wrds(db, DATE, SYMBOL)
S0 = price_info['S0']
print(f"✓ Spot price: S0 = ${S0:.2f}")

Fetching security price for SPY...


ValueError: No price data found for SPY on 2023-01-03

In [None]:
# Fetch zero curve (risk-free rates)
# The result is kept in `df_zero_curve`; the zero-curve section later reads
# its 'days' and 'rate' columns and passes it to get_rate_interpolator.
print("Fetching zero curve...")
df_zero_curve = fetch_zero_curve_wrds(db, DATE)
print(f"✓ Zero curve loaded: {len(df_zero_curve)} points")

# Display zero curve
# (bare last expression so the notebook renders the frame as a table)
df_zero_curve

## 4. Data Quality Check (Raw Data)

In [None]:
# Print raw data summary
# Applied to the unfiltered `df_options`; S0 is forwarded by keyword, so the
# spot-price cell must have completed successfully before this one runs.
print_data_summary(df_options, S0=S0)

In [None]:
# Visualize raw data distributions on a 2x2 grid of histograms
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_strike, ax_ttm), (ax_mid, ax_mny) = axes

# Histogram settings shared by all four panels
hist_kw = dict(bins=50, edgecolor='black', alpha=0.7)

# 1. Strike price distribution, with the spot price marked
ax_strike.hist(df_options['K'], **hist_kw)
ax_strike.axvline(S0, color='red', linestyle='--', linewidth=2, label=f'S0=${S0:.2f}')
ax_strike.set(
    xlabel='Strike Price (K)',
    ylabel='Count',
    title='Strike Price Distribution',
)
ax_strike.legend()

# 2. Time to maturity distribution
ax_ttm.hist(df_options['T'], **hist_kw)
ax_ttm.set(
    xlabel='Time to Maturity (years)',
    ylabel='Count',
    title='Time to Maturity Distribution',
)

# 3. Mid price distribution
ax_mid.hist(df_options['mid'], **hist_kw)
ax_mid.set(
    xlabel='Mid Price',
    ylabel='Count',
    title='Option Mid Price Distribution',
)
# Cut the x-axis at the 95th percentile so the long right tail does not
# squash the bulk of the distribution
mid_p95 = df_options['mid'].quantile(0.95)
ax_mid.set_xlim(0, mid_p95)

# 4. Moneyness distribution (strike divided by spot), ATM marked at 1.0
moneyness = df_options['K'] / S0
ax_mny.hist(moneyness, **hist_kw)
ax_mny.axvline(1.0, color='red', linestyle='--', linewidth=2, label='ATM')
ax_mny.set(
    xlabel='Moneyness (K/S0)',
    ylabel='Count',
    title='Moneyness Distribution',
)
ax_mny.legend()

plt.tight_layout()
plt.show()

## 5. Clean and Filter Data

In [None]:
# Clean option chain with filters
# NOTE(review): the moneyness comments below assume moneyness means K/S0, as
# in the plots of this notebook. Under that reading a call with K = 0.8*S0 is
# in the money and one with K = 1.2*S0 is out of the money (the earlier
# comments had the two labels swapped) — confirm against clean_option_chain.
print("Cleaning option chain...")
df_clean = clean_option_chain(
    df_options,
    min_time_to_maturity=0.02,   # ~7 days
    max_time_to_maturity=2.0,    # 2 years
    min_moneyness=0.8,           # 20% ITM for calls (K/S0 = 0.8)
    max_moneyness=1.2,           # 20% OTM for calls (K/S0 = 1.2)
    min_volume=0,                # No volume filter
    min_open_interest=0,         # No OI filter
    only_calls=True,             # Keep only calls
    remove_missing_prices=True,
    S0=S0
)

n_raw, n_clean = len(df_options), len(df_clean)
print(f"✓ Cleaned: {n_raw} → {n_clean} contracts")
# Guard the percentage: an empty raw frame would otherwise divide by zero.
if n_raw > 0:
    print(f"  Reduction: {(1 - n_clean/n_raw)*100:.1f}%")
else:
    print("  Reduction: n/a (no raw contracts)")

In [None]:
# Print cleaned data summary
# Same helper as for the raw frame, now applied to the filtered `df_clean`;
# S0 is again forwarded by keyword.
print_data_summary(df_clean, S0=S0)

## 6. Visualize Cleaned Data

In [None]:
# Visualize cleaned data on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. K vs T scatter (option chain structure), coloured by mid price
scatter = axes[0, 0].scatter(
    df_clean['T'],
    df_clean['K'],
    c=df_clean['mid'],
    cmap='viridis',
    s=20,
    alpha=0.6
)
axes[0, 0].axhline(S0, color='red', linestyle='--', linewidth=2, label=f'S0=${S0:.2f}')
axes[0, 0].set_xlabel('Time to Maturity (years)')
axes[0, 0].set_ylabel('Strike Price (K)')
axes[0, 0].set_title('Option Chain: Strike vs Time')
axes[0, 0].legend()
plt.colorbar(scatter, ax=axes[0, 0], label='Mid Price')

# 2. Mid price vs strike (for different maturities)
# Select a few representative maturities: every (n // 5)-th distinct value,
# capped at five curves.
maturities = sorted(df_clean['T'].unique())
for T in maturities[::max(1, len(maturities)//5)][:5]:  # Select up to 5 maturities
    # Sort by strike so the connecting 'o-' line runs left to right instead of
    # following whatever row order the cleaned frame happens to have.
    df_T = df_clean[df_clean['T'] == T].sort_values('K')
    axes[0, 1].plot(df_T['K'], df_T['mid'], 'o-', label=f'T={T:.3f}yr', alpha=0.7)

axes[0, 1].axvline(S0, color='red', linestyle='--', linewidth=1, alpha=0.5)
axes[0, 1].set_xlabel('Strike Price (K)')
axes[0, 1].set_ylabel('Mid Price')
axes[0, 1].set_title('Price vs Strike (Selected Maturities)')
axes[0, 1].legend(fontsize=8)
axes[0, 1].grid(True, alpha=0.3)

# 3. Moneyness distribution (cleaned): strike divided by spot, ATM at 1.0
moneyness_clean = df_clean['K'] / S0
axes[1, 0].hist(moneyness_clean, bins=30, edgecolor='black', alpha=0.7)
axes[1, 0].axvline(1.0, color='red', linestyle='--', linewidth=2, label='ATM')
axes[1, 0].set_xlabel('Moneyness (K/S0)')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Moneyness Distribution (Cleaned)')
axes[1, 0].legend()

# 4. Contracts per maturity
# Bars are positioned by rank of the sorted maturity, not by the maturity
# value itself, hence the 'Maturity Index' axis label.
maturity_counts = df_clean['T'].value_counts().sort_index()
axes[1, 1].bar(range(len(maturity_counts)), maturity_counts.values, alpha=0.7)
axes[1, 1].set_xlabel('Maturity Index')
axes[1, 1].set_ylabel('Number of Contracts')
axes[1, 1].set_title(f'Contracts per Maturity (Total: {len(maturity_counts)} maturities)')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 7. Explore Market Implied Volatility

In [None]:
# Check if implied volatility is available
# A missing 'impl_volatility' column is treated the same as a column with no
# usable values: both end in the "not available" message instead of a KeyError.
if 'impl_volatility' in df_clean.columns:
    df_iv = df_clean[df_clean['impl_volatility'].notna()].copy()
else:
    df_iv = df_clean.iloc[0:0].copy()

if len(df_iv) > 0:
    print(f"Implied volatility available for {len(df_iv)} contracts ({len(df_iv)/len(df_clean)*100:.1f}%)")

    # Plot volatility smile for a few maturities
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Select representative maturities: every (n // 4)-th distinct value,
    # capped at four curves.
    maturities_iv = sorted(df_iv['T'].unique())
    for T in maturities_iv[::max(1, len(maturities_iv)//4)][:4]:
        # Sort by strike so each smile is drawn left to right regardless of
        # the row order of the cleaned frame.
        df_T = df_iv[df_iv['T'] == T].sort_values('K')
        axes[0].plot(df_T['K'], df_T['impl_volatility'], 'o-', label=f'T={T:.3f}yr', alpha=0.7)

    axes[0].axvline(S0, color='red', linestyle='--', linewidth=1, alpha=0.5)
    axes[0].set_xlabel('Strike Price (K)')
    axes[0].set_ylabel('Implied Volatility')
    axes[0].set_title('Market Implied Volatility Smile')
    axes[0].legend(fontsize=8)
    axes[0].grid(True, alpha=0.3)

    # 3D-like scatter plot: maturity on x, strike on y, colour = implied vol
    scatter = axes[1].scatter(
        df_iv['T'],
        df_iv['K'],
        c=df_iv['impl_volatility'],
        cmap='viridis',
        s=20,
        alpha=0.6
    )
    axes[1].set_xlabel('Time to Maturity (years)')
    axes[1].set_ylabel('Strike Price (K)')
    axes[1].set_title('IV Surface (Scatter)')
    plt.colorbar(scatter, ax=axes[1], label='Implied Volatility')

    plt.tight_layout()
    plt.show()
else:
    print("No implied volatility data available in this dataset")

## 8. Zero Curve Visualization

In [None]:
# Plot zero curve
fig, ax = plt.subplots(figsize=(10, 5))
tenor_years = df_zero_curve['days'] / 365  # day counts expressed in years (365-day basis)
ax.plot(tenor_years, df_zero_curve['rate'], 'o-', linewidth=2, markersize=8)
ax.set_xlabel('Time to Maturity (years)')
ax.set_ylabel('Rate (%)')
ax.set_title(f'Risk-Free Rate Zero Curve ({DATE})')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Test rate interpolation at a handful of tenors (in years)
# NOTE(review): the axis label above treats `rate` as a percentage, while the
# printout below multiplies the interpolated value by 100 — confirm which
# units get_rate_interpolator returns.
r_func = get_rate_interpolator(df_zero_curve)
test_T = np.array([0.25, 0.5, 1.0, 1.5, 2.0])
test_r = r_func(test_T)
print("\nInterpolated rates:")
for T, r in zip(test_T, test_r):
    print(f"  T={T:.2f}yr: r={r*100:.3f}%")

## 9. Save Cleaned Data

In [None]:
# Add S0 to dataframe
# .assign builds a new frame rather than writing a column into the existing
# object, so re-running the cell is harmless and nothing is written into a
# frame that may be a slice of another one.
df_clean = df_clean.assign(S0=S0)

# Save to CSV
date_str = DATE.replace('-', '')
output_path = f'../data/processed/{SYMBOL}_{date_str}_clean.csv'

# to_csv does not create missing folders; make sure the target directory
# exists first (`os` is imported in the WRDS connection cell).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df_clean.to_csv(output_path, index=False)
print(f"✓ Saved cleaned data to: {output_path}")
print(f"  Contracts: {len(df_clean)}")
print(f"  Columns: {list(df_clean.columns)}")

## 10. Prepare Arrays for Calibration

In [None]:
# Extract arrays for calibration: strikes, maturities and market mid prices,
# each pulled from its column of the cleaned frame as a NumPy array
K_array, T_array, C_mkt_array = (
    df_clean[column].to_numpy() for column in ('K', 'T', 'mid')
)

print("Arrays prepared for calibration:")
print(f"  S0:      ${S0:.2f}")
for label, arr in (("K", K_array), ("T", T_array), ("C", C_mkt_array)):
    print(f"  {label} shape: {arr.shape}")

# Min/max of each array; maturities get four decimals, the rest two
print("\nData ranges:")
print(f"  K:  [{K_array.min():.2f}, {K_array.max():.2f}]")
print(f"  T:  [{T_array.min():.4f}, {T_array.max():.4f}] years")
print(f"  C:  [{C_mkt_array.min():.2f}, {C_mkt_array.max():.2f}]")

## 11. Close WRDS Connection

In [None]:
# Close WRDS connection
# Closes the `db` session opened in the "Connect to WRDS" section. Run this
# last: the fetch cells above all take `db` as their first argument.
db.close()
print("✓ WRDS connection closed")

## Summary

In this notebook, we:
1. ✅ Connected to WRDS OptionMetrics database
2. ✅ Fetched SPY option chain data for a specific date
3. ✅ Retrieved spot price and zero curve
4. ✅ Cleaned and filtered the data
5. ✅ Visualized data distributions and quality
6. ✅ Saved processed data for calibration

**Next steps:**
- Use the cleaned data in calibration module
- Implement implied volatility surface fitting
- Compare with Yahoo Finance data (for validation)