# Data Exploration

This notebook explores the price and fundamental data used in the multi-factor strategy.

## Objectives
1. Load and examine price data
2. Explore fundamental data availability
3. Check data quality and completeness
4. Visualize data distributions


In [None]:
import sys
from pathlib import Path

# Put the directory two levels above the current working directory at the
# front of sys.path so that the `src` package imported below can be resolved.
# NOTE(review): this relies on the kernel's working directory sitting exactly
# two levels below the project root — confirm against the repository layout.
project_root = Path().resolve().parent.parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_universe, download_price_data, download_fundamental_data
from src.utils.config import BACKTEST_CONFIG, DATA_CONFIG

# Plot styling: use the 'seaborn-v0_8' matplotlib style when this matplotlib
# installation lists it, otherwise fall back to matplotlib's 'default' style.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('default')

# Applied after the style so the "husl" palette is the active colour cycle.
sns.set_palette("husl")


## 1. Load Universe


In [None]:
# Load the ticker universe once; later cells sample tickers from it.
universe_df = load_universe()

# Report how many rows were loaded and preview the first ten of them.
print(f"Universe size: {len(universe_df)} tickers")
print("\nFirst few tickers:")
print(universe_df.head(10))


## 2. Load Price Data (Sample)


In [None]:
# Keep exploration fast: work with the first 20 tickers of the universe only.
sample_tickers = universe_df['ticker'].head(20).tolist()

# Request prices starting 15 months before the backtest start date so that
# momentum calculations have history available at the first backtest date.
data_start_date = pd.Timestamp(BACKTEST_CONFIG.start_date) - pd.DateOffset(months=15)
data_start_date_str = data_start_date.strftime('%Y-%m-%d')

price_data = download_price_data(
    sample_tickers, data_start_date_str, BACKTEST_CONFIG.end_date, cache=True
)

# Look up the 'date' index level once and reuse it for both range endpoints.
date_level = price_data.index.get_level_values('date')

print(f"Price data shape: {price_data.shape}")
print(f"\nDate range: {date_level.min()} to {date_level.max()}")
print(f"\nColumns: {price_data.columns.tolist()}")
print("\nSample data:")
print(price_data.head(10))


## 3. Price Data Statistics


In [None]:
# Compute returns per ticker.
# price_data is indexed by (date, ticker), so calling pct_change() on the
# stacked 'adj_close' column would compare adjacent rows that belong to
# different tickers. Grouping on the second index level (the ticker) keeps
# every return inside a single ticker's own price history; the first
# observation of each ticker is NaN.
# NOTE(review): assumes rows within each ticker are in chronological order —
# confirm against download_price_data.
returns = price_data['adj_close'].groupby(level=1).pct_change()

# Summary statistics (pooled across all sampled tickers)
print("Price Statistics:")
print(price_data['adj_close'].describe())

print("\nReturn Statistics:")
print(returns.describe())

# Missing data: number of null values in each column
print("\nMissing Data:")
print(price_data.isnull().sum())


## 4. Visualize Price Data


In [None]:
# Plot cumulative returns (growth of 1 unit invested) for sample stocks
fig, ax = plt.subplots(figsize=(14, 8))

adj_close = price_data['adj_close']

for ticker in sample_tickers[:5]:  # First 5 tickers
    try:
        # xs() selects one ticker from the second index level and drops that
        # level, so the result is indexed by date rather than by
        # (date, ticker) tuples. Unlike tuple slicing with .loc it also does
        # not require a lexsorted index.
        ticker_prices = adj_close.xs(ticker, level=1)
    except KeyError:
        # Ticker has no rows in price_data; skip it instead of aborting
        # the whole plot.
        continue

    if ticker_prices.empty:
        continue

    # Compound the simple returns; the first point is NaN because there is
    # no prior price to compare against.
    cumret = (1 + ticker_prices.pct_change()).cumprod()

    # Use the date level explicitly as the x-axis so matplotlib receives
    # datetimes, not index tuples.
    dates = cumret.index.get_level_values('date')
    ax.plot(dates, cumret.values, label=ticker, alpha=0.7)

ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Return')
ax.set_title('Sample Stock Cumulative Returns')
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
