# 📊 Data Exploration - Quant Lab

Notebook untuk eksplorasi data dan validasi pipeline.

**FASE 0 - Foundation**

In [None]:
import sys
sys.path.insert(0, '../..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Quant Lab imports
from core.data_engine import DataDownloader, DataCleaner, DataValidator, DataStorage
from core.data_engine.base import DataConfig, DataSource, AssetClass, Resolution
from core.validation_engine import SharpeCalculator, PSRCalculator, DSRCalculator

# Apply the 'seaborn-v0_8-darkgrid' Matplotlib style sheet to every figure
# created after this point.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (not plain Python): render figures inline in the notebook
# output instead of opening a separate window.
%matplotlib inline

## 1. Download Data

Download data dari Yahoo Finance (backup source).

In [None]:
# Describe the request: daily equity bars for three symbols from Yahoo,
# starting on 2015-01-01.
config = DataConfig(
    symbols=['SPY', 'QQQ', 'TLT'],
    asset_class=AssetClass.EQUITY,
    source=DataSource.YAHOO,
    resolution=Resolution.DAILY,
    start_date='2015-01-01',
)

# Build the downloader from that configuration and run it.
downloader = DataDownloader(config)
result = downloader.execute()

# Report how many rows came back; the trailing expression makes the notebook
# display the first rows of the downloaded frame.
print(f"Downloaded {len(result.data)} rows")
result.data.head()

## 2. Clean Data

In [None]:
# Run the cleaner, built from the same configuration as the download, over
# the raw downloaded frame.
cleaner = DataCleaner(config)
clean_result = cleaner.clean(result.data)

# Summarise the cleaning step: remaining rows and reported outlier removals.
print(
    f"Cleaned: {len(clean_result.data)} rows",
    f"Outliers removed: {clean_result.outliers_removed}",
    sep="\n",
)

# Trailing expression: the notebook displays the first cleaned rows.
clean_result.data.head()

## 3. Validate Data

In [None]:
# Check the cleaned frame with the validator built from the shared config.
validator = DataValidator(config)
validation = validator.validate_data(clean_result.data)

# Print the validation summary, a blank line, then the reported issues and
# warnings (one item per output line).
print(
    validation,
    f"\nIssues: {validation.issues}",
    f"Warnings: {validation.warnings}",
    sep="\n",
)

## 4. Calculate Returns & Metrics

In [None]:
# Isolate the SPY rows as an independent copy, ordered by timestamp, so the
# period-over-period change below is computed between consecutive rows.
is_spy = clean_result.data['symbol'] == 'SPY'
spy_data = (
    clean_result.data[is_spy]
    .copy()
    .sort_values('timestamp')
)

# Simple returns from the close column; rows without a return are dropped
# from the series handed to the metric calculators.
spy_data['returns'] = spy_data['close'].pct_change()
returns = spy_data['returns'].dropna()

# Sharpe ratio, with 252 periods per year.
sharpe_calc = SharpeCalculator(periods_per_year=252)
sharpe_result = sharpe_calc.calculate(returns)
print(sharpe_result)

# PSR against a benchmark Sharpe ratio of 0.0.
psr_calc = PSRCalculator(benchmark_sr=0.0)
psr_result = psr_calc.calculate(returns)
print(f"\n{psr_result}")

## 5. Visualize

In [None]:
# Four-panel overview of SPY: price, return histogram, compounded growth,
# and rolling Sharpe ratio.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Timestamps of exactly those rows that carry a return. Selecting with the
# non-null mask mirrors the dropna() that produced `returns`, so the x-axis
# always has the same length as the series plotted against it. Slicing with
# .iloc[1:] is only correct when the first row is the sole NaN return.
return_dates = spy_data.loc[spy_data['returns'].notna(), 'timestamp']

# Price
ax1 = axes[0, 0]
ax1.plot(spy_data['timestamp'], spy_data['close'])
ax1.set_title('SPY Price')
ax1.set_xlabel('Date')
ax1.set_ylabel('Price')

# Returns distribution, with the sample mean marked by a dashed line
ax2 = axes[0, 1]
ax2.hist(returns, bins=50, edgecolor='black', alpha=0.7)
ax2.axvline(returns.mean(), color='red', linestyle='--', label=f'Mean: {returns.mean():.4f}')
ax2.set_title('Returns Distribution')
ax2.legend()

# Cumulative returns: running product of (1 + return), i.e. the growth
# factor of one unit held from the first return onward
ax3 = axes[1, 0]
cum_returns = (1 + returns).cumprod()
ax3.plot(return_dates, cum_returns)
ax3.set_title('Cumulative Returns')

# Rolling Sharpe over a 252-observation window. The calculator receives a
# positionally indexed copy of the returns.
# NOTE(review): assumes calculate_rolling yields one value per input return
# so the result lines up with return_dates -- confirm.
ax4 = axes[1, 1]
rolling_sharpe = sharpe_calc.calculate_rolling(pd.Series(returns.values), window=252)
ax4.plot(return_dates, rolling_sharpe)
ax4.axhline(0, color='red', linestyle='--')
ax4.set_title('Rolling 1-Year Sharpe Ratio')

plt.tight_layout()
plt.show()

## 6. Save Processed Data

In [None]:
# Persist the cleaned frame through the storage layer rooted at the
# processed-data directory, and report the path that save() returned.
storage = DataStorage(base_path='../../data/processed')
path = storage.save(
    clean_result.data,
    name='equity_daily',
    source='yahoo',
)
print(f"Saved to: {path}")

# Enumerate what the storage layer reports, one line per dataset.
datasets = storage.list_datasets()
for ds in datasets:
    print(f"- {ds.name}: {ds.rows} rows, {ds.file_size_mb:.2f} MB")