# Data Download and Preparation

This notebook downloads Indian stock market data and prepares it for volatility modeling.

## Key Features:
- NSE/BSE stock data download
- VIX data integration
- FII/DII flow data (planned; no cell in this notebook loads it yet)
- Options chain processing
- Data quality checks
- Initial volatility calculations

In [None]:
import sys
import os

# Add project root to path
# The root is taken to be the parent of the current working directory, so the
# `src.*` imports in the next cell only resolve when the notebook is started
# from a direct sub-directory of the project (presumably `notebooks/` — verify).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation warnings from pandas/matplotlib.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Echo the resolved paths so a wrong working directory is noticed immediately.
print("Libraries imported successfully")
print(f"Current working directory: {os.getcwd()}")
print(f"Project root: {project_root}")

In [None]:
# Import project modules
# These resolve through the project root that the previous cell appended to
# sys.path; run that cell first.
from src.config.settings import get_config
from src.data.nse_data import NSEDataSource
from src.data.vix_data import VIXDataSource
from src.data.options_data import OptionsDataSource
from src.utils.logging import get_logger
from src.features.volatility_estimators import VolatilityEstimators

# Initialize configuration and logger
config = get_config()
# NOTE(review): `logger` is created here but the cells in this notebook report
# progress with print() instead of using it.
logger = get_logger(__name__)

print("Project modules imported successfully")
# NOTE(review): the save cells further down build their own data/raw and
# data/processed paths under project_root rather than using these configured
# paths — confirm that the two agree.
print(f"Data directory: {config.data.raw_data_path}")
print(f"Processing directory: {config.data.processed_data_path}")

## 1. Download NSE Stock Data

Download historical data for major Indian stocks and indices.

In [None]:
# Data source used by the download cell below
nse_source = NSEDataSource()

# Tickers to fetch: the two indices first, then the individual stocks
index_symbols = [
    'NIFTY50.NS',    # Nifty 50 Index
    'NIFTYBANK.NS',  # Bank Nifty
]
equity_symbols = [
    'RELIANCE.NS',   # Reliance Industries
    'TCS.NS',        # TCS
    'INFY.NS',       # Infosys
    'HDFC.NS',       # HDFC
    'ICICIBANK.NS',  # ICICI Bank
    'SBIN.NS',       # State Bank of India
    'ITC.NS',        # ITC
    'LT.NS',         # Larsen & Toubro
]
symbols = index_symbols + equity_symbols

# Download window: three 365-day years ending at the moment this cell runs
end_date = datetime.now()
lookback = timedelta(days=3 * 365)
start_date = end_date - lookback

print(f"Downloading data from {start_date.date()} to {end_date.date()}")
print(f"Symbols: {symbols}")

In [None]:
# Fetch history for every requested ticker.
# stock_data maps ticker -> downloaded frame; failed_downloads collects every
# ticker that came back empty or raised.
stock_data = {}
failed_downloads = []

for ticker in symbols:
    try:
        print(f"Downloading {ticker}...")
        frame = nse_source.get_historical_data(
            symbol=ticker,
            start_date=start_date,
            end_date=end_date,
        )

        # An empty result counts as a failure; kept inside the try so that an
        # unexpected return value is reported by the handler below as well.
        if frame.empty:
            print(f"  ✗ No data found for {ticker}")
            failed_downloads.append(ticker)
            continue

        stock_data[ticker] = frame
        print(f"  ✓ Downloaded {len(frame)} records for {ticker}")

    except Exception as exc:
        # Best effort: one bad ticker must not abort the remaining downloads.
        print(f"  ✗ Failed to download {ticker}: {exc}")
        failed_downloads.append(ticker)

print(f"\nSuccessfully downloaded data for {len(stock_data)} symbols")
if failed_downloads:
    print(f"Failed downloads: {failed_downloads}")

In [None]:
# Preview the first successfully downloaded symbol
if stock_data:
    # Dicts preserve insertion order, so this is the first symbol downloaded.
    sample_symbol = next(iter(stock_data))
    sample_data = stock_data[sample_symbol]

    first_stamp = sample_data.index[0]
    last_stamp = sample_data.index[-1]

    print(f"Sample data for {sample_symbol}:")
    print(f"Shape: {sample_data.shape}")
    print(f"Date range: {first_stamp} to {last_stamp}")
    print(f"Columns: {list(sample_data.columns)}")

    # Head and tail, each under its own caption
    for caption, preview in (
        ("\nFirst 5 rows:", sample_data.head),
        ("\nLast 5 rows:", sample_data.tail),
    ):
        print(caption)
        display(preview())

## 2. Download VIX Data

Download India VIX (volatility index) data.

In [None]:
# Data source for the India VIX series
vix_source = VIXDataSource()

# vix_data ends up as a non-empty frame, or None when nothing usable was
# returned or the download raised.
try:
    print("Downloading India VIX data...")
    vix_data = vix_source.get_vix_data(
        start_date=start_date,
        end_date=end_date,
    )

    if vix_data.empty:
        print("✗ No VIX data found")
        vix_data = None
    else:
        vix_start, vix_end = vix_data.index[0], vix_data.index[-1]
        print(f"✓ Downloaded {len(vix_data)} VIX records")
        print(f"VIX date range: {vix_start} to {vix_end}")
        print(f"VIX columns: {list(vix_data.columns)}")
        display(vix_data.head())

except Exception as exc:
    # Best effort: later cells check for None and skip the VIX panel/file.
    print(f"✗ Failed to download VIX data: {exc}")
    vix_data = None

## 3. Process Options Chain Data

Process options chain data for put-call ratio and implied volatility analysis.

In [None]:
# Look for the options chain CSV in the project root
options_file = os.path.join(project_root, 'option-chain-ED-NIFTY-02-Sep-2025.csv')

if not os.path.exists(options_file):
    # Missing file is not an error; the processing cell skips on None.
    print(f"Options chain file not found at: {options_file}")
    options_raw = None

else:
    print(f"Found options chain file: {options_file}")

    # Read the raw chain and show its shape, columns and first rows
    options_raw = pd.read_csv(options_file)
    raw_shape = options_raw.shape
    raw_columns = list(options_raw.columns)
    print(f"Options data shape: {raw_shape}")
    print(f"Options columns: {raw_columns}")
    display(options_raw.head())

In [None]:
# Run the raw options chain through the project's options processor.
# options_processed stays None when there is no raw data or processing fails.
options_processed = None
have_raw_options = options_raw is not None

if have_raw_options:
    try:
        options_source = OptionsDataSource()

        print("Processing options chain data...")

        # Delegate the actual processing to the project data source
        options_processed = options_source.process_options_chain(options_raw)

        if options_processed is not None:
            processed_columns = list(options_processed.columns)
            print(f"✓ Processed options data shape: {options_processed.shape}")
            print(f"Processed columns: {processed_columns}")
            display(options_processed.head())

    except Exception as exc:
        # Best effort: report the failure and continue without options data.
        print(f"✗ Failed to process options data: {exc}")
        options_processed = None

## 4. Data Quality Checks

Perform comprehensive data quality checks.

In [None]:
# Per-symbol data quality report
print("=== DATA QUALITY SUMMARY ===")
print(f"Total symbols downloaded: {len(stock_data)}")

for ticker, frame in stock_data.items():
    print(f"\n{ticker}:")
    print(f"  Records: {len(frame)}")
    print(f"  Date range: {frame.index[0].date()} to {frame.index[-1].date()}")
    print(f"  Missing values: {frame.isnull().sum().sum()}")

    # Count zero-volume rows only when the frame carries a Volume column
    if 'Volume' in frame.columns:
        zero_volume = (frame['Volume'] == 0).sum()
    else:
        zero_volume = 'N/A'
    print(f"  Zero volumes: {zero_volume}")

    if 'Close' not in frame.columns:
        continue

    # Flag close-to-close moves larger than 15% in either direction
    daily_returns = frame['Close'].pct_change()
    big_moves = daily_returns.abs() > 0.15  # >15% daily moves
    big_move_count = big_moves.sum()
    print(f"  Large moves (>15%): {big_move_count}")

    if big_move_count > 0:
        # Show at most the first five offending dates
        print(f"  Extreme move dates: {frame.index[big_moves].tolist()[:5]}")

In [None]:
# Visualize data availability
# Draws a 2x2 dashboard: normalized index performance, NIFTY50 volume,
# India VIX, and a symbol-by-date availability heatmap.
if stock_data:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Plot 1: Price movements for major indices, rebased to 100 at the
    # first observation so both series share one scale
    ax1 = axes[0, 0]
    for symbol in ['NIFTY50.NS', 'NIFTYBANK.NS']:
        if symbol in stock_data and 'Close' in stock_data[symbol].columns:
            close = stock_data[symbol]['Close']
            normalized = close / close.iloc[0] * 100
            ax1.plot(normalized.index, normalized.values, label=symbol, linewidth=1.5)

    ax1.set_title('Normalized Index Performance')
    ax1.set_ylabel('Normalized Price (Base=100)')
    # Only build a legend when at least one index was actually plotted
    if ax1.lines:
        ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Volume patterns
    ax2 = axes[0, 1]
    if 'NIFTY50.NS' in stock_data and 'Volume' in stock_data['NIFTY50.NS'].columns:
        volume_data = stock_data['NIFTY50.NS']['Volume']
        ax2.plot(volume_data.index, volume_data.values, alpha=0.7)
        ax2.set_title('Trading Volume (NIFTY50)')
        ax2.set_ylabel('Volume')
        ax2.grid(True, alpha=0.3)
    else:
        # Mirror the VIX panel's placeholder instead of leaving empty axes
        ax2.text(0.5, 0.5, 'Volume Data\nNot Available',
                 ha='center', va='center', transform=ax2.transAxes, fontsize=12)
        ax2.set_title('Trading Volume (Not Available)')

    # Plot 3: VIX if available
    ax3 = axes[1, 0]
    if vix_data is not None and not vix_data.empty:
        # NOTE(review): plots the first VIX column, whatever it is — confirm
        # that this is the intended level (e.g. the close).
        ax3.plot(vix_data.index, vix_data.iloc[:, 0], color='red', linewidth=2)
        ax3.set_title('India VIX')
        ax3.set_ylabel('VIX Level')
        ax3.grid(True, alpha=0.3)
    else:
        ax3.text(0.5, 0.5, 'VIX Data\nNot Available',
                 ha='center', va='center', transform=ax3.transAxes, fontsize=12)
        ax3.set_title('India VIX (Not Available)')

    # Plot 4: Data availability heatmap
    ax4 = axes[1, 1]

    # Daily calendar spanning the earliest to the latest observation
    all_dates = pd.date_range(
        start=min(frame.index.min() for frame in stock_data.values()),
        end=max(frame.index.max() for frame in stock_data.values()),
        freq='D',
    )

    # Build the 0/1 matrix as integers from the start. The previous version
    # filled an object-dtype frame and relied on fillna() downcasting it,
    # and raised KeyError for any timestamp not on the daily grid.
    availability = pd.DataFrame(
        {symbol: all_dates.isin(frame.index) for symbol, frame in stock_data.items()},
        index=all_dates,
    ).astype(int)

    # Sample for visualization (every 7th day). Because the calendar starts
    # on the first observed date, this inspects the same weekday each week.
    sample_availability = availability.iloc[::7]

    # Pin the colour scale to [0, 1] so "available" always maps to the same
    # colour, even when every sampled cell has the same value.
    ax4.imshow(sample_availability.T.to_numpy(), aspect='auto', cmap='RdYlGn',
               alpha=0.8, vmin=0, vmax=1, interpolation='nearest')
    ax4.set_title('Data Availability')
    ax4.set_ylabel('Symbols')
    ax4.set_yticks(range(len(stock_data)))
    ax4.set_yticklabels([s.replace('.NS', '') for s in stock_data], fontsize=8)

    # Label a handful of columns with their dates; imshow only knows positions
    n_cols = len(sample_availability)
    tick_positions = np.unique(np.linspace(0, n_cols - 1, num=min(6, n_cols), dtype=int))
    ax4.set_xticks(tick_positions)
    ax4.set_xticklabels(
        [sample_availability.index[pos].strftime('%Y-%m') for pos in tick_positions],
        fontsize=8,
    )

    plt.tight_layout()
    plt.show()

print("Data quality visualization completed.")

## 5. Calculate Initial Volatility Estimates

Calculate several volatility estimators for the downloaded NIFTY50 data.

In [None]:
# Estimator collection from the project's feature module
vol_estimator = VolatilityEstimators()

# Volatility is estimated for NIFTY50 only; volatility_df is None when that
# symbol was not downloaded.
if 'NIFTY50.NS' not in stock_data:
    print("⚠ NIFTY50 data not available for volatility calculation")
    volatility_df = None

else:
    nifty_data = stock_data['NIFTY50.NS']

    print("Calculating volatility estimates for NIFTY50...")

    # Close-to-close estimate on simple returns, 30-observation window
    returns = nifty_data['Close'].pct_change().dropna()
    simple_vol = vol_estimator.simple_volatility(returns, window=30)

    print(f"✓ Simple volatility calculated: {len(simple_vol)} observations")

    # The remaining estimators need the full set of OHLC columns
    required_cols = ['Open', 'High', 'Low', 'Close']
    has_ohlc = set(required_cols).issubset(nifty_data.columns)

    if not has_ohlc:
        print("⚠ OHLC data not complete, using simple volatility only")
        volatility_df = pd.DataFrame({'Simple': simple_vol})

    else:
        open_px, high_px, low_px, close_px = (nifty_data[col] for col in required_cols)

        # Parkinson uses only the high/low range
        parkinson_vol = vol_estimator.parkinson_estimator(high_px, low_px)

        # Garman-Klass, Rogers-Satchell and Yang-Zhang take all four prices
        gk_vol = vol_estimator.garman_klass_estimator(open_px, high_px, low_px, close_px)
        rs_vol = vol_estimator.rogers_satchell_estimator(open_px, high_px, low_px, close_px)
        yz_vol = vol_estimator.yang_zhang_estimator(open_px, high_px, low_px, close_px)

        print("✓ All volatility estimators calculated")

        # One column per estimator, in the order they were computed
        estimates = {'Simple': simple_vol}
        estimates['Parkinson'] = parkinson_vol
        estimates['Garman_Klass'] = gk_vol
        estimates['Rogers_Satchell'] = rs_vol
        estimates['Yang_Zhang'] = yz_vol
        volatility_df = pd.DataFrame(estimates)

        print(f"Volatility estimates shape: {volatility_df.shape}")
        display(volatility_df.head())

In [None]:
# Compare the volatility estimates visually.
# With several estimators a 2x2 dashboard is drawn; with a single one, a
# plain time-series plot. Nothing is drawn when volatility_df is None.
if volatility_df is not None:
    if len(volatility_df.columns) > 1:

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        ts_ax, corr_ax = axes[0, 0], axes[0, 1]
        hist_ax, table_ax = axes[1, 0], axes[1, 1]

        # Panel 1: every estimator over time, scaled to percent
        for name, series in volatility_df.items():
            ts_ax.plot(volatility_df.index, series * 100,
                       label=name, linewidth=1.5, alpha=0.8)

        ts_ax.set_title('Volatility Estimates Comparison (%)')
        ts_ax.set_ylabel('Annualized Volatility (%)')
        ts_ax.legend()
        ts_ax.grid(True, alpha=0.3)

        # Panel 2: pairwise correlations between estimators
        corr_matrix = volatility_df.corr()
        estimator_names = corr_matrix.columns
        tick_positions = range(len(estimator_names))

        im = corr_ax.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
        corr_ax.set_title('Volatility Estimator Correlations')
        corr_ax.set_xticks(tick_positions)
        corr_ax.set_yticks(tick_positions)
        corr_ax.set_xticklabels(estimator_names, rotation=45)
        corr_ax.set_yticklabels(estimator_names)

        # Write each coefficient into its cell (x is the column, y the row)
        for (row, col), coefficient in np.ndenumerate(corr_matrix.to_numpy()):
            corr_ax.text(col, row, f'{coefficient:.2f}',
                         ha='center', va='center', fontsize=8)

        plt.colorbar(im, ax=corr_ax)

        # Panel 3: overlaid histograms of the estimates
        volatility_df.plot(kind='hist', bins=30, alpha=0.7, ax=hist_ax)
        hist_ax.set_title('Volatility Distribution')
        hist_ax.set_xlabel('Volatility')
        hist_ax.set_ylabel('Frequency')

        # Panel 4: describe() output rendered as a table on hidden axes
        summary_stats = volatility_df.describe()
        table_ax.axis('tight')
        table_ax.axis('off')

        table_data = summary_stats.round(4)
        table = table_ax.table(
            cellText=table_data.values,
            rowLabels=table_data.index,
            colLabels=table_data.columns,
            cellLoc='center',
            loc='center',
        )

        table.auto_set_font_size(False)
        table.set_fontsize(8)
        table.scale(1, 1.5)
        table_ax.set_title('Summary Statistics')

        plt.tight_layout()
        plt.show()

    else:
        # Only one estimate available: a single line plot is enough
        only_estimate = volatility_df.iloc[:, 0]
        plt.figure(figsize=(12, 6))
        plt.plot(volatility_df.index, only_estimate * 100, linewidth=2)
        plt.title('NIFTY50 Volatility (30-day rolling)')
        plt.ylabel('Annualized Volatility (%)')
        plt.grid(True, alpha=0.3)
        plt.show()

print("Volatility analysis completed.")

## 6. Save Processed Data

Save the raw downloads and the processed data for use in subsequent notebooks.

In [None]:
# Output locations under <project_root>/data, created on demand
data_root = os.path.join(project_root, 'data')
raw_data_dir = os.path.join(data_root, 'raw')
processed_data_dir = os.path.join(data_root, 'processed')

for directory in (raw_data_dir, processed_data_dir):
    # exist_ok makes re-running this cell harmless
    os.makedirs(directory, exist_ok=True)

print("Data directories:")
print(f"  Raw: {raw_data_dir}")
print(f"  Processed: {processed_data_dir}")

In [None]:
# Write every available dataset to CSV and remember each path written.
# Raw downloads go to raw_data_dir, derived data to processed_data_dir.
saved_files = []

# One CSV per symbol, named after the ticker without its ".NS" suffix
for ticker, frame in stock_data.items():
    base_name = ticker.replace('.NS', '')
    filename = f"{base_name}_stock_data.csv"
    filepath = os.path.join(raw_data_dir, filename)

    frame.to_csv(filepath)
    saved_files.append(filepath)
    print(f"✓ Saved {ticker} data to {filename}")

# India VIX, only when the download produced rows
have_vix = vix_data is not None and not vix_data.empty
if have_vix:
    vix_filepath = os.path.join(raw_data_dir, 'india_vix_data.csv')
    vix_data.to_csv(vix_filepath)
    saved_files.append(vix_filepath)
    print("✓ Saved VIX data to india_vix_data.csv")

# Processed options chain, when processing succeeded
if options_processed is not None:
    options_filepath = os.path.join(processed_data_dir, 'options_processed.csv')
    options_processed.to_csv(options_filepath)
    saved_files.append(options_filepath)
    print("✓ Saved processed options data to options_processed.csv")

# NIFTY50 volatility estimates, when they were computed
if volatility_df is not None:
    vol_filepath = os.path.join(processed_data_dir, 'nifty50_volatility_estimates.csv')
    volatility_df.to_csv(vol_filepath)
    saved_files.append(vol_filepath)
    print("✓ Saved volatility estimates to nifty50_volatility_estimates.csv")

print(f"\nTotal files saved: {len(saved_files)}")

In [None]:
# Create data summary report
# Collects what was downloaded, what failed, which optional datasets exist and
# per-symbol quality figures, then writes it all to a JSON file in the
# processed-data directory.
import json

summary_report = {
    'download_timestamp': datetime.now().isoformat(),
    'data_period': {
        'start_date': start_date.isoformat(),
        'end_date': end_date.isoformat()
    },
    'symbols_downloaded': list(stock_data.keys()),
    'failed_downloads': failed_downloads,
    'vix_available': vix_data is not None and not vix_data.empty,
    'options_available': options_processed is not None,
    'volatility_estimates_available': volatility_df is not None,
    'data_files': saved_files,
    'data_quality': {
        symbol: {
            'records': len(data),
            # isnull().sum().sum() yields a NumPy integer, which the json
            # module cannot serialize; convert to a built-in int.
            'missing_values': int(data.isnull().sum().sum()),
            'date_range': [data.index[0].isoformat(), data.index[-1].isoformat()]
        }
        for symbol, data in stock_data.items()
    }
}

# Save summary report
# Serialize before opening the file so a serialization error cannot leave a
# truncated JSON file behind.
summary_json = json.dumps(summary_report, indent=2)
summary_filepath = os.path.join(processed_data_dir, 'data_download_summary.json')
with open(summary_filepath, 'w', encoding='utf-8') as f:
    f.write(summary_json)

print(f"✓ Saved data summary report to data_download_summary.json")
print("\n=== DATA DOWNLOAD COMPLETED ===")
print(f"Successfully downloaded and processed data for {len(stock_data)} symbols")
# Stock and VIX CSVs were written to the raw directory, so report both
# locations rather than only the processed one.
print(f"Raw data saved to: {raw_data_dir}")
print(f"Processed data saved to: {processed_data_dir}")