In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import polars as pl
# Auto-reload modules when they change (helpful during development)
import importlib
import datapreprocessing
importlib.reload(datapreprocessing)
from datapreprocessing import DataPreprocessing


In [None]:
# Input and output locations for the preprocessing run
raw_data_path = '/Users/mpecaut/Desktop/data_parquet'
clean_data_path = '/Users/mpecaut/Fin_Big_Data/data_clean'

# Optional date window. Per the original note, passing None lets the
# preprocessor fall back to the first 3 months of each asset.
start_date = '2008-09-01'
end_date = '2008-12-31'

# Build the preprocessor from the settings above
preprocessing = DataPreprocessing(
    folder_path=raw_data_path,
    output_path=clean_data_path,
    start_date=start_date,
    end_date=end_date,
)

# Clean every asset (expected to write {ticker}_clean.parquet files; the
# date window keeps this step fast), then read the cleaned frames back.
preprocessing.process_all_assets()
data_dict = preprocessing.load_cleaned_data()

# Quick sanity check on what was loaded, using the first ticker as a sample
example_ticker = list(data_dict.keys())[0]
print(f"\nLoaded {len(data_dict)} cleaned assets")
print(f"Example ticker: {example_ticker}")
print(f"\nExample data:")
print(data_dict[example_ticker].head())

Data already exists in '/Users/mpecaut/Fin_Big_Data/data_volbars'
Lezzzggooooo, found 84 processed files
Loading 84 processed assets...
  Loaded 10/84 assets...
  Loaded 20/84 assets...
  Loaded 30/84 assets...
  Loaded 40/84 assets...
  Loaded 50/84 assets...
  Loaded 60/84 assets...
  Loaded 70/84 assets...
  Loaded 80/84 assets...
  Loaded 84/84 assets...

✓ Successfully loaded 84 assets
Tickers: ['ALL.N', 'DOW.N', 'NKE.N', 'F.N', 'MON.N', 'BA.N', 'KO.N', 'JPM.N', 'C.N', 'CL.N']...

INDUSTRY MAPPING RESULTS:
communication_services :   4 stocks
consumer_cyclical    :   6 stocks
consumer_defensive   :  11 stocks
energy               :  11 stocks
financials           :  14 stocks
healthcare           :   9 stocks
industrials          :  14 stocks
materials            :   5 stocks
technology           :   6 stocks
utilities            :   4 stocks

Total stocks mapped: 84
Unknown stocks: 0

DATA CLEANING RESULTS:
Original data: 84 tickers (with suffix)
Cleaned data_clean: 84 tickers (wi

In [3]:
# Load the panel from parquet into a polars DataFrame (presumably 1-minute
# bars, going by the file name — TODO confirm).
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of
# the kernel session. Later cells read this variable, so renaming it has to be
# done across the whole notebook at once.
all = pl.read_parquet('panel_data_1min.parquet')

In [4]:
# Show the panel as the cell's last expression so the notebook renders it.
all

timestamp,ticker,ask-price,ask-volume,bid-price,bid-volume,spread,mid-price,volume_imbalance,industry
datetime[μs],str,f64,f64,f64,f64,f64,f64,f64,str
2008-09-02 13:30:00,"""ABT""",58.537504,605.0,58.309401,484.0,0.228103,58.423452,-0.111111,"""healthcare"""
2008-09-02 13:30:00,"""ALL""",45.75773,163.0,45.637612,67.0,0.120118,45.697671,-0.417391,"""financials"""
2008-09-02 13:30:00,"""BAC""",33.0075,13393.0,32.955497,8656.0,0.052003,32.981498,-0.21484,"""financials"""
2008-09-02 13:30:00,"""BAX""",68.484787,328.0,68.272418,153.0,0.212368,68.378602,-0.363825,"""healthcare"""
2008-09-02 13:30:00,"""BK""",35.3655,280.0,35.302484,153.0,0.063016,35.333992,-0.293303,"""financials"""
…,…,…,…,…,…,…,…,…,…
2008-12-30 21:55:00,"""T""",28.24,1065.0,28.19,1.0,0.05,28.215,-0.998124,"""communication_services"""
2008-12-30 21:55:00,"""WAG""",23.96,53.0,23.94,106.0,0.02,23.95,0.333333,"""consumer_defensive"""
2008-12-30 21:55:00,"""WFC""",28.82,1.0,28.7,59.0,0.12,28.76,0.966667,"""financials"""
2008-12-30 21:55:00,"""WMB""",14.33,415.0,14.31,606.0,0.02,14.32,0.187071,"""energy"""


In [5]:
# Daily coverage check: when does each day start and end, and how many rows does it hold?

# Calendar date and time-of-day split out next to the raw timestamp
daily_times = all.select(
    pl.col('timestamp').dt.date().alias('date'),
    pl.col('timestamp').dt.time().alias('time'),
    'timestamp',
)

# Per calendar date: earliest timestamp, latest timestamp and row count
daily_range = (
    all.group_by(pl.col('timestamp').dt.date().alias('date'))
    .agg(
        pl.col('timestamp').min().alias('first_timestamp'),
        pl.col('timestamp').max().alias('last_timestamp'),
        pl.col('timestamp').count().alias('n_observations'),
    )
    .sort('date')
)

# Add readable time-of-day columns plus the span between first and last row
daily_summary = daily_range.with_columns(
    pl.col('first_timestamp').dt.time().alias('start_time'),
    pl.col('last_timestamp').dt.time().alias('end_time'),
    (pl.col('last_timestamp') - pl.col('first_timestamp')).alias('duration'),
)

print("Daily trading hours:")
print(daily_summary.select('date', 'start_time', 'end_time', 'duration', 'n_observations'))

# Headline numbers across all days
print(f"\nMost common start time: {daily_summary['start_time'].mode()[0]}")
print(f"Most common end time: {daily_summary['end_time'].mode()[0]}")
print(f"Average observations per day: {daily_summary['n_observations'].mean():.0f}")

Daily trading hours:
shape: (84, 5)
┌────────────┬────────────┬──────────┬──────────────┬────────────────┐
│ date       ┆ start_time ┆ end_time ┆ duration     ┆ n_observations │
│ ---        ┆ ---        ┆ ---      ┆ ---          ┆ ---            │
│ date       ┆ time       ┆ time     ┆ duration[μs] ┆ u32            │
╞════════════╪════════════╪══════════╪══════════════╪════════════════╡
│ 2008-09-02 ┆ 13:30:00   ┆ 20:49:00 ┆ 7h 19m       ┆ 32847          │
│ 2008-09-03 ┆ 13:30:00   ┆ 20:49:00 ┆ 7h 19m       ┆ 32901          │
│ 2008-09-04 ┆ 13:30:00   ┆ 20:49:00 ┆ 7h 19m       ┆ 32915          │
│ 2008-09-05 ┆ 13:30:00   ┆ 21:03:00 ┆ 7h 33m       ┆ 32912          │
│ 2008-09-08 ┆ 13:30:00   ┆ 20:49:00 ┆ 7h 19m       ┆ 32911          │
│ …          ┆ …          ┆ …        ┆ …            ┆ …              │
│ 2008-12-23 ┆ 14:30:00   ┆ 21:49:00 ┆ 7h 19m       ┆ 33023          │
│ 2008-12-24 ┆ 14:30:00   ┆ 19:39:00 ┆ 5h 9m        ┆ 17768          │
│ 2008-12-26 ┆ 14:30:00   ┆ 21:51:00 ┆ 7h

### Covariance matrix 

In [7]:
# Load a precomputed array from disk (presumably the denoised covariance
# matrix, going by the file name — TODO confirm).
# NOTE(review): nothing in this cell records which ticker each row/column
# corresponds to — confirm the ordering before using it.
cov = np.load('covariance_denoised.npy')

ticker
str
"""WY"""
"""MON"""
"""WMB"""
"""GS"""
"""NSC"""
…
"""MRK"""
"""MS"""
"""F"""
"""JNJ"""


## Example Factor Ideas:

### 1. **Cross-Sectional Factors** (relative to industry)
- Relative spread: How wide is this stock's spread compared with its industry average?
- Relative volume imbalance: Is buying pressure stronger than for its industry peers?
- Relative volatility: Is this stock more volatile than its industry?

### 2. **Industry Momentum Factors**
- Industry return: Average return across all stocks in industry
- Industry spread widening: Is the whole industry becoming less liquid?

### 3. **Lead-Lag Relationships**
- Does one stock's price movement predict others in the industry?
- Order flow spillover effects

**Next step:** Confirm that every ticker has an industry classification (the panel's `industry` column), and create one for any ticker that lacks it.

In [None]:
# Step 2: Align data to common time grid for cross-sectional analysis
def align_to_time_grid(data_dict, freq='5min'):
    """
    Resample every asset's bars onto a common time grid.

    This allows cross-sectional comparison at each time point. The input
    DataFrames are NOT modified: derived columns are computed on a copy.

    Args:
        data_dict: Dictionary of {ticker: DataFrame}. Each DataFrame must be
            indexed by a datetime-like index (required by ``resample``) and
            contain the columns 'bid-price_mean', 'ask-price_mean',
            'bid-volume_sum', 'ask-volume_sum' and 'bar_duration'.
        freq: Time frequency for alignment ('5min', '15min', '1h', etc.)

    Returns:
        Dictionary of {ticker: aligned DataFrame} with columns 'mid_price',
        'bid-price_mean', 'ask-price_mean', 'spread', 'bid-volume_sum',
        'ask-volume_sum', 'volume_imbalance' and 'bar_duration'. Grid buckets
        with a missing value in any column are dropped.
    """
    # How each column is collapsed inside one grid bucket: prices take the
    # last observation, volumes are summed, and the rest are averaged.
    aggregations = {
        'mid_price': 'last',
        'bid-price_mean': 'last',
        'ask-price_mean': 'last',
        'spread': 'mean',
        'bid-volume_sum': 'sum',
        'ask-volume_sum': 'sum',
        'volume_imbalance': 'mean',
        'bar_duration': 'mean',
    }

    aligned_data = {}

    for ticker, df in data_dict.items():
        bid_price = df['bid-price_mean']
        ask_price = df['ask-price_mean']
        bid_volume = df['bid-volume_sum']
        ask_volume = df['ask-volume_sum']

        # assign() returns a new frame, so the caller's DataFrame does not
        # silently gain columns as a side effect of calling this function.
        enriched = df.assign(
            mid_price=(bid_price + ask_price) / 2,
            spread=ask_price - bid_price,
            volume_imbalance=(bid_volume - ask_volume) / (bid_volume + ask_volume),
        )

        # Resample to the common grid and drop incomplete buckets.
        aligned_data[ticker] = enriched.resample(freq).agg(aggregations).dropna()

    return aligned_data

# Example: align every asset in data_dict to a shared 5-minute grid.
# data_dict comes from the preprocessing cell (load_cleaned_data); that cell
# must have been run in this kernel first.
aligned_data = align_to_time_grid(data_dict, freq='5min')
print(f"Aligned {len(aligned_data)} assets to common time grid")

In [None]:
# Invert the ticker -> industry mapping into industry -> [tickers].
# NOTE(review): industry_mapping is not defined in any cell visible here —
# confirm it is created earlier so this cell survives Restart & Run All.
from collections import defaultdict
industries = defaultdict(list)
for ticker, industry in industry_mapping.items():
    industries[industry].append(ticker)

# Report how many stocks fall into each industry.
print("Industries:")
for industry, tickers in industries.items():
    print(f"  {industry}: {len(tickers)} stocks")

Industries:
  financials: 14 stocks
  materials: 5 stocks
  consumer_cyclical: 6 stocks
  industrials: 14 stocks
  consumer_defensive: 11 stocks
  communication_services: 4 stocks
  healthcare: 9 stocks
  technology: 6 stocks
  energy: 11 stocks
  utilities: 4 stocks
