In [1]:
# Imports and Configuration
# Standard-library and third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dotenv
import sys
import os
from datetime import datetime, timedelta
import warnings
import importlib

# Make the repository root importable so the `src` package resolves.
# '..' is resolved against the kernel's current working directory, so the
# project root is taken to be the parent of wherever the notebook is run from.
project_root = os.path.abspath('..')
# Guard the insert: re-running this cell must not keep prepending duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print(f"Project Root: {project_root}")
import src.data as data
# Reload so the module object reflects the current contents of src/data
# without needing a kernel restart.
importlib.reload(data)

# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
from src.cointegration import run_cointegration_analysis

# Candidate pairs keyed by a descriptive label; each value is [ticker_a, ticker_b].
PAIRS_TO_TEST = {
    'industrial_materials': ['XLI', 'XLB'],
    'finance_regional': ['XLF', 'KRE'],
    'tech_semis': ['XLK', 'SMH'],
    'energy_oil': ['XLE', 'USO'],
    'gold_miners': ['GLD', 'GDX'],
    'oil_producers': ['USO', 'XOP'],
    'coke_pepsi': ['KO', 'PEP'],
    'visa_mastercard': ['V', 'MA'],
    'home_depot_lowes': ['HD', 'LOW'],
    'tech_energy': ['XLK', 'XLE'],  # Expected to fail
}

# Passed as `lookback_days` to run_cointegration_analysis.
LOOKBACK_DAYS = 800  # ~3.2 years at 252 trading days per year (see print below)

print(f"Pairs to test: {len(PAIRS_TO_TEST)}")
print(f"Lookback: {LOOKBACK_DAYS} days (~{LOOKBACK_DAYS/252:.1f} years)")

Project Root: c:\code\quant_strategies\stat-arb
Pairs to test: 10
Lookback: 800 days (~3.2 years)


In [2]:
# Scoring function to rank pairs

def calculate_pair_score(results: dict) -> dict:
    """
    Score a pair from 0-100 based on tradability criteria.

    Five components are scored independently and summed:

    * ``half_life``      (max 30) - read from ``results['half_life']``
    * ``adf``            (max 25) - read from ``results['adf_pvalue']``
    * ``cointegration``  (max 20) - read from ``results['n_coint_trace']``
      and ``results['n_coint_eigen']``
    * ``hurst``          (max 15) - read from ``results['hurst']``
    * ``kpss``           (max 10) - read from ``results['kpss_stationary']``

    A metric whose key is missing *or* whose value is ``None`` falls back to
    a neutral default (inf, 1.0, 0, 0.5 respectively) instead of raising.
    Note that the Hurst default of 0.5 lands in the 5-point tier. NaN values
    fail every comparison and therefore score 0 for that component.

    Args:
        results: Analysis dict. Must contain a truthy ``'success'`` entry to
            be scored at all.

    Returns:
        dict with ``total_score`` (int), ``component_scores`` (dict keyed by
        the component names above; empty when the analysis failed) and
        ``grade`` (one of 'A', 'B', 'C', 'D', 'F').
    """

    # Handle failed analyses
    if not results.get('success', False):
        return {
            'total_score': 0,
            'component_scores': {},
            'grade': 'F'
        }

    def _metric(key, default):
        # dict.get only covers a missing key; a key present with value None
        # would otherwise reach the numeric comparisons and raise TypeError.
        value = results.get(key, default)
        return default if value is None else value

    scores = {}

    # 1. Half-life (30 points) - MOST IMPORTANT
    # Full marks inside [5, 30]; points taper off symmetrically on both sides.
    hl = _metric('half_life', float('inf'))
    if 5 <= hl <= 30:
        scores['half_life'] = 30
    elif 3 <= hl < 5 or 30 < hl <= 45:
        scores['half_life'] = 20
    elif 2 <= hl < 3 or 45 < hl <= 60:
        scores['half_life'] = 10
    else:
        scores['half_life'] = 0

    # 2. ADF test (25 points) - smaller p-value earns more points
    adf_p = _metric('adf_pvalue', 1.0)
    if adf_p < 0.01:
        scores['adf'] = 25
    elif adf_p < 0.05:
        scores['adf'] = 20
    elif adf_p < 0.10:
        scores['adf'] = 15
    elif adf_p < 0.20:
        scores['adf'] = 8
    else:
        scores['adf'] = 0

    # 3. Cointegration (20 points) - full marks only when both counts are positive
    trace_hit = _metric('n_coint_trace', 0) > 0
    eigen_hit = _metric('n_coint_eigen', 0) > 0
    if trace_hit and eigen_hit:
        scores['cointegration'] = 20
    elif trace_hit or eigen_hit:
        scores['cointegration'] = 12
    else:
        scores['cointegration'] = 0

    # 4. Hurst exponent (15 points) - lower values earn more points
    hurst = _metric('hurst', 0.5)
    if hurst < 0.4:
        scores['hurst'] = 15
    elif hurst < 0.5:
        scores['hurst'] = 10
    elif hurst < 0.55:
        scores['hurst'] = 5
    else:
        scores['hurst'] = 0

    # 5. KPSS test (10 points) - all-or-nothing on the boolean flag
    scores['kpss'] = 10 if results.get('kpss_stationary', False) else 0

    # Total and grade
    total = sum(scores.values())

    if total >= 85:
        grade = 'A'
    elif total >= 70:
        grade = 'B'
    elif total >= 55:
        grade = 'C'
    elif total >= 40:
        grade = 'D'
    else:
        grade = 'F'

    return {
        'total_score': total,
        'component_scores': scores,
        'grade': grade
    }

In [3]:
# Run cointegration analysis on all pairs

# Maps pair label -> analysis result dict, augmented below with
# 'score', 'grade' and 'component_scores'.
all_results = {}

for pair_name, tickers in PAIRS_TO_TEST.items():
    print(f"\n{'='*60}")
    print(f"Analyzing: {pair_name} ({tickers[0]} / {tickers[1]})")
    print('='*60)
    
    # Run analysis
    results = run_cointegration_analysis(
        tickers, 
        lookback_days=LOOKBACK_DAYS, 
        verbose=True,
        data_source= 'alpaca'
    )
    
    # Calculate score and attach it to the same dict that gets stored
    score_result = calculate_pair_score(results)
    results['score'] = score_result['total_score']
    results['grade'] = score_result['grade']
    results['component_scores'] = score_result['component_scores']
    
    all_results[pair_name] = results
    
    # Quick summary
    # Use .get so a result without a 'success' key is reported as failed,
    # matching how calculate_pair_score treats it, instead of raising KeyError.
    if results.get('success', False):
        print(f"→ Score: {results['score']}/100 ({results['grade']})")
    else:
        print(f"→ FAILED: {results.get('error', 'Unknown error')}")

print("\n" + "="*60)
print("Analysis complete!")
print("="*60)


Analyzing: industrial_materials (XLI / XLB)
Loading data for ['XLI', 'XLB']...
Loading data for cointegration analysis...
  Tickers: ['XLI', 'XLB']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days

✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['XLI', 'XLB']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 154.7 days, Cointegration: False
→ Score: 0/100 (F)

Analyzing: finance_regional (XLF / KRE)
Loading data for ['XLF', 'KRE']...
Loading data for cointegration analysis...
  Tickers: ['XLF', 'KRE']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['XLF', 'KRE']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 82.1 days, Cointegration: False
→ Score: 0/100 (F)

Analyzing: tech_semis (XLK / SMH)
Loading data for ['XLK', 'SMH']...
Loading data for cointegration analysis...
  Tickers: ['XLK', 'SMH']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['XLK', 'SMH']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 106.9 days, Cointegration: False
→ Score: 0/100 (F)

Analyzing: energy_oil (XLE / USO)
Loading data for ['XLE', 'USO']...
Loading data for cointegration analysis...
  Tickers: ['XLE', 'USO']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['XLE', 'USO']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 16.8 days, Cointegration: True
→ Score: 75/100 (B)

Analyzing: gold_miners (GLD / GDX)
Loading data for ['GLD', 'GDX']...
Loading data for cointegration analysis...
  Tickers: ['GLD', 'GDX']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['GLD', 'GDX']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 45.1 days, Cointegration: False
→ Score: 25/100 (F)

Analyzing: oil_producers (USO / XOP)
Loading data for ['USO', 'XOP']...
Loading data for cointegration analysis...
  Tickers: ['USO', 'XOP']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['USO', 'XOP']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 16.5 days, Cointegration: True
→ Score: 85/100 (A)

Analyzing: coke_pepsi (KO / PEP)
Loading data for ['KO', 'PEP']...
Loading data for cointegration analysis...
  Tickers: ['KO', 'PEP']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is greater than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['KO', 'PEP']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 25.9 days, Cointegration: False
→ Score: 60/100 (C)

Analyzing: visa_mastercard (V / MA)
Loading data for ['V', 'MA']...
Loading data for cointegration analysis...
  Tickers: ['V', 'MA']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is greater than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['V', 'MA']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 28.7 days, Cointegration: False
→ Score: 60/100 (C)

Analyzing: home_depot_lowes (HD / LOW)
Loading data for ['HD', 'LOW']...
Loading data for cointegration analysis...
  Tickers: ['HD', 'LOW']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is greater than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['HD', 'LOW']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 36.7 days, Cointegration: False
→ Score: 35/100 (F)

Analyzing: tech_energy (XLK / XLE)
Loading data for ['XLK', 'XLE']...
Loading data for cointegration analysis...
  Tickers: ['XLK', 'XLE']
  Date range: 2022-06-05 to 2025-12-06
  Target observations: ~800 days


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')



✓ Downloaded and log-transformed: 915 observations
✓ Data quality check: PASSED
  Clean tickers: 2

Final dataset:
  Shape: (915, 2)
  Date range: 2022-06-06 00:00:00 to 2025-12-05 00:00:00
  Tickers: ['XLK', 'XLE']
  DEBUG: Received DataFrame with 915 rows
Running Johansen test...
Running VAR decomposition...
Running stationarity tests...
Done. Half-life: 17.1 days, Cointegration: False
→ Score: 45/100 (D)

Analysis complete!


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(spread, regression='c', nlags='auto')


In [7]:
def _summary_row(pair_name, analysis):
    """Build one row of the screening table from a single pair's result dict."""
    # Failed analyses get a fixed placeholder row.
    if not analysis.get('success'):
        return {
            'Pair Name': pair_name,
            'Tickers': "N/A",
            'Score': 0,
            'Grade': 'FAIL',
            'Half-Life': 'N/A',
            'Cointegrated': False
        }

    ticker_pair = analysis['tickers']
    return {
        'Pair Name': pair_name,
        'Tickers': f"{ticker_pair[0]}/{ticker_pair[1]}",
        'Score': analysis['score'],
        'Grade': analysis['grade'],
        # Half-life is stored pre-formatted as a one-decimal string.
        'Half-Life': f"{analysis.get('half_life', 0):.1f}",
        'Cointegrated': analysis.get('cointegration_exists', False)
    }


# One row per analysed pair, in the order the pairs were analysed.
summary_list = []
for pair, res in all_results.items():
    summary_list.append(_summary_row(pair, res))

# Tabulate, then rank with the highest score first and a fresh 0..n-1 index.
df_summary = pd.DataFrame(summary_list)
df_summary = df_summary.sort_values(by='Score', ascending=False).reset_index(drop=True)

# Show the ranked table
print("Strategy Screening Results:")
display(df_summary)

Strategy Screening Results:


Unnamed: 0,Pair Name,Tickers,Score,Grade,Half-Life,Cointegrated
0,oil_producers,USO/XOP,85,A,16.5,True
1,energy_oil,XLE/USO,75,B,16.8,True
2,visa_mastercard,V/MA,60,C,28.7,False
3,coke_pepsi,KO/PEP,60,C,25.9,False
4,tech_energy,XLK/XLE,45,D,17.1,False
5,home_depot_lowes,HD/LOW,35,F,36.7,False
6,gold_miners,GLD/GDX,25,F,45.1,False
7,industrial_materials,XLI/XLB,0,F,154.7,False
8,finance_regional,XLF/KRE,0,F,82.1,False
9,tech_semis,XLK/SMH,0,F,106.9,False
