In [37]:
%pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [38]:
# Core data handling and analysis
import pandas as pd
import numpy as np

# Data fetching
import yfinance as yf

# Statistical testing
from statsmodels.tsa.stattools import coint

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Helper for looping through pairs
from itertools import combinations

# Apply seaborn's white-grid theme to every figure drawn later in the notebook.
# set_theme() is the preferred entry point; sns.set() is only an alias for it.
sns.set_theme(style='whitegrid')
print("Libraries imported successfully!")

Libraries imported successfully!


In [39]:
      
# Analysis universe: thirty Dow Jones Industrial Average constituents.
# The list is a fixed snapshot of the index membership; a small, stable
# universe keeps prototyping fast.
# NOTE(review): a fixed membership list applied to the whole history window may
# introduce survivorship bias — confirm this is acceptable for the prototype.
tickers = [
    'AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CSCO',
    'CVX', 'GS', 'HD', 'HON', 'IBM', 'INTC',
    'JNJ', 'JPM', 'MCD', 'MMM', 'MRK', 'MSFT',
    'NKE', 'PG', 'TRV', 'UNH', 'CRM', 'VZ',
    'V', 'WBA', 'WMT', 'DIS', 'DOW', 'KO',
]

# Historical window for the study, as ISO-8601 date strings.
start_date, end_date = '2018-01-01', '2023-12-31'

    

In [43]:
# Sanity-check the data source on a single ticker before pulling the full universe.
# auto_adjust is passed explicitly: the library default for this flag has changed
# between releases, so relying on it makes the returned prices (and a warning)
# depend on which yfinance version happens to be installed.
prices_df = yf.download(["AAPL"], start=start_date, end=end_date, auto_adjust=True)
prices_df

  prices_df = yf.download(["AAPL"], start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018-01-02,40.426823,40.436212,39.722768,39.933986,102223600
2018-01-03,40.419788,40.964259,40.356426,40.490195,118071600
2018-01-04,40.607536,40.710798,40.384586,40.492539,89738400
2018-01-05,41.069866,41.156698,40.612231,40.703758,94640000
2018-01-08,40.917313,41.213014,40.818742,40.917313,82271200
...,...,...,...,...,...
2023-12-22,192.192551,193.989390,191.567126,193.761051,37149600
2023-12-26,191.646561,192.480450,191.428159,192.202487,28919300
2023-12-27,191.745819,192.093281,189.700797,191.090629,48087700
2023-12-28,192.172714,193.244865,191.765691,192.728641,34049900


In [44]:
# Download adjusted close prices for the whole universe.
# auto_adjust=True is spelled out so that 'Close' is the adjusted series no
# matter which default the installed yfinance version uses.
# dropna(axis=1) then removes every ticker that has at least one missing value
# in the window, so the remaining columns all cover the same dates. It is
# chained (not inplace) so the cell never mutates a slice of the downloaded
# frame and can be re-run safely.
prices_df = (
    yf.download(tickers, start=start_date, end=end_date, auto_adjust=True)['Close']
    .dropna(axis=1)
)

print("Successfully downloaded and cleaned price data.")
print(f"Shape of DataFrame: {prices_df.shape}")
prices_df.head()

  prices_df = yf.download(tickers, start=start_date, end=end_date)['Close']
[*********************100%***********************]  30 of 30 completed


Successfully downloaded and cleaned price data.
Shape of DataFrame: (1509, 29)


Ticker,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,GS,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,40.426823,140.852753,88.993233,282.886414,132.230484,103.503059,30.781157,92.35302,106.943245,216.480911,...,42.677147,79.328545,57.970127,73.854904,111.846466,197.135818,108.74041,35.41217,53.94978,29.041031
2018-01-03,40.419788,143.510651,89.541901,283.801239,132.43251,104.375404,31.026703,93.026222,107.402374,214.465683,...,42.616417,79.697708,57.960979,73.765282,112.202393,199.203781,109.823006,34.684475,54.352867,29.294352
2018-01-04,40.607536,142.905899,91.031258,282.724396,134.251297,105.753334,31.114481,92.736664,107.354553,217.463135,...,43.307209,80.399162,57.924461,74.286728,112.202393,200.068451,110.231346,34.796955,51.538414,29.32086
2018-01-05,41.069866,143.757339,91.238869,294.322296,136.373199,107.160995,31.545408,92.584648,106.771065,216.353836,...,43.261662,81.395958,58.417511,74.335602,111.999008,203.88353,112.871254,34.717564,52.48856,29.494659
2018-01-08,40.917313,143.71756,90.399391,295.570801,139.800217,107.914398,31.872601,93.040718,105.240555,213.21257,...,43.011154,81.479027,58.93795,74.726692,111.397301,200.344818,113.327065,34.658024,53.15799,29.930614


In [45]:
def find_cointegrated_pairs(dataframe, significance=0.05):
    """
    Scan a table of prices for pairs of columns that test as cointegrated.

    Every unordered pair of columns is passed once to
    ``statsmodels.tsa.stattools.coint``; a pair is kept when the p-value
    returned by that test is strictly below ``significance``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One price series per column; column labels identify the stocks.
        Presumably free of missing values (the notebook drops incomplete
        columns before calling this) — how ``coint`` reacts to NaNs is not
        checked here.
    significance : float, default 0.05
        P-value cut-off. The default reproduces the original hard-coded
        threshold, so existing one-argument calls behave as before.

    Returns
    -------
    list of tuple
        ``(stock1, stock2, p_value)`` for each retained pair, in the order
        the pairs were generated (not sorted by p-value). Empty when the
        frame has fewer than two columns or no pair passes.

    Notes
    -----
    * No multiple-comparison correction is applied. With N columns the test
      runs N * (N - 1) / 2 times, so some pairs are expected to fall below
      the threshold by chance alone; treat the result as a candidate list.
    * Each pair is tested in one direction only (earlier column as the first
      argument to ``coint``).
    """
    cointegrated_pairs = []

    # combinations() yields each unordered pair of column labels exactly once.
    for stock1, stock2 in combinations(dataframe.columns, 2):
        # coint returns (test statistic, p-value, critical values); only the
        # p-value is needed for the selection.
        _statistic, p_value, _critical_values = coint(
            dataframe[stock1], dataframe[stock2]
        )

        # A NaN p-value compares False here and is therefore skipped.
        if p_value < significance:
            cointegrated_pairs.append((stock1, stock2, p_value))

    return cointegrated_pairs

# Scan the cleaned price table and rank the hits so the most statistically
# significant pairs (smallest p-value) come first.
found_pairs = sorted(find_cointegrated_pairs(prices_df), key=lambda x: x[2])

# Report the number of hits, then one line per pair.
print(f"Found {len(found_pairs)} cointegrated pairs with p-value < 0.05:")
for pair in found_pairs:
    print(f"Pair: ({pair[0]}, {pair[1]}), P-value: {pair[2]:.4f}")

Found 13 cointegrated pairs with p-value < 0.05:
Pair: (V, WMT), P-value: 0.0001
Pair: (MMM, MRK), P-value: 0.0044
Pair: (PG, V), P-value: 0.0138
Pair: (CSCO, JPM), P-value: 0.0270
Pair: (AXP, TRV), P-value: 0.0304
Pair: (PG, WMT), P-value: 0.0307
Pair: (BA, JNJ), P-value: 0.0390
Pair: (MSFT, WMT), P-value: 0.0398
Pair: (HD, MSFT), P-value: 0.0401
Pair: (AMGN, V), P-value: 0.0414
Pair: (AAPL, WMT), P-value: 0.0424
Pair: (MSFT, V), P-value: 0.0456
Pair: (HD, WMT), P-value: 0.0497
