# Multivariate Regression #

In [2]:
# Import Libraries

# Data Management
import pandas as pd
import numpy as np

# Visuals
import matplotlib.pyplot as plt

# Statistics
import statsmodels.api as sm 
from scipy.stats import t

# Handle Files
import sys
import os

# Import Local Functions
sys.path.append(os.path.abspath("../source"))
from config import get_tickers
from data_downloader import get_market_data

In [4]:
# Load the ticker list from the project config helper.
# NOTE(review): the meaning of mod="3.4" is defined in config.get_tickers — confirm there.
tickers = get_tickers(mod="3.4")

tickers

['AAPL', 'MSFT', 'WMT']

In [12]:
# Import data
# Gather one return Series per ticker and concatenate a single time at the end.
# Concatenating once (rather than growing a frame inside the loop) avoids
# re-copying the accumulated data on every iteration and keeps the date index
# exactly as delivered by the inputs.
ticker_returns = []

for ticker in tickers:
    df = get_market_data(
        ticker=ticker,
        start_date='2010-01-01',
        end_date='2025-01-01',
        returns=True
    )

    # Name the Series after its ticker so it becomes that ticker's column
    returns = df['returns'].rename(ticker)
    ticker_returns.append(returns)

    print(f'Data Ready for {ticker}')

# axis=1 places the Series side by side, joined on their index.
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
data_regression = (
    pd.concat(ticker_returns, axis=1) if ticker_returns else pd.DataFrame()
)

Data Ready for AAPL
Data Ready for MSFT
Data Ready for WMT


In [13]:
# Preview the combined returns table (one column per ticker)
data_regression

Unnamed: 0,AAPL,MSFT,WMT
2010-01-05 00:00:00,0.001727,0.000323,-0.010008
2010-01-06 00:00:00,-0.016034,-0.006156,-0.002238
2010-01-07 00:00:00,-0.001850,-0.010454,0.000560
2010-01-08 00:00:00,0.006626,0.006873,-0.005050
2010-01-11 00:00:00,-0.008860,-0.012802,0.016367
...,...,...,...
2024-12-24 00:00:00,0.011413,0.009330,0.025462
2024-12-26 00:00:00,0.003171,-0.002781,0.001186
2024-12-27 00:00:00,-0.013331,-0.017453,-0.012253
2024-12-30 00:00:00,-0.013352,-0.013328,-0.011963


In [14]:
# X Variables
# Read the benchmark returns and key the rows by the "date" column
benchmark = pd.read_csv("../additional_data/sp500.csv").set_index("date")

# The dates arrive as text from the CSV; convert them to timestamps
benchmark.index = pd.to_datetime(benchmark.index)

benchmark

Unnamed: 0_level_0,sp_500
date,Unnamed: 1_level_1
2010-01-05,0.003111
2010-01-06,0.000545
2010-01-07,0.003993
2010-01-08,0.002878
2010-01-11,0.001745
...,...
2024-12-24,0.010982
2024-12-26,-0.000406
2024-12-27,-0.011117
2024-12-30,-0.010760


In [16]:
# Y Matrix
# One column of returns per ticker; this is an alias of data_regression, not a copy
Y_Matrix = data_regression

# X Matrix
# Benchmark returns plus the intercept column added by statsmodels
# (it shows up as 'const' in the OLS summary further down)
Information_Matrix = sm.add_constant(benchmark['sp_500'])

In [159]:
def LinearRegressionCoefficients(
    y_matrix: pd.DataFrame, 
    information_matrix: pd.DataFrame
):
    """Fit one OLS regression per column of ``y_matrix`` on a shared design matrix.

    The coefficients solve the normal equations ``(X'X) B = X'Y`` for all
    response columns at once.

    Parameters
    ----------
    y_matrix : pd.DataFrame
        Responses, one column per asset. A single ``pd.Series`` is also
        accepted and treated as a one-column frame.
    information_matrix : pd.DataFrame
        Design matrix with at least two columns. The coefficient of the first
        column is returned as ``alpha`` and that of the second as ``beta``;
        coefficients of any further columns are estimated but not returned.

    Returns
    -------
    tuple[pd.Series, pd.Series, pd.Series]
        ``(alphas, betas, sigmas)``, each indexed by the columns of
        ``y_matrix``. ``sigmas`` is the sample standard deviation (``ddof=1``)
        of the residuals; it is not adjusted for the number of regressors.

    Raises
    ------
    ValueError
        If the two inputs have a different number of rows, if their row labels
        do not match, or if ``information_matrix`` has fewer than two columns.
    numpy.linalg.LinAlgError
        If ``X'X`` is singular.

    Notes
    -----
    Missing values are not removed; NaNs in either input propagate into the
    results.
    """
    # A single asset may be passed as a Series; promote it to a one-column frame
    if isinstance(y_matrix, pd.Series):
        y_matrix = y_matrix.to_frame()

    # Check if both arrays have the same rows
    if information_matrix.shape[0] != y_matrix.shape[0]:
        raise ValueError("The rows are not coincident.")

    # alpha and beta are read from the first two coefficient rows
    if information_matrix.shape[1] < 2:
        raise ValueError(
            "The information matrix needs at least two columns (constant and regressor)."
        )

    # Set the components
    X = information_matrix
    Y = y_matrix

    # Calculate the interaction arrays.
    # DataFrame.dot matches rows by label and raises ValueError when the
    # labels of X and Y do not coincide.
    X_T = X.T
    XTX = X_T.dot(X).to_numpy()
    XTY = X_T.dot(Y).to_numpy()

    # Coefficients: solve the normal equations directly instead of forming an
    # explicit inverse (same LinAlgError on a singular matrix, better accuracy)
    coef = np.linalg.solve(XTX, XTY)

    # Fitted values and residuals.
    # Keep the labels on both sides so the subtraction is aligned by row label,
    # consistent with the label-aligned products above.
    coef_frame = pd.DataFrame(coef, index=X.columns, columns=Y.columns)
    fitted = X.dot(coef_frame)
    residuals = Y - fitted

    # Sigmas
    stds = residuals.std(axis=0, ddof=1).to_numpy()

    # Output Series
    alphas = pd.Series(coef[0], index=Y.columns, name='alpha')
    betas = pd.Series(coef[1], index=Y.columns, name='beta')
    sigmas = pd.Series(stds, index=Y.columns, name='sigma')

    return alphas, betas, sigmas


In [160]:
# Calculate the betas
# Full-sample estimates for every asset against the benchmark
alphas, betas, sigmas = LinearRegressionCoefficients(
    y_matrix=Y_Matrix,
    information_matrix=Information_Matrix,
)

# One row per asset, one column per estimate
coefficients = pd.DataFrame(dict(alpha=alphas, beta=betas, sigma=sigmas))

coefficients

Unnamed: 0,alpha,beta,sigma
AAPL,0.000488,1.104945,0.012789
MSFT,0.000278,1.120878,0.010518
WMT,0.000297,0.492494,0.01083


In [161]:
# Cross-check with statsmodels for a single asset (WMT): the fitted constant and
# slope should match the closed-form alpha/beta computed above.
# NOTE(review): the original note called this method "computationally more
# efficient"; that is not demonstrated in this notebook — confirm before relying on it.
model = sm.OLS(Y_Matrix['WMT'], Information_Matrix)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    WMT   R-squared:                       0.197
Model:                            OLS   Adj. R-squared:                  0.197
Method:                 Least Squares   F-statistic:                     924.6
Date:                Thu, 10 Jul 2025   Prob (F-statistic):          8.11e-182
Time:                        18:53:42   Log-Likelihood:                 11721.
No. Observations:                3773   AIC:                        -2.344e+04
Df Residuals:                    3771   BIC:                        -2.343e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0003      0.000      1.684      0.0

In [172]:
# Now how can we make this rolling
def RollingRegressionCoefficients(
    y_matrix: pd.DataFrame, 
    x_matrix: pd.DataFrame,
    window: int = 252
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Estimate alpha, beta and residual sigma over a rolling window.

    For every position ``i`` from ``window`` to ``len(y_matrix) - 1`` the
    regression is fitted on rows ``i - window`` to ``i - 1`` and the result is
    stamped with ``y_matrix.index[i]``. Each estimate therefore only uses rows
    strictly before its own label, and the window ending on the final row is
    not evaluated.

    Parameters
    ----------
    y_matrix : pd.DataFrame
        Responses, one column per asset.
    x_matrix : pd.DataFrame
        Design matrix passed to ``LinearRegressionCoefficients`` for each
        window (constant column first, regressor second). Rows are matched to
        ``y_matrix`` by index label; labels of ``y_matrix`` that are absent
        from ``x_matrix`` become NaN rows.
    window : int, default 252
        Number of rows in each estimation window.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
        ``(alphas_df, betas_df, sigmas_df)`` with one row per evaluated window
        and one column per asset.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.

    Warns
    -----
    RuntimeWarning
        If one or more windows were skipped because their ``X'X`` matrix was
        singular; those dates are missing from the output.
    """
    import warnings

    if window < 1:
        raise ValueError("window must be a positive integer.")

    # Align the regressors to the response rows by label once, up front, so the
    # positional windows below always pair matching rows
    x_aligned = pd.DataFrame(x_matrix, index=y_matrix.index)

    # Lists to store results
    alphas_list = []
    betas_list = []
    sigmas_list = []
    dates = []
    skipped = 0

    # Rolling window
    for i in range(window, len(y_matrix)):
        X_window = x_aligned.iloc[i - window:i]  # (window, n_regressors)
        Y_window = y_matrix.iloc[i - window:i]   # (window, n_assets)

        # Calculate the coefficients
        try:
            alpha, beta, sigma = LinearRegressionCoefficients(Y_window, X_window)
        except np.linalg.LinAlgError:
            # Singular matrix: skip this window, but keep count so it is reported
            skipped += 1
            continue

        alphas_list.append(alpha)
        betas_list.append(beta)
        sigmas_list.append(sigma)
        dates.append(y_matrix.index[i])

    if skipped:
        warnings.warn(
            f"{skipped} window(s) skipped because of a singular matrix.",
            RuntimeWarning,
            stacklevel=2,
        )

    # Create DataFrames (columns are fixed so that an empty result keeps the assets)
    alphas_df = pd.DataFrame(alphas_list, index=dates, columns=y_matrix.columns)
    betas_df = pd.DataFrame(betas_list, index=dates, columns=y_matrix.columns)
    sigmas_df = pd.DataFrame(sigmas_list, index=dates, columns=y_matrix.columns)

    return alphas_df, betas_df, sigmas_df

In [173]:
# Rolling estimates for every asset, using the function's default window
alphas_series, betas_series, sigmas_series = RollingRegressionCoefficients(
    Y_Matrix, 
    Information_Matrix,
)

In [169]:
# Rolling alpha estimates, one column per asset
alphas_series

Unnamed: 0,AAPL,MSFT,WMT
2011-01-04,0.001229,-0.000725,-0.000064
2011-01-05,0.001262,-0.000695,-0.000003
2011-01-06,0.001339,-0.000699,-0.000026
2011-01-07,0.001368,-0.000521,-0.000052
2011-01-10,0.001390,-0.000562,-0.000016
...,...,...,...
2024-12-24,0.000225,-0.000448,0.002028
2024-12-26,0.000256,-0.000464,0.002060
2024-12-27,0.000296,-0.000455,0.002075
2024-12-30,0.000287,-0.000462,0.001996


In [170]:
# Rolling beta estimates, one column per asset
betas_series

Unnamed: 0,AAPL,MSFT,WMT
2011-01-04,1.055223,0.886700,0.389252
2011-01-05,1.055162,0.886525,0.389933
2011-01-06,1.055436,0.885569,0.388743
2011-01-07,1.056242,0.884544,0.389438
2011-01-10,1.055534,0.884560,0.389694
...,...,...,...
2024-12-24,0.953510,1.188254,0.265389
2024-12-26,0.954301,1.186106,0.278078
2024-12-27,0.955516,1.187171,0.279143
2024-12-30,0.957832,1.190148,0.287412


In [171]:
# Rolling residual standard deviations, one column per asset
sigmas_series

Unnamed: 0,AAPL,MSFT,WMT
2011-01-04,0.011823,0.009452,0.007594
2011-01-05,0.011827,0.009458,0.007566
2011-01-06,0.011773,0.009461,0.007583
2011-01-07,0.011764,0.009629,0.007598
2011-01-10,0.011773,0.009630,0.007590
...,...,...,...
2024-12-24,0.012006,0.008334,0.010682
2024-12-26,0.011997,0.008336,0.010743
2024-12-27,0.011991,0.008332,0.010739
2024-12-30,0.011992,0.008334,0.010753
