# Method - OLS

# Independent variable - Return
## $\text{Spread}_t = \beta_0 + \beta_1 \cdot \text{Return}_t + \epsilon_t$

In [6]:
import pandas as pd
import statsmodels.api as sm

# Load the training and testing spread tables from the outputs folder.
# Dates are left as plain strings here (no parse_dates).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('outputs/spreads_weekly_large.csv', 'outputs/spreads_testing.csv')
)

# Preview the first rows of the training table.
train_data.head()

Unnamed: 0,Date,Ticker Pair,Spread,Return
0,2016-01-10,AAPL-MSFT,-0.544055,
1,2016-01-17,AAPL-MSFT,-0.165858,0.02736
2,2016-01-24,AAPL-MSFT,0.080579,0.018672
3,2016-01-31,AAPL-MSFT,-1.192595,-0.093776
4,2016-02-07,AAPL-MSFT,-0.312257,0.060624


In [7]:
# List the column labels of the training frame as a quick schema check.
train_data.columns

Index(['Date', 'Ticker Pair', 'Spread', 'Return'], dtype='object')

In [8]:
# The first four characters of the Date string are taken as the year;
# printing the distinct values shows which years the training table covers.
train_data['Year'] = train_data['Date'].str.slice(stop=4)
print(train_data['Year'].unique())

['2016' '2017' '2018']


In [12]:
# Build one (pair, train, test) tuple per ticker pair for the Return model.
results = []

for pair in train_data["Ticker Pair"].unique():
    # Training rows need both the regressor and the target.
    train = (
        train_data.loc[train_data["Ticker Pair"] == pair]
        .copy()
        .dropna(subset=["Return", "Spread"])
    )
    # Test rows only need the regressor; the target is what gets predicted.
    test = (
        test_data.loc[test_data["Ticker Pair"] == pair]
        .copy()
        .dropna(subset=["Return"])
    )

    # Keep the pair only with at least 10 training rows and 1 test row.
    if len(train) >= 10 and len(test) >= 1:
        results.append((pair, train, test))

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

# Fit one model per ticker pair (Spread regressed on Return of the same row)
# and collect the test-set predictions.
final_predictions = []
threshold = 0  # not read anywhere in this cell

for pair, train, test in results:
    # Standardise the single regressor, then fit a linear regression on it.
    model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ])
    model.fit(train[["Return"]], train["Spread"])

    # Predictions are written onto the test frame held in `results`.
    test["Traditional_Spread"] = model.predict(test[["Return"]])
    test["Ticker Pair"] = pair

    final_predictions.append(test.loc[:, ["Date", "Ticker Pair", "Traditional_Spread"]])

In [17]:
# Stack the per-pair prediction frames into one table with a fresh 0..n-1 index.
all_predictions = pd.concat(final_predictions, axis=0, ignore_index=True)

# Persist the Return-model predictions without the index column.
all_predictions.to_csv(
    "outputs/Traditional Spreads weekly Return.csv",
    index=False,
)

print(all_predictions.head(5))

         Date Ticker Pair  Traditional_Spread
0  2019-01-13   AAPL-MSFT            0.176369
1  2019-01-20   AAPL-MSFT           -0.147277
2  2019-01-27   AAPL-MSFT            0.108935
3  2019-02-03   AAPL-MSFT            0.863532
4  2019-02-10   AAPL-MSFT            0.008510


In [20]:
# Re-read the exported Return-model file and report its dimensions.
df = pd.read_csv("outputs/Traditional Spreads weekly Return.csv")

print("rows：", len(df))
print("columns：", len(df.columns))

rows： 510
columns： 3


# Independent variable - Spread(t-1)
## $\text{Spread}_t = \beta_0 + \beta_1 \cdot \text{Spread}_{t-1} + \epsilon_t$

In [22]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Reload both spread tables for the lagged-spread model, this time parsing
# the Date column into datetimes.
train_df, test_df = [
    pd.read_csv(csv_path, parse_dates=["Date"])
    for csv_path in ("outputs/spreads_weekly_large.csv", "outputs/spreads_testing.csv")
]

In [24]:
# Build one (pair, train, test_clean) tuple per ticker pair for the
# lagged-spread model, where the regressor is the previous row's Spread.
# Rows are assumed to already be in chronological order within each pair;
# nothing here sorts them.
pairs = train_df["Ticker Pair"].unique()

results = []

for pair in pairs:
    train = train_df[train_df["Ticker Pair"] == pair].copy()
    test = test_df[test_df["Ticker Pair"] == pair].copy()

    train["Spread_lag1"] = train["Spread"].shift(1)
    # Only the target and its lag matter for this model; a NaN in an unrelated
    # column (e.g. Return) must not cost a training row.
    train = train.dropna(subset=["Spread", "Spread_lag1"])

    # Same minimum-data rule as the Return section. It also guarantees that
    # train has a last row to seed the first test lag, and that an empty
    # frame never reaches fit/predict.
    if len(train) < 10 or len(test) < 1:
        continue

    # Previous spread for each test row: the last training spread followed by
    # every test spread except the final one (same length as `test`).
    previous_spread = pd.concat(
        [train["Spread"].tail(1), test["Spread"]], ignore_index=True
    ).iloc[:-1]

    test_clean = test[["Date", "Ticker Pair"]].copy()
    test_clean["Spread_lag1"] = previous_spread.values

    # A missing test Spread yields a NaN lag on the following row, which the
    # regressor cannot score; drop such rows instead of failing at predict time.
    test_clean = test_clean.dropna(subset=["Spread_lag1"])
    if test_clean.empty:
        continue

    results.append((pair, train, test_clean))

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

# Fit one model per ticker pair (Spread regressed on the previous Spread)
# and collect the test frames with their predictions attached.
final_predictions = []

threshold = 1  # not read anywhere in this cell

for pair, train, test_clean in results:
    # Standardise the lagged spread, then fit a linear regression on it.
    lag_model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ])
    lag_model.fit(train[["Spread_lag1"]], train["Spread"])

    # Predictions are written onto the frame held in `results`.
    test_clean["Predicted_Spread"] = lag_model.predict(test_clean[["Spread_lag1"]])

    final_predictions.append(test_clean)

In [28]:
# Stack the per-pair frames, give the prediction column the same name used by
# the Return-model export, and drop the helper lag feature.
all_predictions = (
    pd.concat(final_predictions, ignore_index=True)
    .rename(columns={"Predicted_Spread": "Traditional_Spread"})
    .drop(columns=["Spread_lag1"])
)

# Persist the lagged-spread predictions without the index column.
all_predictions.to_csv("outputs/Traditional Spreads weekly (t-1).csv", index=False)

print(all_predictions.head(5))

        Date Ticker Pair  Traditional_Spread
0 2019-01-06   AAPL-MSFT           -2.919420
1 2019-01-13   AAPL-MSFT           -1.101712
2 2019-01-20   AAPL-MSFT           -0.868813
3 2019-01-27   AAPL-MSFT           -1.089315
4 2019-02-03   AAPL-MSFT           -0.949445


In [30]:
# Re-read the exported lagged-spread file and report its dimensions.
df = pd.read_csv("outputs/Traditional Spreads weekly (t-1).csv")

print("rows：", len(df))
print("columns：", len(df.columns))

rows： 520
columns： 3


# Note
Although we fitted both the return-based model and the lagged-spread (t−1) model to predict spreads, we ultimately adopted the return-based model's predictions. The return model ties each predicted spread to that week's market return, so it brings market information into the forecast and responds to current market changes rather than only to the spread's own history; we judged that this improves prediction accuracy and adaptability. In contrast, the t−1 model relies solely on the previous week's spread: it cannot adapt to market shifts, ignores the drivers of spread dynamics, and essentially repeats past data rather than providing meaningful forecasts. Thus, we chose the return-based approach as the more predictive of the two. Note that the two exported files differ in length (510 vs. 520 rows) because test rows with a missing Return are dropped before the return model is applied.