In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
import time
import os

In [3]:
os.makedirs('outputs', exist_ok=True)

stocks = ['AMD', 'NVDA', 'META', 'TSLA']
start = datetime(2015, 1, 1)
end = datetime(2018, 12, 31)

data = yf.download(stocks, start=start, end=end, auto_adjust=False)

data.head()

[*********************100%***********************]  4 of 4 completed


Price,Adj Close,Adj Close,Adj Close,Adj Close,Close,Close,Close,Close,High,High,...,Low,Low,Open,Open,Open,Open,Volume,Volume,Volume,Volume
Ticker,AMD,META,NVDA,TSLA,AMD,META,NVDA,TSLA,AMD,META,...,NVDA,TSLA,AMD,META,NVDA,TSLA,AMD,META,NVDA,TSLA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,2.67,78.082001,0.483099,14.620667,2.67,78.449997,0.50325,14.620667,2.67,78.93,...,0.49525,14.217333,2.67,78.580002,0.50325,14.858,0,18177500,113680000,71466000
2015-01-05,2.66,76.827904,0.474939,14.006,2.66,77.190002,0.49475,14.006,2.7,79.25,...,0.4925,13.810667,2.67,77.980003,0.50325,14.303333,8878200,26452200,197952000,80527500
2015-01-06,2.63,75.792786,0.46054,14.085333,2.63,76.150002,0.47975,14.085333,2.66,77.589996,...,0.47925,13.614,2.65,77.230003,0.4955,14.004,13912500,27399300,197764000,93928500
2015-01-07,2.58,75.792786,0.45934,14.063333,2.58,76.150002,0.4785,14.063333,2.65,77.360001,...,0.477,13.985333,2.63,76.760002,0.48325,14.223333,12377600,22045300,321808000,44526000
2015-01-08,2.61,77.813271,0.476619,14.041333,2.61,78.18,0.4965,14.041333,2.65,78.230003,...,0.48375,14.000667,2.59,76.739998,0.484,14.187333,11136600,23961000,283780000,51637500


In [4]:
stock_prices = data.resample('ME').last()
stock_prices.index = stock_prices.index.tz_localize(None)

In [5]:
print(data.columns)

MultiIndex([('Adj Close',  'AMD'),
            ('Adj Close', 'META'),
            ('Adj Close', 'NVDA'),
            ('Adj Close', 'TSLA'),
            (    'Close',  'AMD'),
            (    'Close', 'META'),
            (    'Close', 'NVDA'),
            (    'Close', 'TSLA'),
            (     'High',  'AMD'),
            (     'High', 'META'),
            (     'High', 'NVDA'),
            (     'High', 'TSLA'),
            (      'Low',  'AMD'),
            (      'Low', 'META'),
            (      'Low', 'NVDA'),
            (      'Low', 'TSLA'),
            (     'Open',  'AMD'),
            (     'Open', 'META'),
            (     'Open', 'NVDA'),
            (     'Open', 'TSLA'),
            (   'Volume',  'AMD'),
            (   'Volume', 'META'),
            (   'Volume', 'NVDA'),
            (   'Volume', 'TSLA')],
           names=['Price', 'Ticker'])


In [6]:
stock_prices = data.filter(like='Adj Close')

stock_prices.columns = stock_prices.columns.get_level_values(1)

print(stock_prices.head())

Ticker       AMD       META      NVDA       TSLA
Date                                            
2015-01-02  2.67  78.082001  0.483099  14.620667
2015-01-05  2.66  76.827904  0.474939  14.006000
2015-01-06  2.63  75.792786  0.460540  14.085333
2015-01-07  2.58  75.792786  0.459340  14.063333
2015-01-08  2.61  77.813271  0.476619  14.041333


In [7]:
print(stock_prices.columns)

Index(['AMD', 'META', 'NVDA', 'TSLA'], dtype='object', name='Ticker')


In [8]:
pairs = [('AMD', 'NVDA'), ('AMD', 'META'), ('NVDA', 'TSLA')]

spread_list = []

for ticker_1, ticker_2 in pairs:
    spread = np.log(stock_prices[ticker_1]) - np.log(stock_prices[ticker_2])
    spread_mean = spread.mean()
    spread_std = spread.std()
    z_spread = (spread - spread_mean) / spread_std

    pair_df = z_spread.reset_index()
    pair_df['Ticker Pair'] = f'{ticker_1}-{ticker_2}'
    pair_df.columns = ['Date', 'Spread', 'Ticker Pair']
    spread_list.append(pair_df)

spread_df = pd.concat(spread_list)

print(spread_df.head())

        Date    Spread Ticker Pair
0 2015-01-02  1.453322    AMD-NVDA
1 2015-01-05  1.495809    AMD-NVDA
2 2015-01-06  1.558008    AMD-NVDA
3 2015-01-07  1.504956    AMD-NVDA
4 2015-01-08  1.423816    AMD-NVDA


In [9]:
stock_return = stock_prices.pct_change()
return_spread_list = []

for ticker_1, ticker_2 in pairs:
    spread = stock_return[ticker_1] - stock_return[ticker_2]
    
    pair_df = spread.reset_index()
    pair_df['Ticker Pair'] = f'{ticker_1}-{ticker_2}'
    pair_df.columns = ['Date', 'Return', 'Ticker Pair']
    return_spread_list.append(pair_df)

return_spread_df = pd.concat(return_spread_list)

print(return_spread_df.head())

        Date    Return Ticker Pair
0 2015-01-02       NaN    AMD-NVDA
1 2015-01-05  0.013145    AMD-NVDA
2 2015-01-06  0.019040    AMD-NVDA
3 2015-01-07 -0.016406    AMD-NVDA
4 2015-01-08 -0.025990    AMD-NVDA


In [10]:
# Merge price spread (spread_df) and return spread (return_spread_df)
final_df = spread_df.merge(
    return_spread_df,
    how='inner',
    on=['Date', 'Ticker Pair'],
    validate='one_to_one'
)

print(final_df.head())

        Date    Spread Ticker Pair    Return
0 2015-01-02  1.453322    AMD-NVDA       NaN
1 2015-01-05  1.495809    AMD-NVDA  0.013145
2 2015-01-06  1.558008    AMD-NVDA  0.019040
3 2015-01-07  1.504956    AMD-NVDA -0.016406
4 2015-01-08  1.423816    AMD-NVDA -0.025990


In [11]:
print(final_df.head())

print("final_df already prepare！")

        Date    Spread Ticker Pair    Return
0 2015-01-02  1.453322    AMD-NVDA       NaN
1 2015-01-05  1.495809    AMD-NVDA  0.013145
2 2015-01-06  1.558008    AMD-NVDA  0.019040
3 2015-01-07  1.504956    AMD-NVDA -0.016406
4 2015-01-08  1.423816    AMD-NVDA -0.025990
final_df already prepare！


# Method 1 - OLS

In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score, mean_squared_error

df = pd.read_csv('outputs/spreads.csv')
df['Date'] = pd.to_datetime(df['Date'])

In [14]:
df.head()

Unnamed: 0,Date,Ticker Pair,Spread,Return
0,2019-02-28,AMD-NVDA,-1.319546,
1,2019-03-31,AMD-NVDA,-2.01662,-0.079436
2,2019-04-30,AMD-NVDA,-1.311965,0.074661
3,2019-05-31,AMD-NVDA,1.45613,0.242785
4,2019-06-30,AMD-NVDA,0.568124,-0.104398


In [15]:
train_data = final_df.dropna(subset=['Spread', 'Return'])

X_train = train_data[['Spread']]   
y_train = train_data['Return']      

X_predict = df[['Spread']]         

In [16]:
from sklearn.linear_model import LinearRegression

ols_model = LinearRegression()

ols_model.fit(X_train, y_train)

In [17]:
y_pred = ols_model.predict(X_predict)

df['Predicted Return'] = y_pred

print(df[['Date', 'Ticker Pair', 'Spread', 'Predicted Return']].head())

        Date Ticker Pair    Spread  Predicted Return
0 2019-02-28    AMD-NVDA -1.319546         -0.000532
1 2019-03-31    AMD-NVDA -2.016620         -0.001516
2 2019-04-30    AMD-NVDA -1.311965         -0.000522
3 2019-05-31    AMD-NVDA  1.456130          0.003385
4 2019-06-30    AMD-NVDA  0.568124          0.002132


In [18]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

X = X_train.reset_index(drop=True)
y = y_train.reset_index(drop=True)

tscv = TimeSeriesSplit(n_splits=5)

y_true_all = []
y_pred_all = []

for train_idx, test_idx in tscv.split(X):
    X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
    y_train_cv, y_test_cv = y.iloc[train_idx], y.iloc[test_idx]

    model = LinearRegression()
    model.fit(X_train_cv, y_train_cv)

    y_pred_cv = model.predict(X_test_cv)

    y_true_all.extend(y_test_cv)
    y_pred_all.extend(y_pred_cv)

y_true_all = np.array(y_true_all)
y_pred_all = np.array(y_pred_all)

r2 = r2_score(y_true_all, y_pred_all)
mse = mean_squared_error(y_true_all, y_pred_all)
returns = y_pred_all
sharpe_ratio = np.mean(returns) / np.std(returns)

print(f"Cross-Validated R²: {r2:.4f}")
print(f"Cross-Validated MSE: {mse:.6f}")
print(f"Cross-Validated Sharpe Ratio: {sharpe_ratio:.4f}")

Cross-Validated R²: -0.0083
Cross-Validated MSE: 0.001393
Cross-Validated Sharpe Ratio: 0.0649


In [19]:
submission_df = df.copy()

submission_df = submission_df.rename(columns={'Predicted Return': 'Traditional Spread'})

submission_df['Traditional Position'] = submission_df['Traditional Spread'].apply(lambda x: 'Sell' if x > 0 else 'Buy')

submission_df = submission_df[['Date', 'Ticker Pair', 'Traditional Spread', 'Traditional Position']]

os.makedirs('outputs', exist_ok=True)
submission_df.to_csv('outputs/traditional_dummy_1.csv', index=False)

print("outputs/traditional_dummy_1.csv saved!")

outputs/traditional_dummy_1.csv saved!


# Method 2 - Mean Reversion

In [21]:
print(final_df.head())
print(df.head())

        Date    Spread Ticker Pair    Return
0 2015-01-02  1.453322    AMD-NVDA       NaN
1 2015-01-05  1.495809    AMD-NVDA  0.013145
2 2015-01-06  1.558008    AMD-NVDA  0.019040
3 2015-01-07  1.504956    AMD-NVDA -0.016406
4 2015-01-08  1.423816    AMD-NVDA -0.025990
        Date Ticker Pair    Spread    Return  Predicted Return
0 2019-02-28    AMD-NVDA -1.319546       NaN         -0.000532
1 2019-03-31    AMD-NVDA -2.016620 -0.079436         -0.001516
2 2019-04-30    AMD-NVDA -1.311965  0.074661         -0.000522
3 2019-05-31    AMD-NVDA  1.456130  0.242785          0.003385
4 2019-06-30    AMD-NVDA  0.568124 -0.104398          0.002132


In [22]:
import pandas as pd

df = pd.read_csv('outputs/spreads.csv')

df['Date'] = pd.to_datetime(df['Date'])

df.head()

Unnamed: 0,Date,Ticker Pair,Spread,Return
0,2019-02-28,AMD-NVDA,-1.319546,
1,2019-03-31,AMD-NVDA,-2.01662,-0.079436
2,2019-04-30,AMD-NVDA,-1.311965,0.074661
3,2019-05-31,AMD-NVDA,1.45613,0.242785
4,2019-06-30,AMD-NVDA,0.568124,-0.104398


In [23]:
mean_spread = final_df['Spread'].mean()

print(f"Long-term mean spread (2015-2018): {mean_spread:.4f}")

Long-term mean spread (2015-2018): -0.0000


In [24]:
df['Traditional Spread'] = df['Spread']

df['Traditional Position'] = df['Traditional Spread'].apply(
    lambda x: 'Sell' if x > mean_spread else 'Buy'
)

print(df[['Date', 'Ticker Pair', 'Traditional Spread', 'Traditional Position']].head())

        Date Ticker Pair  Traditional Spread Traditional Position
0 2019-02-28    AMD-NVDA           -1.319546                  Buy
1 2019-03-31    AMD-NVDA           -2.016620                  Buy
2 2019-04-30    AMD-NVDA           -1.311965                  Buy
3 2019-05-31    AMD-NVDA            1.456130                 Sell
4 2019-06-30    AMD-NVDA            0.568124                 Sell


In [25]:
submission_df = df[['Date', 'Ticker Pair', 'Traditional Spread', 'Traditional Position']]

import os
os.makedirs('outputs', exist_ok=True)
submission_df.to_csv('outputs/traditional_dummy_2.csv', index=False)

print("outputs/traditional_dummy_2.csv saved successfully!")

outputs/traditional_dummy_2.csv saved successfully!


# Method 3 - Ridge Regression

In [27]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error

df = pd.read_csv('outputs/spreads.csv')
df['Date'] = pd.to_datetime(df['Date'])

df.head()

Unnamed: 0,Date,Ticker Pair,Spread,Return
0,2019-02-28,AMD-NVDA,-1.319546,
1,2019-03-31,AMD-NVDA,-2.01662,-0.079436
2,2019-04-30,AMD-NVDA,-1.311965,0.074661
3,2019-05-31,AMD-NVDA,1.45613,0.242785
4,2019-06-30,AMD-NVDA,0.568124,-0.104398


In [28]:
final_df = final_df.sort_values('Date').reset_index(drop=True)

final_df['Spread_lag1'] = final_df['Spread'].shift(1)

final_df = final_df.dropna().reset_index(drop=True)

X_train = final_df[['Spread_lag1']]  
y_train = final_df['Spread']         

print(X_train.head())
print(y_train.head())

   Spread_lag1
0    -1.515811
1     1.495809
2    -0.681149
3    -1.483302
4     1.558008
0    1.495809
1   -0.681149
2   -1.483302
3    1.558008
4   -0.677588
Name: Spread, dtype: float64


In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict

ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = cross_val_score(ridge_pipeline, X_train, y_train, cv=kf, scoring='r2')
mean_r2 = np.mean(r2_scores)

mse_scores = cross_val_score(ridge_pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
mean_mse = -np.mean(mse_scores)

y_pred_cv = cross_val_predict(ridge_pipeline, X_train, y_train, cv=kf)

pseudo_returns = y_train - y_pred_cv

mean_return = np.mean(pseudo_returns)
std_return = np.std(pseudo_returns)
sharpe_ratio = mean_return / std_return

print(f"Cross-Validated R²: {mean_r2:.4f}")
print(f"Cross-Validated MSE: {mean_mse:.6f}")
print(f"Cross-Validated Sharpe Ratio: {sharpe_ratio:.4f}")

Cross-Validated R²: 0.0361
Cross-Validated MSE: 0.961688
Cross-Validated Sharpe Ratio: 0.0002


In [30]:
ridge_pipeline.fit(X_train, y_train)

df = df.sort_values('Date').reset_index(drop=True)
df['Spread_lag1'] = df['Spread'].shift(1)

df = df.dropna().reset_index(drop=True)

X_pred = df[['Spread_lag1']]
df['Spread'] = ridge_pipeline.predict(X_pred)

In [31]:
df = df.rename(columns={'Spread': 'Traditional Spread'})

df['Traditional Position'] = df['Traditional Spread'].apply(lambda x: 'Sell' if x > 0 else 'Buy')

df[['Date', 'Ticker Pair', 'Traditional Spread', 'Traditional Position']].to_csv('outputs/traditional_dummy_3.csv', index=False)

print(" traditional_dummy_3.csv saved！")

 traditional_dummy_3.csv saved！
