QuantSC Capstone Project

In [None]:
# 1. Imports & Setup
import pandas as pd
import numpy as np
# Install yfinance if not already installed
%pip install yfinance

import yfinance as yf
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

Note: you may need to restart the kernel to use updated packages.


In [None]:
# 2. Load Stock and Market Data
def fetch_stock_data(ticker, start, end):
    """Download daily prices for one ticker and compute daily log returns.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download`` (e.g. ``"PFE"`` or ``"^GSPC"``).
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Adj Close'`` and ``'log_return'``. Rows with a missing
        value in either column are dropped, which removes the first row
        because it has no previous price to compare against.

    Raises
    ------
    ValueError
        If the download returns no rows.
    """
    # Ask for the unadjusted layout explicitly: newer yfinance releases
    # default to auto_adjust=True, which omits the 'Adj Close' column.
    df = yf.download(ticker, start=start, end=end, auto_adjust=False)
    if df.empty:
        raise ValueError(
            f"No price data returned for {ticker!r} between {start} and {end}"
        )

    # Newer yfinance releases can return (field, ticker) MultiIndex columns
    # even for a single ticker; flatten to the level holding the field names
    # so the result has plain string columns.
    if isinstance(df.columns, pd.MultiIndex):
        for level in range(df.columns.nlevels):
            field_names = df.columns.get_level_values(level)
            if 'Adj Close' in field_names:
                df.columns = field_names
                break

    prices = df['Adj Close']
    df['log_return'] = np.log(prices / prices.shift(1))
    return df[['Adj Close', 'log_return']].dropna()

# Fetch the stock (PFE) first and the market index (^GSPC) second,
# both over the same 2010-2022 date window.
stock, market = (
    fetch_stock_data(symbol, "2010-01-01", "2022-12-31")
    for symbol in ("PFE", "^GSPC")
)


In [None]:
# 3. CAPM Market Adjustment
def compute_beta(stock_ret, market_ret):
    """Estimate the slope of stock returns regressed on market returns.

    The two series are placed side by side, rows where either value is
    missing are dropped, and an OLS model with an intercept is fitted with
    the stock series as the dependent variable.

    Parameters
    ----------
    stock_ret : pandas.Series
        Stock returns (dependent variable).
    market_ret : pandas.Series
        Market returns (regressor).

    Returns
    -------
    float
        The fitted coefficient on the market series.
    """
    aligned = pd.concat([stock_ret, market_ret], axis=1).dropna()
    # Select by position: both inputs may carry the same column name.
    X = sm.add_constant(aligned.iloc[:, 1])
    y = aligned.iloc[:, 0]
    model = sm.OLS(y, X).fit()
    # params is label-indexed (constant first, market column second), so the
    # positional lookup must go through .iloc; a bare integer key on a
    # label index is deprecated in pandas.
    return model.params.iloc[1]

def compute_market_adjusted_return(stock_ret, market_ret, beta):
    """Return the stock return minus ``beta`` times the market return.

    Parameters
    ----------
    stock_ret : scalar or pandas.Series
        Stock returns.
    market_ret : scalar or pandas.Series
        Market returns.
    beta : float
        Multiplier applied to the market return before subtraction.

    Returns
    -------
    Same kind as the inputs
        ``stock_ret - beta * market_ret``.
    """
    market_component = beta * market_ret
    adjusted = stock_ret - market_component
    return adjusted

# Estimate beta from the daily log returns, then store the
# market-adjusted return as a new column on the stock frame.
beta = compute_beta(
    stock_ret=stock['log_return'],
    market_ret=market['log_return'],
)
stock['adj_return'] = compute_market_adjusted_return(
    stock_ret=stock['log_return'],
    market_ret=market['log_return'],
    beta=beta,
)


In [None]:
# 4. Compute Monthly Drift and Volatility
# Tag every row with its calendar month so rows can be grouped by month.
stock['year_month'] = stock.index.to_period('M')

# drift = mean and volatility = standard deviation of the adjusted
# returns within each month.
monthly_stats = (
    stock
    .groupby('year_month')['adj_return']
    .agg(drift='mean', volatility='std')
)

# Convert the monthly period index to timestamps.
monthly_stats.index = monthly_stats.index.to_timestamp()


In [None]:
# 5. Load Patent Data
# Read the patent table, parsing the 'date' column as datetimes.
patents = pd.read_csv(
    "biopharma_patent_assignments.csv",
    parse_dates=['date'],
)

# Sum the three count columns over all rows falling in the same month.
patents['month'] = patents['date'].dt.to_period('M')
monthly_patents = (
    patents
    .groupby('month')[['num_patents', 'num_categories', 'num_new_categories']]
    .sum()
)

# Convert the monthly period index to timestamps.
monthly_patents.index = monthly_patents.index.to_timestamp()


In [None]:
# 6. Merge Datasets
# Names of the patent columns used as regressors.
features = ['num_patents', 'num_categories', 'num_new_categories']

# Keep only months present in both tables, then drop rows with any
# missing value.
# NOTE(review): the inner join drops any month absent from monthly_patents,
# and create_lagged_features shifts by row, so a lag of k rows equals k
# calendar months only if no month is missing here -- confirm.
df = monthly_stats.join(monthly_patents, how='inner').dropna()

target_drift, target_volatility = df['drift'], df['volatility']


In [None]:
# 7. Lagged Regression with OLS
def create_lagged_features(df, features, max_lag):
    """Build a frame of row-shifted copies of the selected columns.

    For every name in ``features`` and every lag from 0 to ``max_lag``
    inclusive, the result holds a column ``'<name>_lag<k>'`` equal to
    ``df[name]`` shifted down by ``k`` rows. Columns are ordered by feature
    first, then by lag. Rows containing any missing value are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; its index is reused for the result.
    features : iterable of str
        Column names of ``df`` to lag.
    max_lag : int
        Largest shift, in rows.

    Returns
    -------
    pandas.DataFrame
        The lagged columns with incomplete rows removed.
    """
    lag_specs = [(name, k) for name in features for k in range(max_lag + 1)]

    lagged = pd.DataFrame(index=df.index)
    for name, k in lag_specs:
        lagged[f'{name}_lag{k}'] = df[name].shift(k)

    return lagged.dropna()

# Regressors: lags 0 through 3 of each patent feature, plus an intercept.
X_lagged = create_lagged_features(df, features, max_lag=3)
X_lagged_const = sm.add_constant(X_lagged)

# Restrict both targets to the rows that survived the lagging step.
y_drift, y_vol = (
    target_drift.loc[X_lagged.index],
    target_volatility.loc[X_lagged.index],
)

# One OLS fit per target on the same design matrix.
drift_model = sm.OLS(y_drift, X_lagged_const).fit()
vol_model = sm.OLS(y_vol, X_lagged_const).fit()

print(drift_model.summary())
print(vol_model.summary())


In [None]:
# 8. Lasso Regression (optional for feature selection)
# Standardise the lagged regressors before fitting the Lasso models.
# NOTE(review): the scaler is fitted on every row before cross-validation,
# so each fold's scaling uses statistics from its held-out rows -- confirm
# this is acceptable for time-ordered data.
scaler = StandardScaler().fit(X_lagged)
X_scaled = scaler.transform(X_lagged)

# One cross-validated Lasso per target (cv=5).
lasso_drift = LassoCV(cv=5)
lasso_drift.fit(X_scaled, y_drift)

lasso_vol = LassoCV(cv=5)
lasso_vol.fit(X_scaled, y_vol)

print("Lasso - Drift Coefficients:", lasso_drift.coef_)
print("Lasso - Volatility Coefficients:", lasso_vol.coef_)


In [None]:
# 9. Visualization
# Two stacked panels: monthly drift on top, monthly volatility below.
fig, axs = plt.subplots(2, 1, figsize=(12, 6))

axs[0].plot(df.index, df['drift'], label='Drift')
axs[0].set_title("Monthly Drift")
axs[0].set_ylabel("Mean daily adjusted return")
# The label= passed to plot() is only shown once a legend is drawn.
axs[0].legend()

axs[1].plot(df.index, df['volatility'], label='Volatility', color='orange')
axs[1].set_title("Monthly Volatility")
axs[1].set_ylabel("Std. dev. of daily adjusted return")
axs[1].set_xlabel("Month")
axs[1].legend()

plt.tight_layout()
plt.show()
