In [1]:
import pandas as pd 
import numpy as np
import warnings 

warnings.filterwarnings('ignore')

from talib import RSI, BBANDS, MACD, ATR

### Loading and processing data

In [2]:
START = '2013-01-01'
END = '2017-12-31'

idx = pd.IndexSlice
DATA_STORE = 'assets.h5'

with pd.HDFStore(DATA_STORE) as store: 

    # Loading our prices from the data store and doing some manipulations on it
    prices = store['quandl/wiki/prices'].loc[idx[START:END, :], ['adj_open', 'adj_close', 'adj_low', 'adj_high', 'adj_volume']].rename(columns = lambda x: x.replace('adj_', "")).swaplevel().sort_index()
    stocks = store['us_equities/stocks'].loc[:, ['marketcap', 'ipoyear', 'sector']]

In [3]:
# Removing tickers that do not have more than 2 years worth of data 
MONTH = 21 
YEAR = 12 * MONTH 
min_observations = 2 * YEAR

n_obs = prices.groupby(level = 'ticker').size()
keep = n_obs[n_obs >= min_observations].index
prices = prices.loc[idx[keep, :], :]

In [4]:
# Cleaning up sector names and making sure that we have both price and sector information
stocks = stocks[~stocks.index.duplicated() & stocks.sector.notnull()]
stocks.sector = stocks.sector.str.lower().str.replace(" ", "_")
stocks.index.name = 'ticker'

shared = prices.index.get_level_values('ticker').unique().intersection(stocks.index)
stocks = stocks.loc[shared, :]
prices = prices.loc[idx[shared, :], :]

In [5]:
prices['dollar_vol'] = prices.loc[:, 'close'].mul(prices.loc[:, 'volume'], axis = 0)

# Computing a 21-day rolling average of the Dollar Volume for that day and ranking them across date
prices['dollar_vol'] = prices.groupby('ticker').dollar_vol.rolling(window = 21).mean().reset_index(level = 0, drop = True)
prices['dollar_vol_rank'] = prices.groupby('date').dollar_vol.rank(ascending=False)

### Creating the features for the model

In [6]:
prices['rsi'] = prices.groupby(level = 'ticker').close.apply(RSI)

In [7]:
def compute_bb(close): 
    high, _, low = BBANDS(close)

    return pd.DataFrame({'bb_high': high, 'bb_low': low}, index = close.index)

def compute_atr(stock_data): 

    df = ATR(stock_data.high, stock_data.low, stock_data.close, timeperiod=14)
    return df.sub(df.mean()).div(df.std())

def compute_macd(close): 

    macd = MACD(close)[0]

    return (macd - np.mean(macd))/(np.std(macd))


In [8]:
prices = prices.join(prices.groupby(level = 'ticker').close.apply(compute_bb))

In [9]:
prices['bb_high'] = prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
prices['bb_low'] = prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)

In [10]:
prices['atr'] = prices.groupby('ticker', group_keys=False).apply(compute_atr)

In [11]:
prices['macd'] = prices.groupby('ticker', group_keys=False).close.apply(lambda x: MACD(x)[0])

In [12]:
# Adding lagged returns
q = 0.0001 
lags = [1, 5, 10, 21, 42, 63]

for lag in lags: 
    prices[f'return_{lag}d'] = prices.groupby(level = 'ticker').close.pct_change(lag).pipe(lambda x: x.clip(lower = x.quantile(q), upper = x.quantile(1-q))).add(1).pow(1/lag).sub(1)

In [13]:
for t in [1, 2, 3, 4, 5]: 
    for lag in [1, 5, 10, 21]: 
        prices[f'return_{lag}d_lag{t}'] = prices.groupby(level = 'ticker')[f'return_{lag}d'].shift(t * lag)

In [14]:
# Generating our targets
for t in [1, 5, 10, 21]: 
    prices[f'target_{t}d'] = prices.groupby(level = 'ticker')[f'return_{t}d'].shift(-t)

In [18]:
# Generating features around month and year
prices['year'] = prices.index.get_level_values('date').year 
prices['month'] = prices.index.get_level_values('date').month 

prices = prices.join(stocks[['sector']])

In [19]:
prices = pd.get_dummies(prices, columns = ['year', 'month', 'sector'], prefix = ['year', 'month', ''], prefix_sep=['_', "_", ""], drop_first=True)

In [21]:
prices.shape

(2904233, 68)

So far the features that we've created from these prices are: 
* 21-day moving average of the dollar volume (volume * close price)
* 14 period RSI 
* Bollinger bands feature where we take the difference between closing price and the bollinger bands at high and low 
* MACD 
* ATR - Normalized 
* Lagged historical returns at different lags and lengths
    * Did the geometric average of the returns over different lengths to make them comparable


In [36]:
data = prices[prices.dollar_vol_rank < 100]
data.dropna(inplace=True)

In [37]:
y = data.filter(like = 'target')
X = data.drop(y.columns, axis = 1)

In [38]:
# Doing a simple linear regression using statsmodels 
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

In [39]:
target = 'target_5d'
model = OLS(endog = y[target], exog = add_constant(data = X))

trained_model = model.fit() 


In [40]:
trained_model.summary()

0,1,2,3
Dep. Variable:,target_5d,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,44.15
Date:,"Sat, 22 Jul 2023",Prob (F-statistic):,0.0
Time:,17:38:52,Log-Likelihood:,378470.0
No. Observations:,109675,AIC:,-756800.0
Df Residuals:,109611,BIC:,-756200.0
Df Model:,63,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0005,0.000,-1.833,0.067,-0.001,3.68e-05
open,-7.303e-05,2.31e-05,-3.165,0.002,-0.000,-2.78e-05
close,-0.0001,2.55e-05,-4.385,0.000,-0.000,-6.18e-05
low,8.241e-05,2.38e-05,3.461,0.001,3.57e-05,0.000
high,0.0001,2.54e-05,3.997,0.000,5.18e-05,0.000
volume,-3.535e-12,1.86e-12,-1.901,0.057,-7.18e-12,1.1e-13
dollar_vol,2.34e-13,5.17e-14,4.523,0.000,1.33e-13,3.35e-13
dollar_vol_rank,2.196e-06,1.07e-06,2.044,0.041,9.04e-08,4.3e-06
rsi,-1.446e-05,4.02e-06,-3.593,0.000,-2.23e-05,-6.57e-06

0,1,2,3
Omnibus:,43800.026,Durbin-Watson:,0.437
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2390166.935
Skew:,-1.143,Prob(JB):,0.0
Kurtosis:,25.756,Cond. No.,139000000000000.0


### Doing Ridge Regression on this dataset

In [43]:
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

ridge_alphas = np.logspace(-4, 4, 9)
ridge_alphas = sorted(list(ridge_alphas) + list(ridge_alphas * 5))

train_period_length = 63 
test_period_length = 10

In [44]:
for alpha in ridge_alphas: 
    model = Ridge(alpha = alpha, fit_intercept=False, random_state=42)

    pipe = Pipeline([
        ('scaler', StandardScaler()), 
        ('model', model)
    ])

    for 

NameError: name 'R' is not defined