In [83]:
from pandas_datareader import data as pdr
import numpy as np
from numpy import arange
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV as rcv
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from IPython import get_ipython
import datetime
import yfinance as yf
import csv

In [84]:
# Base frame: daily ETH-USD quotes from 2020-01-01 through today, reduced to the
# 'Open' column. Its index is what every join below aligns against.
# NOTE(review): yfinance is imported by this notebook but the download goes through
# pandas_datareader's 'yahoo' source — confirm that reader still works in the
# target environment.
eth_quotes = pdr.DataReader('ETH-USD', data_source='yahoo', start='2020-01-01', end=datetime.date.today())
df = eth_quotes[['Open']]

# SECURITY FEATURES
# For each ticker, left-join its Close and Volume columns, prefixed with the ticker
# symbol so that column names stay unique.
security_features = ['SPY', 'VXX', 'BTC-USD', 'QQQ', 'XLP', 'TSLA', 'GLD', 'TBT',
                     'DOGE-USD', 'LTC-USD', 'XRP-USD', 'ADA-USD', 'LTPZ']
for val in security_features:
    ticker_quotes = pdr.DataReader(val, data_source='yahoo', start='2020-01-01', end=datetime.date.today())
    ticker_columns = ticker_quotes[['Close','Volume']].rename(columns={"Close":val+"Close", "Volume":val+"Volume"})
    df = df.join(ticker_columns)

# Discard the final row before adding the FRED series
# (presumably today's still-incomplete observation — TODO confirm).
df = df[:-1]

# FRED FEATURES
# Each FRED series is left-joined on the same index; dates without an observation
# in the series stay NaN.
fred_features = ['GS10', 'CPIAUCSL']
for val in fred_features:
    fred_series = pdr.get_data_fred(val, start='2020-01-01')
    df = df.join(fred_series)

df

Unnamed: 0_level_0,Open,SPYClose,SPYVolume,VXXClose,VXXVolume,BTC-USDClose,BTC-USDVolume,QQQClose,QQQVolume,XLPClose,...,LTC-USDClose,LTC-USDVolume,XRP-USDClose,XRP-USDVolume,ADA-USDClose,ADA-USDVolume,LTPZClose,LTPZVolume,GS10,CPIAUCSL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,129.630661,,,,,7200.174316,18565664997,,,,...,42.017719,2782992323,0.192667,1041134003,0.033458,22948374,,,1.76,258.687
2020-01-02,130.820038,324.869995,59151200.0,58.040001,7078100.0,6985.470215,20802083465,216.160004,30969400.0,62.480000,...,39.823013,2759827139,0.188043,1085351426,0.032751,20843934,73.199997,136900.0,,
2020-01-03,127.411263,322.410004,77709700.0,61.160000,12039150.0,7344.884277,28111481032,214.179993,27518900.0,62.380001,...,42.415573,3260961326,0.193521,1270017043,0.034180,30162644,74.070000,113200.0,,
2020-01-04,134.168518,,,,,7410.656738,18444271275,,,,...,43.326607,2843192897,0.194355,999331594,0.034595,29535781,,,,
2020-01-05,135.072098,,,,,7411.317383,19725074095,,,,...,43.553207,3017148033,0.195537,1168067557,0.034721,21479178,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-09,3091.696289,,,,,41911.601562,21294384372,,,,...,131.298325,710682825,0.754505,1182503166,1.173860,1088569809,,,,
2022-01-10,3157.570557,465.510010,119362000.0,18.469999,92378700.0,41821.261719,32104232331,380.109985,91536500.0,76.809998,...,126.833824,1097888042,0.739335,2014043704,1.128052,1481621292,85.709999,255600.0,,
2022-01-11,3082.990967,469.750000,74303100.0,17.820000,52551300.0,42735.855469,26327648900,385.820007,68295700.0,76.720001,...,131.544983,834900749,0.770554,1897366203,1.185110,1089929557,87.180000,343600.0,,
2022-01-12,3238.449951,471.019989,67605400.0,17.629999,43270200.0,43949.101562,33499938689,387.350006,54576400.0,76.750000,...,141.737198,881585292,0.799548,1976047994,1.304535,1651269250,85.639999,348400.0,,


In [85]:
# Fills in NaN values for LASSO regression.
# Forward-fill so each gap takes the most recent earlier observation, then drop the
# first row (presumably because its leading NaNs have nothing earlier to fill from).
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df = df.ffill()[1:]

# Fail early, naming the offending columns, if any gap survived the fill; the
# regressions below are meant to be fitted on NaN-free data.
null_counts = df.isnull().sum(axis = 0)
if null_counts.any():
    raise ValueError('NaN values remain after forward fill in columns: %s'
                     % list(null_counts[null_counts > 0].index))

# From here on `df` is a plain 2-D NumPy array (later cells index it positionally):
# column 0 is the ETH-USD open, every remaining column is a feature.
df = df.values

# NOTE(review): y is the open from the SAME row as the features, while later cells
# label a prediction made from the last row as the following day's price — confirm
# whether the target should be shifted by one day.
X, y = df[:, 1:], df[:, 0]

# define model evaluation method via repeated K-Fold Cross Validation
# (10 folds, repeated 3 times, with a fixed seed so the splits are reproducible)
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

Below we try both Ridge and Lasso regressions in order to make our predictions. Both estimators shrink the coefficient estimates towards zero. Ridge does so by penalizing large squared values of the coefficients, i.e. it minimizes the penalized sum of squared residuals:

$$\min\{\sum_{i=1}^n(Y_i - X_i^{'}b)^2 + \lambda_{Ridge}\sum_{j=1}^{k}b_j^2\}$$

Similarly, Lasso shrinks the estimate towards zero by penalizing absolute values of the coefficients as shown below:

$$\min\{\sum_{i=1}^n(Y_i - X_i^{'}b)^2 + \lambda_{Lasso}\sum_{j=1}^{k}|b_j|\}$$

In both cases we use $10$-fold cross-validation, repeated three times, in order to determine the $\lambda_{Ridge}$ and $\lambda_{Lasso}$ which yield the lowest MSPE$(\lambda_{Ridge})$ and MSPE$(\lambda_{Lasso})$ respectively.

It is worth noting that Lasso looks very similar to Ridge, but turns out to have one very special property: Lasso
tends to set many of the $\hat{\beta}$’s exactly to $0$. Ridge shrinks them relative to OLS, but doesn’t shrink
them all the way to zero.

This feature means that Lasso can work especially well when in reality many of the predictors are irrelevant (but we don’t know which ones). This suggests one reason why Lasso does in fact provide better estimates for our goal of estimating ETH-USD's open price.

In [86]:
# define models
# Both estimators pick their penalty strength by cross-validation (using `cv`)
# over the same grid: 0.01, 0.02, ..., up to but excluding 20.
alpha_grid = arange(0.01, 20, 0.01)

# LASSO is fitted on the raw features, whereas Ridge is preceded by a scaler that
# rescales each feature without centring it (with_mean=False).
# NOTE(review): tol = 1 is far looser than scikit-learn's default convergence
# tolerance of 1e-4 — confirm this is intentional rather than a way to silence
# convergence warnings on unscaled inputs.
model = LassoCV(alphas = alpha_grid, cv = cv, n_jobs = -1, tol = 1)
model2 = make_pipeline(StandardScaler(with_mean=False), RidgeCV(alphas = alpha_grid, cv = cv))

# fit models
model.fit(X, y)
model2.fit(X, y)

# summarize chosen configuration
print('alpha: %f' % model.alpha_)
# make_pipeline names each step after its lowercased class name, so the fitted
# RidgeCV estimator (and its selected alpha_) lives under the 'ridgecv' key.
print('ridge alpha: %f' % model2.named_steps['ridgecv'].alpha_)

alpha: 0.070000


In [87]:
# LASSO CV forecast (labelled as the price on 2022-01-13).
# The slice keeps the last row two-dimensional (one sample) and skips column 0,
# which holds the target rather than a feature.
model.predict(df[-1:, 1:])


array([3736.66189362])

In [88]:
# RIDGE CV forecast (labelled as the price on 2022-01-13).
# The slice keeps the last row two-dimensional (one sample) and skips column 0,
# which holds the target rather than a feature.
model2.predict(df[-1:, 1:])

array([3772.4647807])

In [89]:
# Two-step pipeline: standardize the features, then fit a LASSO regression.
steps = [('scaler', StandardScaler()),
         ('lasso', Lasso())]

pipeline = Pipeline(steps)

# Candidate values for the randomized search; keys follow the
# '<step name>__<parameter>' convention so they reach the 'lasso' step.
# Lasso's max_iter must be an integer, so the four candidates are drawn as
# integers from [100, 100000) rather than as uniform floats. A dedicated seeded
# generator keeps the draw reproducible without touching NumPy's global state.
max_iter_candidates = np.random.RandomState(1).randint(100, 100000, 4).tolist()
parameters = {'lasso__alpha': np.arange(0.0001, 10, .0001),
              'lasso__max_iter': max_iter_candidates}

# Randomized hyper-parameter search with 5-fold cross-validation; random_state is
# fixed (matching the seed used for `cv`) so the sampled candidates are repeatable.
reg = rcv(pipeline, parameters, cv=5, random_state=1)