In [30]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.neural_network import MLPRegressor 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer

In [31]:
ts = pd.read_csv('spy.csv', index_col = 'Date', parse_dates=True)
ts

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-02-08,45.125000,44.906250,44.968750,44.968750,596100.0,26.147602
1993-02-09,44.812500,44.562500,44.812500,44.656250,122100.0,25.965910
1993-02-10,44.750000,44.531250,44.656250,44.718750,379600.0,26.002239
1993-02-11,45.125000,44.781250,44.781250,44.937500,19500.0,26.129446
1993-02-12,44.875000,44.593750,44.875000,44.593750,42500.0,25.929565
...,...,...,...,...,...,...
2022-06-02,417.440002,407.040009,409.420013,417.390015,79609600.0,417.390015
2022-06-03,414.040009,409.510010,412.399994,410.540009,71830800.0,410.540009
2022-06-06,416.609985,410.549988,414.779999,411.790009,57508900.0,411.790009
2022-06-07,416.220001,407.609985,408.100006,415.739990,59053100.0,415.739990


In [32]:
 ts.drop(
        [
            "High", "Low", "Open",
            "Close", "Volume"
        ],
        axis=1, inplace=True
    )

In [33]:
lookback = 4
lookforward = 1
down = 0
up = 0
ticker = 'SP'
#create the shifted lookback columns
for i in range(0, lookback):
    ts[ticker + "Lookback%s" % str(i+1)] = ts["Adj Close"].shift(i+1)
for i in range(0, lookforward):
    ts[ticker + "Lookforward%s" % str(i+1)] = ts["Adj Close"].shift(-(i+1))
ts.dropna(inplace=True)
#adjust the values to be percentage returns
ts[ticker + "Lookback0"] = ts["Adj Close"].pct_change()*100.0
for i in range(0, lookback):
    ts[ticker + "Lookback%s" % str(i+1)] = ts[
        ticker + "Lookback%s" % str(i+1)
    ].pct_change()*100.0
for i in range(0, lookforward):
    ts[ticker + "Lookforward%s" % str(i+1)] = ts[
        ticker + "Lookforward%s" % str(i+1)
    ].pct_change()*100.0
ts.dropna(inplace=True)

#create the boolean columns of whether direction is up/zero or down.
down_cols = [
    ts[ticker + "Lookforward%s" % str(i+1)] > -down
    for i in range(0, lookforward)
]
up_cols = [
    ts[ticker + "Lookforward%s" % str(i+1)] > up
    for i in range(0, lookforward)
]

#ts["UpDown"] = down_tot & up_tot
ts[ticker + "UpDown"] = np.sign(ts[ticker + "Lookforward1"])
# Convert True/False into 1 and 0
ts[ticker + "UpDown"] = ts[ticker + "UpDown"].astype(int)
ts[ticker + "UpDown"].replace(to_replace=0, value=-1, inplace=True)


In [69]:
ts.drop(
        [
            "Adj Close",
        ],
        axis=1, inplace=True
    )

In [70]:
ts

Unnamed: 0_level_0,SPLookback1,SPLookback2,SPLookback3,SPLookback4,SPLookforward1,SPLookback0,SPUpDown
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1993-02-16,-0.764963,0.489215,0.139911,-0.694871,-0.071916,-2.522803,-1
1993-02-17,-2.522803,-0.764963,0.489215,0.139911,-0.071907,-0.071916,-1
1993-02-18,-0.071916,-2.522803,-0.764963,0.489215,0.359984,-0.071907,1
1993-02-19,-0.071907,-0.071916,-2.522803,-0.764963,0.358685,0.359984,1
1993-02-22,0.359984,-0.071907,-0.071916,-2.522803,-0.071497,0.358685,-1
...,...,...,...,...,...,...,...
2022-06-01,-0.561098,2.454914,1.998138,0.883490,1.904348,-0.808853,1
2022-06-02,-0.808853,-0.561098,2.454914,1.998138,-1.641152,1.904348,-1
2022-06-03,1.904348,-0.808853,-0.561098,2.454914,0.304477,-1.641152,1
2022-06-06,-1.641152,1.904348,-0.808853,-0.561098,0.959222,0.304477,1


In [52]:
full_df = pd.read_csv('bayes_df.csv', parse_dates=True)
full_df.drop(
        [
            "Unnamed: 0", "Volume", "Adj Close",
            "CPIAUCSL", "VIXCLS", 'SP1', 'SP2'
        ],
        axis=1, inplace=True
    )

In [58]:
full_df.rename({'DATE': 'Date', 'VX': 'Adj Close'}, axis=1, inplace=True)
full_df.set_index('Date')

Unnamed: 0_level_0,Int_Rate,CPI,UNRATE,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1993-02-09,3.25000,143.300,7.0,0.019667
1993-02-10,3.25000,143.300,7.0,-0.003709
1993-02-11,3.25000,143.300,7.0,-0.055101
1993-02-12,3.25000,143.300,7.0,-0.024429
1993-02-16,3.25000,143.300,7.0,0.273021
...,...,...,...,...
2022-06-01,0.73621,288.663,3.6,-0.019091
2022-06-02,0.75289,288.663,3.6,-0.037758
2022-06-03,0.76923,288.663,3.6,0.002832
2022-06-06,0.78491,288.663,3.6,0.011295


In [61]:
lookback = 4
lookforward = 1
down = 0
up = 0
ticker = 'VX'
#create the shifted lookback columns
for i in range(0, lookback):
    full_df[ticker + "Lookback%s" % str(i+1)] = full_df["Adj Close"].shift(i+1)
for i in range(0, lookforward):
    full_df[ticker + "Lookforward%s" % str(i+1)] = full_df["Adj Close"].shift(-(i+1))
full_df.dropna(inplace=True)
#adjust the values to be percentage returns
full_df[ticker + "Lookback0"] = full_df["Adj Close"] *100.0
for i in range(0, lookback):
    full_df[ticker + "Lookback%s" % str(i+1)] = full_df[
        ticker + "Lookback%s" % str(i+1)
    ] *100.0
for i in range(0, lookforward):
    full_df[ticker + "Lookforward%s" % str(i+1)] = full_df[
        ticker + "Lookforward%s" % str(i+1)
    ] *100.0
full_df.dropna(inplace=True)

#create the boolean columns of whether direction is up/zero or down.
down_cols = [
    full_df[ticker + "Lookforward%s" % str(i+1)] > -down
    for i in range(0, lookforward)
]
up_cols = [
    full_df[ticker + "Lookforward%s" % str(i+1)] > up
    for i in range(0, lookforward)
]

#ts["UpDown"] = down_tot & up_tot
full_df[ticker + "UpDown"] = np.sign(full_df[ticker + "Lookforward1"])
# Convert True/False into 1 and 0
full_df[ticker + "UpDown"] = full_df[ticker + "UpDown"].astype(int)
full_df[ticker + "UpDown"].replace(to_replace=0, value=-1, inplace=True)


In [63]:
full_df.drop(
        [
            "Adj Close"
        ],
        axis=1, inplace=True
    )

In [79]:
ts1 = ts.reset_index()
ts1['Date'] = pd.to_datetime(ts1['Date'])
ts1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7382 entries, 0 to 7381
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            7382 non-null   datetime64[ns]
 1   SPLookback1     7382 non-null   float64       
 2   SPLookback2     7382 non-null   float64       
 3   SPLookback3     7382 non-null   float64       
 4   SPLookback4     7382 non-null   float64       
 5   SPLookforward1  7382 non-null   float64       
 6   SPLookback0     7382 non-null   float64       
 7   SPUpDown        7382 non-null   int32         
dtypes: datetime64[ns](1), float64(6), int32(1)
memory usage: 432.7 KB


In [80]:
full_df = full_df.reset_index()
full_df['Date'] = pd.to_datetime(full_df['Date'])
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7248 entries, 0 to 7247
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   index           7248 non-null   int64         
 1   Date            7248 non-null   datetime64[ns]
 2   Int_Rate        7248 non-null   float64       
 3   CPI             7248 non-null   float64       
 4   UNRATE          7248 non-null   float64       
 5   VXLookback1     7248 non-null   float64       
 6   VXLookback2     7248 non-null   float64       
 7   VXLookback3     7248 non-null   float64       
 8   VXLookback4     7248 non-null   float64       
 9   VXLookforward1  7248 non-null   float64       
 10  VXLookback0     7248 non-null   float64       
 11  VXUpDown        7248 non-null   int32         
dtypes: datetime64[ns](1), float64(9), int32(1), int64(1)
memory usage: 651.3 KB


In [82]:
dt_df = full_df.merge(ts1, on='Date')
dt_df

Unnamed: 0,index,Date,Int_Rate,CPI,UNRATE,VXLookback1,VXLookback2,VXLookback3,VXLookback4,VXLookforward1,VXLookback0,VXUpDown,SPLookback1,SPLookback2,SPLookback3,SPLookback4,SPLookforward1,SPLookback0,SPUpDown
0,8,1993-02-22,3.19000,143.300,7.0,-3.470437,-2.138365,0.888325,27.302100,2.312925,-2.130493,1,0.359984,-0.071907,-0.071916,-2.522803,-0.071497,0.358685,-1
1,9,1993-02-23,3.19000,143.300,7.0,-2.130493,-3.470437,-2.138365,0.888325,-2.127660,2.312925,-1,0.358685,0.359984,-0.071907,-0.071916,1.287606,-0.071497,1
2,10,1993-02-24,3.19000,143.300,7.0,2.312925,-2.130493,-3.470437,-2.138365,-6.521739,-2.127660,-1,-0.071497,0.358685,0.359984,-0.071907,0.211827,1.287606,1
3,11,1993-02-25,3.19000,143.300,7.0,-2.127660,2.312925,-2.130493,-3.470437,-4.360465,-6.521739,-1,1.287606,-0.071497,0.358685,0.359984,0.140934,0.211827,1
4,12,1993-02-26,3.21000,143.300,7.0,-6.521739,-2.127660,2.312925,-2.130493,3.343465,-4.360465,1,0.211827,1.287606,-0.071497,0.358685,-0.281502,0.140934,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7243,7251,2022-05-27,0.65250,288.663,3.6,-3.066620,-3.667233,3.405899,-3.227999,1.827372,-6.472727,1,1.998138,0.883490,-0.763378,1.871008,-0.561098,2.454914,-1
7244,7252,2022-05-31,0.71920,288.663,3.6,-6.472727,-3.066620,-3.667233,3.405899,-1.909126,1.827372,-1,2.454914,1.998138,0.883490,-0.763378,-0.808853,-0.561098,-1
7245,7253,2022-06-01,0.73621,288.663,3.6,1.827372,-6.472727,-3.066620,-3.667233,-3.775788,-1.909126,-1,-0.561098,2.454914,1.998138,0.883490,1.904348,-0.808853,1
7246,7254,2022-06-02,0.75289,288.663,3.6,-1.909126,1.827372,-6.472727,-3.066620,0.283172,-3.775788,1,-0.808853,-0.561098,2.454914,1.998138,-1.641152,1.904348,-1


In [84]:
dt_df.to_csv('dec_tree_df.csv')

In [65]:
#Up is for the floor threshold. So what number do you want 
#up to be greater than.  For down specify what number you want to be
#the ceiling threshold

def create_up_down_df(
    csv_filepath,
    ticker,
    lookback,
    lookforward,
    up,
    down,
):
    #convert ticker to string
    ticker = str(ticker)
    #read in dataframe
    ts = pd.read_csv(
        csv_filepath,
        names = [
        "Date", "Open", "Low", "High",
        "Close", "Volume", "Adj Close"
        ],
        index_col = 'Date', parse_dates=True
    )
    
#     #slice to desired start and end
#     if start is not None:
#         ts = ts[ts.index >= start]
#     if end is not None:
#         ts = ts[ts.index <= end]
#     ts.drop(
#         [
#             "High", "Low", "Open",
#             "Close", "Volume"
#         ],
#         axis=1, inplace=True
#     )
    
    #create the shifted lookback columns
    for i in range(0, lookback):
        ts[ticker + "Lookback%s" % str(i+1)] = ts["Adj Close"].shift(i+1)
    for i in range(0, lookforward):
        ts[ticker + "Lookforward%s" % str(i+1)] = ts["Adj Close"].shift(-(i+1))
    ts.dropna(inplace=True)
    #adjust the values to be percentage returns
    ts[ticker + "Lookback0"] = ts["Adj Close"].pct_change()*100.0
    for i in range(0, lookback):
        ts[ticker + "Lookback%s" % str(i+1)] = ts[
            ticker + "Lookback%s" % str(i+1)
        ].pct_change()*100.0
    for i in range(0, lookforward):
        ts[ticker + "Lookforward%s" % str(i+1)] = ts[
            ticker + "Lookforward%s" % str(i+1)
        ].pct_change()*100.0
    ts.dropna(inplace=True)
    
    #create the boolean columns of whether direction is up/zero or down.
    down_cols = [
        ts[ticker + "Lookforward%s" % str(i+1)] > -down
        for i in range(0, lookforward)
    ]
    up_cols = [
        ts[ticker + "Lookforward%s" % str(i+1)] > up
        for i in range(0, lookforward)
    ]
    
   #ts["UpDown"] = down_tot & up_tot
    ts[ticker + "UpDown"] = np.sign(ts[ticker + "Lookforward1"])
    # Convert True/False into 1 and 0
    ts[ticker + "UpDown"] = ts[ticker + "UpDown"].astype(int)
    ts[ticker + "UpDown"].replace(to_replace=0, value=-1, inplace=True)
    return ts

In [66]:
create_up_down_df('spy.csv', 'SP', 5, 1, 0, 0)

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
data

In [None]:
X = data[['SP1', 'VX1', 'Int_Rate', 'CPI', 'UNRATE']]
y = data['SP2']

#X['DATE'] = pd.to_datetime(X['DATE'])

In [None]:
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321)

In [None]:
transformed_columns = ['SP1', 'VX1', 'Int_Rate', 'CPI', 'UNRATE']

In [None]:
pipe = Pipeline(
    steps = [
        ('ct', ColumnTransformer(
        transformers = [
            ('power', PowerTransformer(), transformed_columns)
        ],
            remainder = 'passthrough'
        )),
        ('scaler', StandardScaler()),
        ('linear', RidgeCV())
    ]
)

ttr = TransformedTargetRegressor(
    regressor = pipe,
    #func = np.log,
    #inverse_func = np.exp
)

ttr.fit(X_train, y_train)

print(f'R2 Score: {r2_score(y_test, ttr.predict(X_test))}')
print(f'MAE: {mean_absolute_error(y_test, ttr.predict(X_test))}')