### Dataset Utilities for Algo-Fin Data

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset
import sklearn.datasets as skds
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import math
from IPython import display
from time import sleep
import pickle
import pandas as pd

In [None]:
import import_ipynb
from feeds import DataFeed, BackFeed, USE_COLS_DICT
from utils import MyDS
from l2lutils import KShotLoader

In [None]:
# Columns kept from every CSV. The 25 feature columns come first; the last four
# entries (two targets, 'era', 'day') are appended afterwards. Later cells slice
# the frames by position, so the order of this list matters.
COLS = [
    'row_num',
    'Open_n_val', 'High_n_val', 'Low_n_val', 'Close_n_val', 'Volume_n_val',
    'SMA_10_val', 'SMA_20_val', 'CMO_14_val',
    'High_n-Low_n_val', 'Open_n-Close_n_val', 'SMA_20-SMA_10_val',
    'Close_n_slope_3_val', 'Close_n_slope_5_val', 'Close_n_slope_10_val',
    'Open_n_changelen_val', 'High_n_changelen_val', 'Low_n_changelen_val',
    'Close_n_changelen_val',
    'High_n-Low_n_changelen_val', 'Open_n-Close_n_changelen_val',
    'SMA_20-SMA_10_changelen_val',
    'Close_n_slope_3_changelen_val', 'Close_n_slope_5_changelen_val',
    'Close_n_slope_10_changelen_val',
]
COLS.extend(['target_5_val', 'target_10_val', 'era', 'day'])

#### Load data and set parameters

In [None]:
# NOTE: machine-specific absolute path - change it to the local directory that
# holds the df_syn_*.csv files. Keep the trailing '/': file names are appended
# to this string with plain '+' concatenation.
DATAPATH='/Users/a112956/DataLocal/fin_regression_summer_proj/'

In [None]:
# Available noise settings; one entry is picked as `sid` and its list repr is
# embedded in the CSV file names.
# NOTE(review): each entry is a pair, presumably two noise sigmas - confirm what
# each component controls.
sigmaL=[[0,0],[0,.05],[.01,0],[.01,.05],[.03,0],[.05,.05],[.075,0],[.075,.05]]

In [None]:
# To get started choose zero NOISE
sid=sigmaL[0]

In [None]:
# `sid` is a list, so the f-string inserts its repr: with sid=[0, 0] the files
# read are 'df_syn_train[0, 0].csv' and 'df_syn_test[0, 0].csv'.
# Indexing with [COLS] keeps only those columns, in COLS order.
df_train=pd.read_csv(DATAPATH+f'df_syn_train{sid}.csv')[COLS]
df_test=pd.read_csv(DATAPATH+f'df_syn_test{sid}.csv')[COLS]

In [None]:
# Same naming scheme as the train/test files (list repr of `sid` in the name),
# for the 'train_test' and 'test_test' variants; restricted to COLS as well.
# NOTE(review): in the cells visible here these frames are only inspected for
# their shape - confirm where they are meant to be used.
df_train_test=pd.read_csv(DATAPATH+f'df_syn_train_test{sid}.csv')[COLS]
df_test_test=pd.read_csv(DATAPATH+f'df_syn_test_test{sid}.csv')[COLS]

In [None]:
df_train.shape,df_test.shape,df_train_test.shape,df_test_test.shape

In [None]:
df_train['era'].unique()

In [None]:
# For getting started, use a single era and take both the train and the test
# rows from that same era.
# NOTE(review): this cell used to select era 6 from df_train and then
# immediately overwrite the result with era 6 from df_test, so only the df_test
# selection ever took effect. That effective behaviour is kept here; switch to
# df_train if that was the intent.
df = df_test.loc[df_test['era'] == 6]

# Positional (unshuffled) 80/20 split: the first 80% of the rows go to trainf,
# the remainder to testf. The boundary is computed once so both slices agree.
split_at = int(.8 * df.shape[0])
trainf = df.iloc[:split_at]
testf = df.iloc[split_at:]

In [None]:
# Samples: every column except the last four (the two targets, 'era', 'day').
# Labels: column -3, which is 'target_10_val' given the COLS ordering.
ds_train = MyDS(
    trainf.iloc[:, :-4].values,
    trainf.iloc[:, -3].values,
    task='regression',
)

In [None]:
# Samples: every column except the last four (the two targets, 'era', 'day').
# Labels: column -3, which is 'target_10_val' given the COLS ordering.
ds_test = MyDS(
    testf.iloc[:, :-4].values,
    testf.iloc[:, -3].values,
    task='regression',
)

#### Regression

In [None]:
from xgboost import XGBRegressor, XGBClassifier

# XGBoost regressor used for the regression baseline.
rxf = XGBRegressor(
    max_depth=3,
    learning_rate=1.0,
    n_estimators=500,
    colsample_bytree=0.1,
)

In [None]:
# Fit on NumPy arrays so training uses the same input type as the predict
# cells, which call .numpy() on the samples.
_ = rxf.fit(ds_train.samples.numpy(), ds_train.labels.numpy())

In [None]:
# make predictions
predictions_train=rxf.predict(ds_train.samples.numpy())

In [None]:
# RMSE of the current training predictions against the training labels.
np.sqrt(np.square(predictions_train - ds_train.labels.numpy()).mean())

In [None]:
# make predictions
predictions_test=rxf.predict(ds_test.samples.numpy())

In [None]:
# RMSE of the current test predictions against the test labels.
np.sqrt(np.square(predictions_test - ds_test.labels.numpy()).mean())

In [None]:
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor

# scikit-learn gradient-boosting regressor; random_state is fixed so repeated
# fits give the same model.
rlf = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=3,
    random_state=0,
)

In [None]:
# Fit on NumPy arrays so training uses the same input type as the predict
# cells, which call .numpy() on the samples.
_ = rlf.fit(ds_train.samples.numpy(), ds_train.labels.numpy())

In [None]:
# make predictions
predictions_train=rlf.predict(ds_train.samples.numpy())

In [None]:
# RMSE of the current training predictions against the training labels.
np.sqrt(np.square(predictions_train - ds_train.labels.numpy()).mean())

In [None]:
# make predictions
predictions_test=rlf.predict(ds_test.samples.numpy())

In [None]:
# RMSE of the current test predictions against the test labels.
np.sqrt(np.square(predictions_test - ds_test.labels.numpy()).mean())

#### Classification

In [None]:
# XGBoost classifier used for the classification baseline.
cxf = XGBClassifier(
    max_depth=3,
    learning_rate=0.01,
    n_estimators=500,
    colsample_bytree=0.1,
)

In [None]:
# Targets are the continuous labels binned into integer classes via int(l*4).
# NOTE(review): presumably labels lie in [0, 1], giving classes 0..4 - confirm.
# Samples are passed as a NumPy array to match the .numpy() input used by the
# predict cells.
_ = cxf.fit(ds_train.samples.numpy(), [int(l*4) for l in ds_train.labels])

In [None]:
# make predictions
predictions_train=cxf.predict(ds_train.samples.numpy())

In [None]:
# Training accuracy: share of predictions equal to the binned label int(l*4).
sum(
    int(pred == int(label*4))
    for pred, label in zip(predictions_train, ds_train.labels)
) / len(predictions_train)

In [None]:
# make predictions
predictions_test=cxf.predict(ds_test.samples.numpy())

In [None]:
# Test accuracy: share of predictions equal to the binned label int(l*4).
sum(
    int(pred == int(label*4))
    for pred, label in zip(predictions_test, ds_test.labels)
) / len(predictions_test)

In [None]:
# scikit-learn gradient-boosting classifier; random_state is fixed so repeated
# fits give the same model.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=3,
    random_state=0,
)

In [None]:
# Targets are the continuous labels binned into integer classes via int(l*4).
# Samples are passed as a NumPy array to match the .numpy() input used by the
# predict cells.
_ = clf.fit(ds_train.samples.numpy(), [int(l*4) for l in ds_train.labels])

In [None]:
# make predictions
predictions_train=clf.predict(ds_train.samples.numpy())

In [None]:
# Training accuracy: share of predictions equal to the binned label int(l*4).
sum(
    int(pred == int(label*4))
    for pred, label in zip(predictions_train, ds_train.labels)
) / len(predictions_train)

In [None]:
# make predictions
predictions_test=clf.predict(ds_test.samples.numpy())

In [None]:
# Test accuracy: share of predictions equal to the binned label int(l*4).
sum(
    int(pred == int(label*4))
    for pred, label in zip(predictions_test, ds_test.labels)
) / len(predictions_test)

##### Neural network classifier

In [None]:
import torch
# Replace the continuous labels with integer class ids (int(l*4)) so the
# datasets can be used for classification.
# NOTE: this overwrites ds_train.labels / ds_test.labels in place, so running
# the cell a second time bins the already-binned ids again. Rebuild the
# datasets before re-running it.
ds_train.labels=np.array([int(l*4) for l in ds_train.labels])
ds_test.labels=np.array([int(l*4) for l in ds_test.labels])

In [None]:
dsloader = torch.utils.data.DataLoader(dataset=ds_train,batch_size=32,shuffle=True)

In [None]:
import import_ipynb
import models
# NOTE(review): `dims` presumably lists the layer widths. 25 matches the number
# of feature columns (COLS minus its last four entries); 5 would match the
# classes produced by int(l*4) if labels lie in [0, 1] - confirm.
net = models.MLP(dims=[25,128,64,5])

In [None]:
net,losses,accs=models.Train(net,dsloader,epochs=100,verbose=True)

In [None]:
# train accuracy
# ds_train.samples is already a tensor, so it is copied with clone().detach();
# calling torch.tensor() on a tensor emits a copy-construct UserWarning.
# The labels are a NumPy array at this point, so torch.tensor() is fine there.
models.accuracy(net, ds_train.samples.clone().detach(), torch.tensor(ds_train.labels))

In [None]:
# test accuracy
# ds_test.samples is already a tensor, so it is copied with clone().detach();
# calling torch.tensor() on a tensor emits a copy-construct UserWarning.
# The labels are a NumPy array at this point, so torch.tensor() is fine there.
models.accuracy(net, ds_test.samples.clone().detach(), torch.tensor(ds_test.labels))

In [None]:
# Predicted class id per training sample: argmax over the last dimension of the
# network output, taken for the whole batch at once and converted to a list of
# Python ints. no_grad() skips autograd bookkeeping, which inference does not
# need.
# NOTE(review): assumes net returns one row of scores per sample - confirm.
with torch.no_grad():
    predictions_train = net(ds_train.samples).argmax(dim=-1).tolist()

In [None]:
# Training accuracy: share of predicted class ids equal to the stored labels.
sum(
    int(pred == label)
    for pred, label in zip(predictions_train, ds_train.labels)
) / len(predictions_train)

In [None]:
# Predicted class id per test sample: argmax over the last dimension of the
# network output, taken for the whole batch at once and converted to a list of
# Python ints. no_grad() skips autograd bookkeeping, which inference does not
# need.
# NOTE(review): assumes net returns one row of scores per sample - confirm.
with torch.no_grad():
    predictions_test = net(ds_test.samples).argmax(dim=-1).tolist()

In [None]:
# Test accuracy: share of predicted class ids equal to the stored labels.
sum(
    int(pred == label)
    for pred, label in zip(predictions_test, ds_test.labels)
) / len(predictions_test)

##### RIPPER Rule Learner (the same package also provides IREP) - this section needs to be debugged first

In [None]:
# Rebuild both datasets from the frames so the labels are the raw
# 'target_10_val' values again (column -3); samples are every column except the
# last four.
ds_train = MyDS(
    trainf.iloc[:, :-4].values,
    trainf.iloc[:, -3].values,
    task='regression',
)
ds_test = MyDS(
    testf.iloc[:, :-4].values,
    testf.iloc[:, -3].values,
    task='regression',
)

In [None]:
# Requires the `wittgenstein` package: pip install wittgenstein
import wittgenstein as lw

In [None]:
# RIPPER rule learner with small caps on the rule-set size (see the keyword
# names: rules, conditions per rule, total conditions).
ripper_clf = lw.RIPPER(max_rules=4, max_rule_conds=2, max_total_conds=6)

In [None]:
# np.array([int(l*4) for l in ds_train.labels])

In [None]:
ripper_clf.fit(ds_train.samples.numpy(),np.array([int(l*4) for l in ds_train.labels]),pos_class=1)

In [None]:
ripper_clf.out_model()

In [None]:
# make predictions
predictions_train=ripper_clf.predict(ds_train.samples.numpy())

In [None]:
# predictions_train

In [None]:
def class_accuracy(predictions, y, class_id):
    """Return the one-vs-rest accuracy of boolean predictions for one class.

    Args:
        predictions: Array-like of booleans, one per sample; True means the
            sample was predicted to belong to ``class_id``.
        y: Array-like of class labels, one per sample (a list, NumPy array or
            anything ``np.asarray`` accepts).
        class_id: The label treated as the positive class.

    Returns:
        The fraction of samples, as a float, for which the prediction agrees
        with ``y == class_id``.

    Raises:
        ValueError: If the inputs are empty or differ in shape.
    """
    predicted = np.asarray(predictions, dtype=bool)
    # Converting first makes the comparison element-wise even for plain lists,
    # where `y == class_id` would otherwise collapse to a single bool.
    actual = np.asarray(y) == class_id
    if predicted.shape != actual.shape:
        raise ValueError(
            f"predictions and y differ in shape: {predicted.shape} vs {np.shape(actual)}"
        )
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of empty input")
    return float(np.mean(predicted == actual))

In [None]:
def class_pos_precision(predictions, y, class_id):
    """Return the precision of boolean predictions for one class.

    Precision is the number of true positives divided by the number of
    predicted positives. The previous implementation divided by the total
    number of samples, which under-reports precision whenever some samples are
    predicted negative.

    Args:
        predictions: Array-like of booleans, one per sample; True means the
            sample was predicted to belong to ``class_id``.
        y: Array-like of class labels, one per sample (a list, NumPy array or
            anything ``np.asarray`` accepts).
        class_id: The label treated as the positive class.

    Returns:
        True positives / predicted positives as a float, or 0.0 when nothing
        was predicted positive.

    Raises:
        ValueError: If the inputs differ in shape.
    """
    predicted = np.asarray(predictions, dtype=bool)
    # Converting first makes the comparison element-wise even for plain lists.
    actual = np.asarray(y) == class_id
    if predicted.shape != actual.shape:
        raise ValueError(
            f"predictions and y differ in shape: {predicted.shape} vs {np.shape(actual)}"
        )
    predicted_positives = int(np.count_nonzero(predicted))
    if predicted_positives == 0:
        # No positive predictions: precision is undefined, report 0.0.
        return 0.0
    true_positives = int(np.count_nonzero(predicted & actual))
    return true_positives / predicted_positives

In [None]:
class_accuracy(predictions_train,np.array([int(l*4) for l in ds_train.labels]),1)

In [None]:
class_pos_precision(predictions_train,np.array([int(l*4) for l in ds_train.labels]),1)

In [None]:
# RIPPER needs debugging - it appears to work on a simpler dataset, but not on this one

##### Differentiable rule network - this will need to be extended as part two of the project

In [None]:
from differentiable_rules import DiffRule

In [None]:
dr= DiffRule(25,5,3,3)

In [None]:
net,losses,accs=models.Train(dr,dsloader,epochs=100,verbose=True)

## Format for Table of Results (to be Created) 
Extend this as needed, i.e. different algorithms, different train-test file combinations

<table>
<thead>
<tr><th>Dataset</th><th>XgbR</th><th>XgbC</th><th>GBR</th></tr>
</thead>
<tbody>
<tr><td></td><td>params..</td><td>params..</td><td>params..</td></tr>
    <tr><td>file_train</td><td>RMSE</td><td>Acc</td><td>RMSE</td></tr>
    <tr><td>file_test</td><td>RMSE</td><td>Acc</td><td>RMSE</td></tr>
</tbody>
</table>