### Dataset Utilities for Algo-Fin Data

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset
import sklearn.datasets as skds
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import math
from IPython import display
from time import sleep
import pickle
import pandas as pd
import wittgenstein as lw
from sklearn.decomposition import PCA

In [None]:
import import_ipynb
from feeds import DataFeed, BackFeed, USE_COLS_DICT
from utils import MyDS
import models
#from l2lutils import KShotLoader

In [None]:
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import torch
import torch.nn as nn
from torch.utils.data import random_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Columns selected from every synthetic CSV. The trailing four entries are the
# ones the `iloc[:, 0:-4]` slices in this file exclude from the feature matrix;
# position -3 (`target_10_val`) is the column used as the label.
COLS = [
    "row_num",
    # "*_val" series
    "Open_n_val", "High_n_val", "Low_n_val", "Close_n_val", "Volume_n_val",
    "SMA_10_val", "SMA_20_val", "CMO_14_val",
    "High_n-Low_n_val", "Open_n-Close_n_val", "SMA_20-SMA_10_val",
    "Close_n_slope_3_val", "Close_n_slope_5_val", "Close_n_slope_10_val",
    # "*_changelen_val" series
    "Open_n_changelen_val", "High_n_changelen_val",
    "Low_n_changelen_val", "Close_n_changelen_val",
    "High_n-Low_n_changelen_val", "Open_n-Close_n_changelen_val",
    "SMA_20-SMA_10_changelen_val",
    "Close_n_slope_3_changelen_val", "Close_n_slope_5_changelen_val",
    "Close_n_slope_10_changelen_val",
]
COLS = [*COLS, "target_5_val", "target_10_val", "era", "day"]

# Each pair is interpolated into the CSV file names as `{pair[0]}_{pair[1]}`.
sigmaL = [
    [0, 0], [0, .05],
    [.01, 0], [.01, .05],
    [.03, 0], [.05, .05],
    [.075, 0], [.075, .05],
]

# Directory prefix prepended to every CSV file name.
DATAPATH = "archive/"

In [None]:
DATAPATH='/Users/a112956/DataLocal/fin_regression_summer_proj/'

#### Load data and set parameters

In [None]:
# NOTE: sigmaL[7] is the [.075, .05] setting, not zero noise.
# To get started with zero NOISE, choose sigmaL[0] (the [0, 0] entry) instead.
sid=sigmaL[7]

In [None]:
df_train=pd.read_csv(DATAPATH+f'df_syn_train_{sid[0]}_{sid[1]}_.csv')[COLS]
df_test=pd.read_csv(DATAPATH+f'df_syn_test_{sid[0]}_{sid[1]}_.csv')[COLS]

In [None]:
df_train_test=pd.read_csv(DATAPATH+f'df_syn_train_test_{sid[0]}_{sid[1]}_.csv')[COLS]
df_test_test=pd.read_csv(DATAPATH+f'df_syn_test_test_{sid[0]}_{sid[1]}_.csv')[COLS]

In [None]:
df_train.shape,df_test.shape,df_train_test.shape,df_test_test.shape

In [None]:
eras = df_train['era'].unique()
eras

In [None]:
# for getting started choose just one era and train/test from same era
df=df_train.loc[df_train['era']==9]
# df=df_test.loc[df_test['era']==7]
trainf=df.iloc[0:int(.8*df.shape[0])]
testf=df.iloc[int(.8*df.shape[0]):]

In [None]:
ds_train=MyDS(trainf.iloc[:,0:-4].values,trainf.iloc[:,-3].values,task='regression')

In [None]:
ds_test=MyDS(testf.iloc[:,0:-4].values,testf.iloc[:,-3].values,task='regression')

#### Regression

In [None]:
from xgboost import XGBRegressor, XGBClassifier
# Fit an XGBoost regressor on the training split, then report RMSE on both splits.
rxf = XGBRegressor(
    n_estimators=500,
    max_depth=3,
    learning_rate=1.0,
    colsample_bytree=0.1,
)
_ = rxf.fit(ds_train.samples, ds_train.labels)
predictions_train = rxf.predict(ds_train.samples.numpy())
predictions_test = rxf.predict(ds_test.samples.numpy())
print(f"TrainRMSE = {np.sqrt(np.mean(np.square(predictions_train - ds_train.labels.numpy())))}")
print(f"TestRMSE = {np.sqrt(np.mean(np.square(predictions_test - ds_test.labels.numpy())))}")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
# Fit a gradient-boosting regressor on the training split, then report RMSE on both splits.
rlf = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=3,
    random_state=0,
)
_ = rlf.fit(ds_train.samples, ds_train.labels)
predictions_train = rlf.predict(ds_train.samples.numpy())
predictions_test = rlf.predict(ds_test.samples.numpy())
print(f"TrainRMSE = {np.sqrt(np.mean(np.square(predictions_train - ds_train.labels.numpy())))}")
print(f"TestRMSE = {np.sqrt(np.mean(np.square(predictions_test - ds_test.labels.numpy())))}")

In [None]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
# Fit a default random-forest regressor (no fixed seed) and report RMSE on both splits.
rfr = RandomForestRegressor()
_ = rfr.fit(ds_train.samples, ds_train.labels)
predictions_train = rfr.predict(ds_train.samples.numpy())
predictions_test = rfr.predict(ds_test.samples.numpy())
print(f"TrainRMSE = {np.sqrt(np.mean(np.square(predictions_train - ds_train.labels.numpy())))}")
print(f"TestRMSE = {np.sqrt(np.mean(np.square(predictions_test - ds_test.labels.numpy())))}")

##### Neural network regressor

In [None]:
import torch
import torch.nn as nn
import import_ipynb
import models
dsloader = torch.utils.data.DataLoader(dataset=ds_train,batch_size=32, shuffle=True)
net = models.MLP(dims=[25, 64, 32, 16], lr = 0.05, task = "regression")

In [None]:
ds_train.samples.shape

In [None]:
net = models.MLP(dims=[25, 128, 64, 32, 5], lr = 0.001, task = "classification")

In [None]:
net,losses,accs=models.Train(net,dsloader,epochs=1000, verbose=True)

In [None]:
print(f"TrainRMSE = {torch.sqrt(torch.mean((net(torch.tensor(ds_train.samples))-torch.tensor(ds_train.labels))**2))}")
print(f"TestRMSE = {torch.sqrt(torch.mean((net(torch.tensor(ds_test.samples))-torch.tensor(ds_test.labels))**2))}")

#### Classification

In [None]:
present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()
present_classes

In [None]:
# XGBoost classifier. Every label is re-encoded as its position in
# `present_classes`, both for fitting and for scoring.
cxf = XGBClassifier(
    n_estimators=500,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.1,
)
_ = cxf.fit(ds_train.samples, [int((present_classes == l).nonzero().item()) for l in ds_train.labels])
predictions_train = cxf.predict(ds_train.samples.numpy())
print(f"Train acc = {sum(int(p == int((present_classes == l).nonzero().item())) for p, l in zip(predictions_train, ds_train.labels)) / len(predictions_train)}")
predictions_test = cxf.predict(ds_test.samples.numpy())
print(f"Test acc = {sum(int(p == int((present_classes == l).nonzero().item())) for p, l in zip(predictions_test, ds_test.labels)) / len(predictions_test)}")

In [None]:
# Gradient-boosting classifier. Every label is re-encoded as its position in
# `present_classes`, both for fitting and for scoring.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1,
    max_depth=3,
    random_state=0,
)
_ = clf.fit(ds_train.samples, [int((present_classes == l).nonzero().item()) for l in ds_train.labels])
predictions_train = clf.predict(ds_train.samples.numpy())
print(f"Train acc = {sum(int(p == int((present_classes == l).nonzero().item())) for p, l in zip(predictions_train, ds_train.labels)) / len(predictions_train)}")
predictions_test = clf.predict(ds_test.samples.numpy())
print(f"Test acc = {sum(int(p == int((present_classes == l).nonzero().item())) for p, l in zip(predictions_test, ds_test.labels)) / len(predictions_test)}")

In [None]:
# Random-forest classifier on labels re-encoded as positions in `present_classes`.
# NOTE: RandomForestClassifier has no `learning_rate` argument (that is a
# boosting hyper-parameter); passing it raises TypeError, so it is omitted.
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
_=rfc.fit(ds_train.samples,[int((present_classes==l).nonzero().item()) for l in ds_train.labels])
predictions_train=rfc.predict(ds_train.samples.numpy())
print(f"Train acc = {sum([int(p==l) for p,l in zip(predictions_train,[int((present_classes==l).nonzero().item()) for l in ds_train.labels])])/len(predictions_train)}")
predictions_test=rfc.predict(ds_test.samples.numpy())
print(f"Test acc = {sum([int(p==l) for p,l in zip(predictions_test,[int((present_classes==l).nonzero().item())for l in ds_test.labels])])/len(predictions_test)}")

In [None]:
net, train_acc, test_acc = train_mlp(ds_train, ds_test, verbose = False)

In [None]:
test_acc

##### Neural network classifier

In [None]:
import torch
ds_train.labels=np.array([int((present_classes==l).nonzero().item()) for l in ds_train.labels])
ds_test.labels=np.array([int((present_classes==l).nonzero().item()) for l in ds_test.labels])
dsloader = torch.utils.data.DataLoader(dataset=ds_train,batch_size=32,shuffle=True)

In [None]:
ds_train.labels

In [None]:
import import_ipynb
import models
net = models.MLP(dims=[25, 64, 32, 5], lr = 0.001)
net,losses,accs=models.Train(net,dsloader,epochs=1000,verbose=True)

In [None]:
# Accuracy of the trained network on the training split.
print(f"Train Acc = {models.accuracy(net,torch.tensor(ds_train.samples),torch.tensor(ds_train.labels))}")
# Accuracy on the held-out split (previously mislabelled "Train Acc").
print(f"Test Acc = {models.accuracy(net,torch.tensor(ds_test.samples),torch.tensor(ds_test.labels))}")

##### RIPPER Rule Learner (there is also IREP in the same package) — this needs to be debugged first

In [None]:
ds_train=MyDS(trainf.iloc[:,0:-4].values,trainf.iloc[:,-3].values,task='regression')
ds_test=MyDS(testf.iloc[:,0:-4].values,testf.iloc[:,-3].values,task='regression')

In [None]:
# You will need to install this via `pip install wittgenstein`
import wittgenstein as lw

In [None]:
ripper_clf = lw.RIPPER(max_rules=4,
        max_rule_conds=2,
        max_total_conds=6)

In [None]:
# np.array([int(l*4) for l in ds_train.labels])

In [None]:
ripper_clf.fit(ds_train.samples.numpy(),np.array([int((present_classes==l).nonzero().item()) for l in ds_train.labels]),pos_class=1)

In [None]:
ripper_clf.out_model()

In [None]:
# make predictions
predictions_train=ripper_clf.predict(ds_train.samples.numpy())

In [None]:
# predictions_train

In [None]:
def class_accuracy(predictions,y,class_id):
    """Return the fraction of samples where the prediction matches ``y == class_id``.

    ``predictions`` is an iterable of boolean-like values and ``y`` must
    support element-wise ``== class_id`` (e.g. a NumPy array). Raises
    ZeroDivisionError when there are no samples.
    """
    is_class = y == class_id
    hits = 0
    total = 0
    for predicted, actual in zip(predictions, is_class):
        total += 1
        if predicted == actual:
            hits += 1
    return hits / total

In [None]:
def class_pos_precision(predictions,y,class_id):
    """Return the precision of positive predictions for ``class_id``.

    Precision is (true positives) / (predicted positives). The earlier
    version divided by the total number of samples, which understates
    precision whenever some samples are predicted negative.

    Args:
        predictions: iterable of boolean-like predictions (truthy == positive).
        y: array-like of class ids, one per prediction.
        class_id: the class treated as positive.

    Returns:
        A float in [0, 1]; 0.0 when nothing was predicted positive.
    """
    is_class = np.asarray(y) == class_id
    true_pos = 0
    pred_pos = 0
    for predicted, actual in zip(predictions, is_class):
        # `== True` keeps the original rule: only values equal to True count.
        if predicted == True:  # noqa: E712
            pred_pos += 1
            if actual:
                true_pos += 1
    if pred_pos == 0:
        return 0.0
    return true_pos / pred_pos

In [None]:
class_accuracy(predictions_train,np.array([int((present_classes==l).nonzero().item()) for l in ds_train.labels]),1)

In [None]:
class_pos_precision(predictions_train,np.array([int((present_classes==l).nonzero().item()) for l in ds_train.labels]),1)

In [None]:
# RIPPER needs debugging: it appears to work on a simpler dataset, but not on this one

##### Differentiable rule network - this will need to be extended as part two of the project

In [None]:
from differentiable_rules import DiffRule

In [None]:
dr= DiffRule(25,5,3,3)

In [None]:
net,losses,accs=models.Train(dr,dsloader,epochs=100,verbose=True)

## Helper functions

In [None]:
def sample(chunk, rate=0.8):
    """Draw ``max(int(len(chunk) * rate), 1)`` rows from ``chunk``.

    Sampling is done with replacement and a fixed ``random_state=1``, so the
    result is reproducible and may contain repeated rows.
    """
    n = max(int(len(chunk)*rate), 1)
    return chunk.sample(n=n, replace=True, random_state=1)


def StratifiedSampler(data, train_size=0.8):
    """Split ``data`` into train/test frames, stratified on ``target_10_val``.

    Each ``target_10_val`` group is sampled with ``sample`` at rate
    ``train_size``; the test frame holds the rows of ``data`` whose index
    label was not drawn into the train frame.

    Fixes over the previous version: ``train_size`` is now honoured, and the
    test frame is the true complement (the old left merge returned every row
    of ``data``). Groups are iterated explicitly so the grouping column is
    always kept in the result.

    Args:
        data: DataFrame with a ``target_10_val`` column; index labels are
            assumed to be unique.
        train_size: fraction of every group to draw for training.

    Returns:
        ``(traindf, testdf)``.
    """
    if len(data) == 0:
        return data.copy(), data.copy()
    drawn = [sample(chunk, rate=train_size)
             for _, chunk in data.groupby('target_10_val')]
    traindf = pd.concat(drawn)
    testdf = data.loc[~data.index.isin(traindf.index)]
    return traindf, testdf

In [None]:
def train_xgbr(ds_train, ds_test, verbose = True):
    """Fit an XGBRegressor on ``ds_train`` and score it on both datasets.

    Both datasets must expose ``samples`` and ``labels`` with a ``.numpy()``
    method. Returns ``(model, train_rmse, test_rmse)``; the two scores are
    root-mean-squared errors (lower is better).
    """
    model = XGBRegressor(n_estimators=500, learning_rate=1.0, colsample_bytree=0.1)
    model.fit(ds_train.samples, ds_train.labels)

    def rmse(ds):
        residual = model.predict(ds.samples.numpy()) - ds.labels.numpy()
        return np.sqrt(np.mean(residual**2))

    train_rmse = rmse(ds_train)
    test_rmse = rmse(ds_test)
    if verbose:
        print("XGBR")
        print(f"TrainRMSE = {train_rmse}")
        print(f"TestRMSE = {test_rmse}")
    return model, train_rmse, test_rmse

In [None]:
def train_gbr(ds_train, ds_test, verbose = True):
    """Fit a GradientBoostingRegressor on ``ds_train`` and score it on both datasets.

    Both datasets must expose ``samples`` and ``labels`` with a ``.numpy()``
    method. Returns ``(model, train_rmse, test_rmse)``; the two scores are
    root-mean-squared errors (lower is better).
    """
    model = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=1.0,
        random_state=0,
    )
    model.fit(ds_train.samples, ds_train.labels)

    def rmse(ds):
        residual = model.predict(ds.samples.numpy()) - ds.labels.numpy()
        return np.sqrt(np.mean(residual**2))

    train_rmse = rmse(ds_train)
    test_rmse = rmse(ds_test)
    if verbose:
        print("GBR")
        print(f"TrainRMSE = {train_rmse}")
        print(f"TestRMSE = {test_rmse}")
    return model, train_rmse, test_rmse

In [None]:
def train_rfr(ds_train, ds_test, verbose = True):
    """Fit a default RandomForestRegressor on ``ds_train`` and score it on both datasets.

    No random seed is set, so results vary between runs. Both datasets must
    expose ``samples`` and ``labels`` with a ``.numpy()`` method. Returns
    ``(model, train_rmse, test_rmse)``; the scores are root-mean-squared errors.
    """
    model = RandomForestRegressor()
    model.fit(ds_train.samples, ds_train.labels)

    def rmse(ds):
        residual = model.predict(ds.samples.numpy()) - ds.labels.numpy()
        return np.sqrt(np.mean(residual**2))

    train_rmse = rmse(ds_train)
    test_rmse = rmse(ds_test)
    if verbose:
        print("RFR")
        print(f"TrainRMSE = {train_rmse}")
        print(f"TestRMSE = {test_rmse}")
    return model, train_rmse, test_rmse

In [None]:
def train_xgbc(ds_train, ds_test, verbose = True):
    """Fit an XGBClassifier on ``ds_train`` and return ``(model, train_acc, test_acc)``.

    Labels (torch tensors) are re-encoded as their position in the sorted set
    of distinct values found across both datasets; accuracy is the fraction
    of predictions equal to the encoded label.
    """
    present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()

    def encode(labels):
        return [int((present_classes==l).nonzero().item()) for l in labels]

    y_train = encode(ds_train.labels)
    y_test = encode(ds_test.labels)

    cxf = XGBClassifier(n_estimators=500, learning_rate=0.01, colsample_bytree=0.1)
    cxf.fit(ds_train.samples, y_train)
    predictions_train = cxf.predict(ds_train.samples.numpy())
    predictions_test = cxf.predict(ds_test.samples.numpy())

    train_acc = sum(int(p==l) for p, l in zip(predictions_train, y_train))/len(predictions_train)
    test_acc = sum(int(p==l) for p, l in zip(predictions_test, y_test))/len(predictions_test)
    if verbose:
        print("XGBC")
        print(f"Train acc = {train_acc}")
        print(f"Test acc = {test_acc}")
    return cxf, train_acc, test_acc

In [None]:
def train_gbc(ds_train, ds_test, verbose = True):
    """Fit a GradientBoostingClassifier on ``ds_train`` and return ``(model, train_acc, test_acc)``.

    Labels (torch tensors) are re-encoded as their position in the sorted set
    of distinct values found across both datasets; accuracy is the fraction
    of predictions equal to the encoded label.
    """
    present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()

    def encode(labels):
        return [int((present_classes==l).nonzero().item()) for l in labels]

    y_train = encode(ds_train.labels)
    y_test = encode(ds_test.labels)

    clf = GradientBoostingClassifier(n_estimators=500, max_depth= None, random_state=0)
    clf.fit(ds_train.samples, y_train)
    predictions_train = clf.predict(ds_train.samples.numpy())
    predictions_test = clf.predict(ds_test.samples.numpy())

    train_acc = sum(int(p==l) for p, l in zip(predictions_train, y_train))/len(predictions_train)
    test_acc = sum(int(p==l) for p, l in zip(predictions_test, y_test))/len(predictions_test)
    if verbose:
        print("GBC")
        print(f"Train acc = {train_acc}")
        print(f"Test acc = {test_acc}")
    return clf, train_acc, test_acc

In [None]:
def train_rfc(ds_train, ds_test, verbose = True):
    """Fit a RandomForestClassifier on ``ds_train`` and return ``(model, train_acc, test_acc)``.

    Labels (torch tensors) are re-encoded as their position in the sorted set
    of distinct values found across both datasets; accuracy is the fraction
    of predictions equal to the encoded label.
    """
    present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()

    def encode(labels):
        return [int((present_classes==l).nonzero().item()) for l in labels]

    y_train = encode(ds_train.labels)
    y_test = encode(ds_test.labels)

    rfc = RandomForestClassifier(n_estimators=300, max_depth= None, random_state=0)
    rfc.fit(ds_train.samples, y_train)
    predictions_train = rfc.predict(ds_train.samples.numpy())
    predictions_test = rfc.predict(ds_test.samples.numpy())

    train_acc = sum(int(p==l) for p, l in zip(predictions_train, y_train))/len(predictions_train)
    test_acc = sum(int(p==l) for p, l in zip(predictions_test, y_test))/len(predictions_test)
    if verbose:
        print("RFC")
        print(f"Train acc = {train_acc}")
        print(f"Test acc = {test_acc}")
    return rfc, train_acc, test_acc

In [None]:
def train_ripper(ds_train, ds_test, verbose = True, max_rules = 15, total_conds =20):
    """Fit a wittgenstein RIPPER rule learner and score it on both datasets.

    Labels (torch tensors) are re-encoded as their position in the sorted set
    of distinct values found across both datasets.

    Args:
        ds_train, ds_test: datasets exposing ``samples`` and ``labels``.
        verbose: print the accuracies when true.
        max_rules: forwarded to ``RIPPER(max_rules=...)``.
        total_conds: forwarded to ``RIPPER(max_total_conds=...)``. This
            argument used to be accepted but silently ignored.

    Returns:
        ``(ripper_clf.out_model(), train_acc, test_acc)``.
        NOTE(review): confirm what ``out_model()`` returns; if it only prints
        the rule set, the first element is ``None`` and the rules are printed
        even when ``verbose`` is false.
    """
    present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()

    def encode(labels):
        return [int((present_classes==l).nonzero().item()) for l in labels]

    y_train = encode(ds_train.labels)
    y_test = encode(ds_test.labels)

    ripper_clf = lw.RIPPER(max_rules=max_rules, max_total_conds=total_conds)
    _=ripper_clf.fit(ds_train.samples, y_train)
    predictions_train=ripper_clf.predict(ds_train.samples.numpy())
    predictions_test=ripper_clf.predict(ds_test.samples.numpy())
    train_acc = sum(int(p==l) for p, l in zip(predictions_train, y_train))/len(predictions_train)
    test_acc = sum(int(p==l) for p, l in zip(predictions_test, y_test))/len(predictions_test)
    if verbose:
        print("Ripper")
        print(f"Train acc = {train_acc}")
        print(f"Test acc = {test_acc}")
    return ripper_clf.out_model(), train_acc, test_acc

In [None]:
import import_ipynb
import models
def train_mlp(ds_train, ds_test, verbose = True, dims=[25, 128, 64, 32, 5], lr = 0.001, epochs = 1000):
    """Train a ``models.MLP`` classifier and return ``(net, train_acc, test_acc)``.

    Side effect: ``ds_train.labels`` and ``ds_test.labels`` are overwritten in
    place with NumPy arrays of class indices (positions in the sorted set of
    distinct labels across both datasets). The incoming labels must be torch
    tensors, so other trainers that need tensor labels have to run first.

    ``verbose`` only controls the final accuracy printout; ``models.Train`` is
    always called with ``verbose=True``.
    """
    present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()
    for ds in (ds_train, ds_test):
        ds.labels = np.array([int((present_classes==l).nonzero().item()) for l in ds.labels])

    loader = torch.utils.data.DataLoader(dataset=ds_train, batch_size=32, shuffle=True)
    net, losses, accs = models.Train(models.MLP(dims=dims, lr=lr), loader, epochs=epochs, verbose=True)

    def score(ds):
        return models.accuracy(net, torch.tensor(ds.samples), torch.tensor(ds.labels), verbose = False)

    train_acc = score(ds_train)
    test_acc = score(ds_test)
    if verbose:
        print("MLPC")
        print(f"Train acc = {train_acc}")
        print(f"Test acc = {test_acc}")
    return net, train_acc, test_acc



## df_syn_train_x_x files

In [None]:
##training and testing on the same set of eras
def train_function(sid, table):
    """Append the result cells for noise setting ``sid`` to the HTML ``table`` string.

    Loads ``df_syn_train_{sid[0]}_{sid[1]}_.csv`` from DATAPATH, splits every
    era 80/20 by row order, pools the per-era pieces, and trains each model on
    the pooled train part. Sixteen ``<td>`` cells are appended (train/test for
    eight models); the XgbC pair is a ``0``/``0`` placeholder because that
    model is disabled here.

    Changes from the previous version: the unused ``df_syn_test`` CSV is no
    longer read, and the per-era pieces are concatenated once rather than
    inside the loop.

    Args:
        sid: pair of values identifying the CSV file.
        table: HTML accumulated so far.

    Returns:
        ``table`` with this row's cells appended.
    """
    df_train = pd.read_csv(DATAPATH+f'df_syn_train_{sid[0]}_{sid[1]}_.csv')[COLS]

    train_parts, test_parts = [], []
    for e in df_train['era'].unique():
        df = df_train.loc[df_train['era'] == e]
        cut = int(0.8*df.shape[0])
        train_parts.append(df.iloc[0:cut])
        test_parts.append(df.iloc[cut:])
    trainf = pd.concat(train_parts) if train_parts else pd.DataFrame()
    testf = pd.concat(test_parts) if test_parts else pd.DataFrame()

    # Features are all but the last four columns; the label is column -3.
    ds_train = MyDS(trainf.iloc[:,0:-4].values, trainf.iloc[:,-3].values, task='regression')
    ds_test = MyDS(testf.iloc[:,0:-4].values, testf.iloc[:,-3].values, task='regression')
    for trainer in (train_xgbr, train_gbr, train_rfr):
        _, train_acc, test_acc = trainer(ds_train, ds_test, verbose = False)
        table += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"

    ds_train = MyDS(trainf.iloc[:,0:-4].values, trainf.iloc[:,-3].values, task='classification')
    ds_test = MyDS(testf.iloc[:,0:-4].values, testf.iloc[:,-3].values, task='classification')

    # XgbC is disabled; the placeholder keeps the cells aligned with the header.
    table += f"<td>0</td><td>0</td>"

    # train_mlp must stay last: it replaces the dataset labels in place.
    for trainer in (train_gbc, train_rfc, train_ripper, train_mlp):
        _, train_acc, test_acc = trainer(ds_train, ds_test, verbose = False)
        table += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"

    return table
from tqdm import tqdm
# Build the HTML report for the all-eras experiment: caption and two header
# rows first, then one <tr> per noise setting filled in by train_function.
table = (
    '<table>\n'
    '<thead><caption>Training on data from all eras with a particular noise (df_syn_train_x_x)</caption>\n'
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>'
    '<th colspan="2">XgbR</th><th colspan="2">GBR</th><th colspan="2">RFR</th><th colspan="2">XgbC</th><th colspan="2">GBC</th><th colspan="2">RFC</th><th colspan="2">Ripper</th><th colspan="2">MLP C</th></tr>\n'
    '</thead>\n<tbody>\n'
    '<tr><td></td><td></td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td></tr>'
)
for sid in tqdm(sigmaL):
    table = train_function(sid, table + f"<tr><td>df_syn_train_{sid}</td><td>df_syn_test_{sid}</td>") + "</tr>\n"

table = table + "</tbody></table>\n"
print(table)

<table>
<thead><caption>Training on data from all eras with a particular noise (df_syn_train_x_x)</caption>
<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th><th colspan="2">XgbR</th><th colspan="2">GBR</th><th colspan="2">RFR</th><th colspan="2">GBC</th><th colspan="2">RFC</th><th colspan="2">Ripper</th><th colspan="2">MLP C</th></tr>
</thead>
<tbody>
<tr><td></td><td></td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td></tr><tr><td>df_syn_train_[0, 0]</td><td>df_syn_test_[0, 0]</td><td>0.053</td><td>0.059</td><td>0.034</td><td>0.059</td><td>0.019</td><td>0.027</td><td>0.999</td><td>0.997</td><td>0.999</td><td>0.998</td><td>0.965</td><td>0.958</td><td>0.997</td><td>0.994</td></tr>
<tr><td>df_syn_train_[0, 0.05]</td><td>df_syn_test_[0, 0.05]</td><td>0.080</td><td>0.125</td><td>0.054</td><td>0.116</td><td>0.035</td><td>0.100</td><td>0.996</td><td>0.955</td><td>0.996</td><td>0.961</td><td>0.944</td><td>0.923</td><td>0.989</td><td>0.954</td></tr>
<tr><td>df_syn_train_[0.01, 0]</td><td>df_syn_test_[0.01, 0]</td><td>0.182</td><td>0.296</td><td>0.142</td><td>0.285</td><td>0.093</td><td>0.248</td><td>1.000</td><td>0.853</td><td>1.000</td><td>0.899</td><td>0.857</td><td>0.868</td><td>0.991</td><td>0.865</td></tr>
<tr><td>df_syn_train_[0.01, 0.05]</td><td>df_syn_test_[0.01, 0.05]</td><td>0.168</td><td>0.268</td><td>0.131</td><td>0.270</td><td>0.083</td><td>0.224</td><td>1.000</td><td>0.872</td><td>1.000</td><td>0.924</td><td>0.883</td><td>0.897</td><td>0.992</td><td>0.874</td></tr>
<tr><td>df_syn_train_[0.03, 0]</td><td>df_syn_test_[0.03, 0]</td><td>0.212</td><td>0.342</td><td>0.166</td><td>0.342</td><td>0.110</td><td>0.303</td><td>1.000</td><td>0.787</td><td>1.000</td><td>0.842</td><td>0.828</td><td>0.808</td><td>0.954</td><td>0.785</td></tr>
<tr><td>df_syn_train_[0.05, 0.05]</td><td>df_syn_test_[0.05, 0.05]</td><td>0.225</td><td>0.358</td><td>0.174</td><td>0.352</td><td>0.116</td><td>0.312</td><td>1.000</td><td>0.747</td><td>1.000</td><td>0.830</td><td>0.813</td><td>0.799</td><td>0.952</td><td>0.768</td></tr>
<tr><td>df_syn_train_[0.075, 0]</td><td>df_syn_test_[0.075, 0]</td><td>0.229</td><td>0.368</td><td>0.178</td><td>0.373</td><td>0.117</td><td>0.330</td><td>1.000</td><td>0.712</td><td>1.000</td><td>0.789</td><td>0.796</td><td>0.793</td><td>0.937</td><td>0.752</td></tr>
<tr><td>df_syn_train_[0.075, 0.05]</td><td>df_syn_test_[0.075, 0.05]</td><td>0.228</td><td>0.352</td><td>0.176</td><td>0.359</td><td>0.117</td><td>0.312</td><td>1.000</td><td>0.744</td><td>1.000</td><td>0.828</td><td>0.792</td><td>0.786</td><td>0.938</td><td>0.779</td></tr>
</tbody></table>

In [None]:
##training on a set of eras but testing on different set
def train_function_diff_eras(sid, table):
    """Append result cells for training on one CSV and testing on another.

    Trains on ``df_syn_train_{sid[0]}_{sid[1]}_.csv`` and evaluates on
    ``df_syn_train_test_{sid[0]}_{sid[1]}_.csv`` (both under DATAPATH).
    Sixteen ``<td>`` cells are appended: train/test for three regressors and
    five classifiers, all formatted to three decimals.

    Changes from the previous version: the GBR test cell now uses ``.3f``
    like every other cell (it used ``.2f``), and the dead ``df = ...``
    assignments are gone.

    Args:
        sid: pair of values identifying the CSV files.
        table: HTML accumulated so far.

    Returns:
        ``table`` with this row's cells appended.
    """
    trainf = pd.read_csv(DATAPATH+f'df_syn_train_{sid[0]}_{sid[1]}_.csv')[COLS]
    testf = pd.read_csv(DATAPATH+f'df_syn_train_test_{sid[0]}_{sid[1]}_.csv')[COLS]

    # Features are all but the last four columns; the label is column -3.
    ds_train = MyDS(trainf.iloc[:,0:-4].values, trainf.iloc[:,-3].values, task='regression')
    ds_test = MyDS(testf.iloc[:,0:-4].values, testf.iloc[:,-3].values, task='regression')
    for trainer in (train_xgbr, train_gbr, train_rfr):
        _, train_acc, test_acc = trainer(ds_train, ds_test, verbose = False)
        table += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"

    ds_train = MyDS(trainf.iloc[:,0:-4].values, trainf.iloc[:,-3].values, task='classification')
    ds_test = MyDS(testf.iloc[:,0:-4].values, testf.iloc[:,-3].values, task='classification')
    # train_mlp must stay last: it replaces the dataset labels in place.
    for trainer in (train_xgbc, train_gbc, train_rfc, train_ripper, train_mlp):
        _, train_acc, test_acc = trainer(ds_train, ds_test, verbose = False)
        table += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"

    return table
    
# Build the HTML report for the train-on-one-file / test-on-another experiment:
# caption and two header rows, then one <tr> per noise setting.
table = (
    '<table>\n'
    '<thead><caption>Training on data from a set eras but testing on different eras (train:df_syn_train_x_x, test:df_syn_train_test_x_x)</caption>\n'
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>'
    '<th colspan="2">XgbR</th><th colspan="2">GBR</th><th colspan="2">RFR</th><th colspan="2">XgbC</th><th colspan="2">GBC</th><th colspan="2">RFC</th><th colspan="2">Ripper</th><th colspan="2">MLP C</th></tr>\n'
    '</thead>\n<tbody>\n'
    '<tr><td></td><td></td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td></tr>'
)
for sid in tqdm(sigmaL):
    table = train_function_diff_eras(sid, table + f"<tr><td>df_syn_train_{sid}</td><td>df_syn_train_test_{sid}</td>") + "</tr>\n"

table = table + "</tbody></table>\n"
print(table)

In [None]:
##training and testing on the same era
def train_function_eras(sid, table):
    """Append per-era averaged result cells for noise setting ``sid``.

    For every era in ``df_syn_train_{sid[0]}_{sid[1]}_.csv`` the rows are
    split 80/20 by order, each model is trained and tested inside that era,
    and the scores are averaged over the eras that were not skipped. An era
    is skipped when its labels take a single distinct value. Sixteen ``<td>``
    cells are appended (two decimals); the XgbC pair is a ``0``/``0``
    placeholder because that model is disabled here.

    Changes from the previous version: a model with no recorded scores yields
    ``nan`` instead of raising ZeroDivisionError, the unused ``df_syn_test``
    CSV is no longer read, and the unused XgbC accumulators are gone.

    Args:
        sid: pair of values identifying the CSV file.
        table: HTML accumulated so far.

    Returns:
        ``table`` with this row's cells appended.
    """
    def _mean(values):
        # Average of a list of scores; nan when every era was skipped.
        return sum(values)/len(values) if values else float('nan')

    def _cells(trainer):
        train_scores, test_scores = scores[trainer]
        return f"<td>{_mean(train_scores):.2f}</td><td>{_mean(test_scores):.2f}</td>"

    df_train = pd.read_csv(DATAPATH+f'df_syn_train_{sid[0]}_{sid[1]}_.csv')[COLS]

    regressors = (train_xgbr, train_gbr, train_rfr)
    # (trainer, extra keyword arguments); train_mlp stays last because it
    # replaces the dataset labels in place.
    classifiers = (
        (train_gbc, {}),
        (train_rfc, {}),
        (train_ripper, {}),
        (train_mlp, {'epochs': 500}),
    )
    scores = {trainer: ([], []) for trainer in regressors}
    scores.update({trainer: ([], []) for trainer, _ in classifiers})

    for e in df_train['era'].unique():
        df = df_train.loc[df_train['era'] == e]
        cut = int(.8*df.shape[0])
        trainf, testf = df.iloc[0:cut], df.iloc[cut:]

        ds_train = MyDS(trainf.iloc[:,0:-4].values, trainf.iloc[:,-3].values, task='regression')
        ds_test = MyDS(testf.iloc[:,0:-4].values, testf.iloc[:,-3].values, task='regression')
        present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()
        if len(present_classes)==1:
            continue

        for trainer in regressors:
            _, train_acc, test_acc = trainer(ds_train, ds_test, verbose = False)
            scores[trainer][0].append(train_acc)
            scores[trainer][1].append(test_acc)

        ds_train = MyDS(trainf.iloc[:,0:-4].values, trainf.iloc[:,-3].values, task='classification')
        ds_test = MyDS(testf.iloc[:,0:-4].values, testf.iloc[:,-3].values, task='classification')
        present_classes = torch.cat((ds_train.labels, ds_test.labels)).unique()
        if len(present_classes)==1:
            continue

        for trainer, extra in classifiers:
            _, train_acc, test_acc = trainer(ds_train, ds_test, verbose = False, **extra)
            scores[trainer][0].append(train_acc)
            scores[trainer][1].append(test_acc)

    for trainer in regressors:
        table += _cells(trainer)

    # XgbC is disabled; the placeholder keeps the cells aligned with the header.
    table += f"<td>0</td><td>0</td>"

    for trainer, _ in classifiers:
        table += _cells(trainer)

    return table

# Build the HTML report for the per-era experiment. The caption used to be a
# copy of the all-eras caption; train_function_eras trains and tests inside
# each era and averages the scores, so the caption now says that.
# NOTE(review): the "Test Dataset" cell still reads df_syn_test_{sid} although
# train_function_eras evaluates on the held-out 20% of df_syn_train — confirm
# the intended label.
table = "<table>\n"
table += "<thead><caption>Training and testing within each era, averaged over eras (df_syn_train_x_x)</caption>\n"
table += "<tr><th colspan=\"1\">Train Dataset</th><th colspan=\"1\">Test Dataset</th>"
table += "<th colspan=\"2\">XgbR</th><th colspan=\"2\">GBR</th><th colspan=\"2\">RFR</th><th colspan=\"2\">XgbC</th><th colspan=\"2\">GBC</th><th colspan=\"2\">RFC</th><th colspan=\"2\">Ripper</th><th colspan=\"2\">MLP C</th></tr>\n"
table += "</thead>\n<tbody>\n"
table += "<tr><td></td><td></td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td><td>Train</td><td>Test</td></tr>"
for sid in tqdm(sigmaL):
    table += f"<tr><td>df_syn_train_{sid}</td><td>df_syn_test_{sid}</td>"
    table = train_function_eras(sid, table)
    table += "</tr>\n"

table += "</tbody></table>\n"
print(table)

In [None]:
def getCombinations(s):
    """Pair up train eras and test eras that each contain five distinct ``target_10_val`` values.

    Reads ``df_syn_train_{s[0]}_{s[1]}_.csv`` and
    ``df_syn_train_test_{s[0]}_{s[1]}_.csv`` from DATAPATH. The shorter era
    list is repeated once (``* 2``) before zipping, so at most twice its
    length in pairs is produced.

    Returns:
        A ``zip`` iterator of ``(train_era, test_era)`` pairs.
    """
    def eras_with_five_targets(frame):
        return [
            era for era in frame['era'].unique()
            if frame.loc[frame['era'] == era, 'target_10_val'].nunique() == 5
        ]

    df_train = pd.read_csv(DATAPATH+f'df_syn_train_{s[0]}_{s[1]}_.csv')[COLS]
    df_test = pd.read_csv(DATAPATH+f'df_syn_train_test_{s[0]}_{s[1]}_.csv')[COLS]
    nErasTrain = eras_with_five_targets(df_train)
    nErasTest = eras_with_five_targets(df_test)

    n_train, n_test = len(nErasTrain), len(nErasTest)
    if n_train < n_test:
        return zip(nErasTrain * 2, nErasTest)
    if n_train > n_test:
        return zip(nErasTrain, nErasTest * 2)
    return zip(nErasTrain, nErasTest)

In [None]:
## train on a single era and test on a different era.
def train_function_eras_diff_eras(sid, table):
    """Append one table row per era pair: fit on one era, score on another.

    Era pairs come from ``getCombinations(sid)``. The train era is taken from
    the ``df_syn_train_*`` file and the test era from the ``df_syn_train_test_*``
    file, both restricted to ``COLS``.

    Args:
        sid: Pair whose two items are interpolated into the CSV file names.
        table: HTML accumulated so far.

    Returns:
        ``table`` extended with one complete ``<tr>...</tr>`` per era pair.
        The XgbC cells are a fixed ``0``/``0`` placeholder (that trainer is
        not run here).
    """
    train_all = pd.read_csv(f"{DATAPATH}df_syn_train_{sid[0]}_{sid[1]}_.csv")[COLS]
    test_all = pd.read_csv(f"{DATAPATH}df_syn_train_test_{sid[0]}_{sid[1]}_.csv")[COLS]

    def as_dataset(frame, task):
        # Features: all but the last four columns; label: third column from
        # the end ('target_10_val' under COLS).
        return MyDS(frame.iloc[:, 0:-4].values, frame.iloc[:, -3].values, task=task)

    def score_cells(trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score); only the scores are reported.
        cells = ""
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            cells += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"
        return cells

    for train_era, test_era in getCombinations(sid):
        era_train = train_all.loc[train_all['era'] == train_era]
        era_test = test_all.loc[test_all['era'] == test_era]

        row = f"<tr><td>df_syn_train_{sid} Era: {train_era}</td><td>df_syn_train_test_{sid} Era: {test_era}</td>"

        ds_train = as_dataset(era_train, 'regression')
        ds_test = as_dataset(era_test, 'regression')
        row += score_cells((train_xgbr, train_gbr, train_rfr), ds_train, ds_test)

        ds_train = as_dataset(era_train, 'classification')
        ds_test = as_dataset(era_test, 'classification')
        # Placeholder for the XgbC column.
        row += "<td>0</td><td>0</td>"
        row += score_cells((train_gbc, train_rfc, train_ripper, train_mlp), ds_train, ds_test)

        table += row + "</tr>"

    return table


# Report: for every noise setting, one row per (train era, test era) pair.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from a particular era and test on a different era (df_syn_train_x_x, df_syn_test_x_x)</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])
# The helper emits complete <tr> rows itself, so nothing is wrapped around the call.
for sid in tqdm(sigmaL):
    table = train_function_eras_diff_eras(sid, table)

table = f"{table}</tbody></table>\n"
print(table)

In [None]:
## Training on data with no noise and testing on data with varying levels of noise
def train_function(sid, table):
    """Append score cells for models fit on noise-free data and scored on noisy data.

    Fits on ``df_syn_train_0_0_.csv`` and scores on the ``df_syn_train_*``
    file selected by ``sid``; both frames are restricted to ``COLS``.

    Args:
        sid: Pair whose two items are interpolated into the evaluation file name.
        table: HTML accumulated so far (the caller opens and closes the row).

    Returns:
        ``table`` followed by a ``<td>train</td><td>test</td>`` pair for each
        of the three regressors and five classifiers, in header order.
    """
    fit_frame = pd.read_csv(f"{DATAPATH}df_syn_train_0_0_.csv")[COLS]
    eval_frame = pd.read_csv(f"{DATAPATH}df_syn_train_{sid[0]}_{sid[1]}_.csv")[COLS]

    def as_dataset(frame, task):
        # Features: all but the last four columns; label: third column from
        # the end ('target_10_val' under COLS).
        return MyDS(frame.iloc[:, 0:-4].values, frame.iloc[:, -3].values, task=task)

    plan = (
        ('regression', (train_xgbr, train_gbr, train_rfr)),
        ('classification', (train_xgbc, train_gbc, train_rfc, train_ripper, train_mlp)),
    )
    cells = []
    for task, trainers in plan:
        ds_train = as_dataset(fit_frame, task)
        ds_test = as_dataset(eval_frame, task)
        for fit in trainers:
            # Each trainer returns (model, train score, test score).
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            cells.append(f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>")

    return table + "".join(cells)
    
# Report: fit on the noise-free file, score on each noisy file (sigmaL[0] is the noise-free setting, so it is skipped).
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data with no noise and testing on data with varying levels of noise</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])
for sid in tqdm(sigmaL[1:]):
    # Open the row with the dataset labels, append the score cells, then close the row.
    table = train_function(
        sid, table + f"<tr><td>df_syn_train_0_0</td><td>df_syn_train_{sid}</td>"
    ) + "</tr>\n"

table = f"{table}</tbody></table>\n"
print(table)

In [None]:
## Training on data with noise and testing on data with no noise
def train_function(sid, table):
    """Append score cells for models fit on noisy data and scored on noise-free data.

    Fits on the ``df_syn_train_*`` file selected by ``sid`` and scores on
    ``df_syn_train_0_0_.csv``; both frames are restricted to ``COLS``.

    Args:
        sid: Pair whose two items are interpolated into the training file name.
        table: HTML accumulated so far (the caller opens and closes the row).

    Returns:
        ``table`` followed by a ``<td>train</td><td>test</td>`` pair for each
        of the three regressors and five classifiers, in header order.
    """
    # The noise-free file is the evaluation set here; the noisy one is fitted on.
    clean_frame = pd.read_csv(f"{DATAPATH}df_syn_train_0_0_.csv")[COLS]
    noisy_frame = pd.read_csv(f"{DATAPATH}df_syn_train_{sid[0]}_{sid[1]}_.csv")[COLS]

    def dataset_pair(task):
        # Features: all but the last four columns; label: third column from
        # the end ('target_10_val' under COLS).
        fit_set = MyDS(noisy_frame.iloc[:, 0:-4].values,
                       noisy_frame.iloc[:, -3].values, task=task)
        eval_set = MyDS(clean_frame.iloc[:, 0:-4].values,
                        clean_frame.iloc[:, -3].values, task=task)
        return fit_set, eval_set

    def append_scores(html, trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score).
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            html += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"
        return html

    ds_train, ds_test = dataset_pair('regression')
    table = append_scores(table, (train_xgbr, train_gbr, train_rfr), ds_train, ds_test)

    ds_train, ds_test = dataset_pair('classification')
    table = append_scores(
        table,
        (train_xgbc, train_gbc, train_rfc, train_ripper, train_mlp),
        ds_train,
        ds_test,
    )
    return table
    
# Report: fit on each noisy file, score on the noise-free file (sigmaL[0] is the noise-free setting, so it is skipped).
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data with noise and testing on data with no noise</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])
for sid in tqdm(sigmaL[1:]):
    # Open the row with the dataset labels, append the score cells, then close the row.
    table = train_function(
        sid, table + f"<tr><td>df_syn_train_{sid}</td><td>df_syn_train_0_0</td>"
    ) + "</tr>\n"

table = f"{table}</tbody></table>\n"
print(table)

## df_train, df_test, df_val files

In [None]:
# List the distinct eras present in df_train.csv (in order of first appearance).
ds1 = pd.read_csv(f"{DATAPATH}df_train.csv")
ds1.loc[:, 'era'].unique()

In [None]:
# List the distinct eras present in df_test.csv (in order of first appearance).
ds = pd.read_csv(f"{DATAPATH}df_test.csv")
ds.loc[:, 'era'].unique()

In [None]:
# Show the column labels of whichever frame `ds` currently holds.
ds.columns

In [None]:
# List the distinct eras present in df_val.csv (in order of first appearance).
ds = pd.read_csv(f"{DATAPATH}df_val.csv")
ds.loc[:, 'era'].unique()

In [None]:
# List the distinct eras present in df_val_test.csv (in order of first appearance).
ds = pd.read_csv(f"{DATAPATH}df_val_test.csv")
ds.loc[:, 'era'].unique()

In [None]:
## training on multiple eras and test on same set of eras
def train_function(sid, table, train, test):
    """Append score cells for models fit on the leading 80% of every era in ``train``.

    The training set is the first 80% of rows (in file order) of each era of
    the ``train`` CSV. Evaluation always uses the whole ``test`` CSV. The
    remaining 20% of each era is not used: the earlier version built that
    hold-out and then overwrote it with the test file before any model saw it.

    Args:
        sid: Unused; kept so existing calls keep working.
        table: HTML accumulated so far (the caller opens and closes the row).
        train: Path of the CSV to fit on.
        test: Path of the CSV to score on.

    Returns:
        ``table`` followed by a ``<td>train</td><td>test</td>`` pair per model
        column. The XgbC cells are a fixed ``0``/``0`` placeholder (that
        trainer is not run here).
    """
    df_train = pd.read_csv(train)
    df_test = pd.read_csv(test)

    # Collect the per-era heads first and concatenate once; growing a frame
    # with pd.concat inside the loop copies all earlier rows on every pass.
    era_heads = []
    for era in df_train['era'].unique():
        era_rows = df_train.loc[df_train['era'] == era]
        era_heads.append(era_rows.iloc[:int(0.8 * era_rows.shape[0])])
    # pd.concat rejects an empty list, so fall back to an empty slice that
    # still carries the column layout.
    trainf = pd.concat(era_heads) if era_heads else df_train.iloc[:0]
    testf = df_test

    def as_dataset(frame, task):
        # Features: all but the last three columns; label: second column from
        # the end. NOTE(review): assumes that column is the target - confirm
        # against the CSV layout.
        return MyDS(frame.iloc[:, 0:-3].values, frame.iloc[:, -2].values, task=task)

    def append_scores(html, trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score).
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            html += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"
        return html

    ds_train = as_dataset(trainf, 'regression')
    ds_test = as_dataset(testf, 'regression')
    table = append_scores(table, (train_xgbr, train_gbr, train_rfr), ds_train, ds_test)

    ds_train = as_dataset(trainf, 'classification')
    ds_test = as_dataset(testf, 'classification')
    # Placeholder for the XgbC column.
    table += "<td>0</td><td>0</td>"
    table = append_scores(
        table, (train_gbc, train_rfc, train_ripper, train_mlp), ds_train, ds_test
    )

    return table
from tqdm import tqdm
# Report: fit on df_train.csv (leading 80% of each era) and score on df_test.csv.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from multiple eras and testing on the same eras</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])

# The labels name the files actually used (df_train.csv / df_test.csv); the
# earlier "df_syn_*" labels referred to a different dataset family.
table += "<tr><td>df_train</td><td>df_test</td>"
# train_function ignores its first argument, so pass None rather than relying
# on a `sid` left over from an earlier cell's loop.
table = train_function(None, table, DATAPATH + 'df_train.csv', DATAPATH + 'df_test.csv')
table += "</tr>\n"

table += "</tbody></table>\n"
print(table)

In [None]:
##Training on data from set of eras and testing on a different set of eras
def train_function(table, train, test):
    """Append score cells for models fit on one CSV and scored on another.

    Args:
        table: HTML accumulated so far (the caller opens and closes the row).
        train: Path of the CSV to fit on (all rows are used).
        test: Path of the CSV to score on.

    Returns:
        ``table`` followed by a ``<td>train</td><td>test</td>`` pair for each
        of the three regressors and five classifiers, in header order.
    """
    fit_frame = pd.read_csv(train)
    eval_frame = pd.read_csv(test)
    # Full-length slice: every training row is kept.
    fit_frame = fit_frame.iloc[:fit_frame.shape[0]]

    def as_dataset(frame, task):
        # Features: all but the last three columns; label: second column from
        # the end. NOTE(review): assumes that column is the target - confirm
        # against the CSV layout.
        return MyDS(frame.iloc[:, 0:-3].values, frame.iloc[:, -2].values, task=task)

    plan = (
        ('regression', (train_xgbr, train_gbr, train_rfr)),
        ('classification', (train_xgbc, train_gbc, train_rfc, train_ripper, train_mlp)),
    )
    cells = []
    for task, trainers in plan:
        ds_train = as_dataset(fit_frame, task)
        ds_test = as_dataset(eval_frame, task)
        for fit in trainers:
            # Each trainer returns (model, train score, test score).
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            cells.append(f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>")

    return table + "".join(cells)
from tqdm import tqdm
print("table")
# Report: fit on one whole file and score on another whole file, i.e. on
# different sets of eras. The caption now says so; it used to repeat the
# "single era" caption of the per-era report.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from a set of eras and testing on a different set of eras</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])

table += "<tr><td>df_val</td><td>df_val_test</td>"
table = train_function(table, DATAPATH + 'df_val.csv', DATAPATH + 'df_val_test.csv')
table += "</tr>\n"
table += "<tr><td>df_train</td><td>df_val</td>"
table = train_function(table, DATAPATH + 'df_train.csv', DATAPATH + 'df_val.csv')
table += "</tr>\n"

table += "</tbody></table>\n"
print(table)

In [None]:
##Training on data from a single era and testing it on a different era
def train_function(table, train, test):
    """Append one table row per era pair: fit on a single train era, score on a single test era.

    Eras of the two files are paired positionally in order of first
    appearance; ``zip`` stops at the shorter list. Every model in a row,
    regressors and classifiers alike, sees only the rows of that row's train
    and test era. The earlier version built the classification datasets from
    the whole files, so those cells did not match the row label.

    Args:
        table: HTML accumulated so far.
        train: Path of the CSV supplying the training eras (also used in the row label).
        test: Path of the CSV supplying the test eras (also used in the row label).

    Returns:
        ``table`` extended with one complete ``<tr>...</tr>`` row per era pair.
    """
    trainf = pd.read_csv(train)
    testf = pd.read_csv(test)

    def as_dataset(frame, task):
        # Features: all but the last three columns; label: second column from
        # the end. NOTE(review): assumes that column is the target - confirm
        # against the CSV layout.
        return MyDS(frame.iloc[:, 0:-3].values, frame.iloc[:, -2].values, task=task)

    def append_scores(html, trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score).
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            html += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"
        return html

    for train_era, test_era in zip(trainf['era'].unique(), testf['era'].unique()):
        traindf = trainf.loc[trainf['era'] == train_era]
        testdf = testf.loc[testf['era'] == test_era]

        table += f"<tr><td>{train} {train_era}</td><td>{test} {test_era}</td>"

        ds_train = as_dataset(traindf, 'regression')
        ds_test = as_dataset(testdf, 'regression')
        table = append_scores(table, (train_xgbr, train_gbr, train_rfr), ds_train, ds_test)

        # Same per-era slices as above, so the classifier cells describe the
        # eras named in the row label.
        ds_train = as_dataset(traindf, 'classification')
        ds_test = as_dataset(testdf, 'classification')
        table = append_scores(
            table,
            (train_xgbc, train_gbc, train_rfc, train_ripper, train_mlp),
            ds_train,
            ds_test,
        )

        table += "</tr>\n"

    return table
from tqdm import tqdm
print("table")
# Report: one row per (train era, test era) pair for two file combinations.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from a single era and testing it on a different era</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])

# train_function emits complete <tr> rows itself.
table = train_function(table, f"{DATAPATH}df_val.csv", f"{DATAPATH}df_val_test.csv")
table = train_function(table, f"{DATAPATH}df_train.csv", f"{DATAPATH}df_val.csv")

table = f"{table}</tbody></table>\n"
print(table)

In [None]:
##Training on data from a single era and testing on the same era
def train_function(table, train, test):
    """Append one table row per era: fit on the first 80% of the era, score on the rest.

    Fixes over the earlier version: the hold-out slice was a syntax error
    (``int(0.8*n:)``), and the classification datasets were built from the
    whole file and an undefined ``testf`` rather than from the per-era split.

    Args:
        table: HTML accumulated so far.
        train: Path of the CSV whose eras are split 80/20 (also used in the row label).
        test: Only used as text in the row label; the evaluation rows come
            from the 20% hold-out of ``train``, so this file is not read.

    Returns:
        ``table`` extended with one complete ``<tr>...</tr>`` row per era.
    """
    trainf = pd.read_csv(train)

    def as_dataset(frame, task):
        # Features: all but the last three columns; label: second column from
        # the end. NOTE(review): assumes that column is the target - confirm
        # against the CSV layout.
        return MyDS(frame.iloc[:, 0:-3].values, frame.iloc[:, -2].values, task=task)

    def append_scores(html, trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score).
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            html += f"<td>{train_acc:.3f}</td><td>{test_acc:.3f}</td>"
        return html

    for era in trainf['era'].unique():
        ds = trainf.loc[trainf['era'] == era]
        # Positional split in file order: leading 80% to fit, trailing 20% to score.
        cut = int(0.8 * ds.shape[0])
        traindf, testdf = ds.iloc[:cut], ds.iloc[cut:]

        table += f"<tr><td>{train} {era}</td><td>{test} {era}</td>"

        ds_train = as_dataset(traindf, 'regression')
        ds_test = as_dataset(testdf, 'regression')
        table = append_scores(table, (train_xgbr, train_gbr, train_rfr), ds_train, ds_test)

        # Same per-era split as above, so the classifier cells match the row label.
        ds_train = as_dataset(traindf, 'classification')
        ds_test = as_dataset(testdf, 'classification')
        table = append_scores(
            table,
            (train_xgbc, train_gbc, train_rfc, train_ripper, train_mlp),
            ds_train,
            ds_test,
        )

        table += "</tr>\n"

    return table
from tqdm import tqdm
print("table")
# Report: one row per era of df_train.csv.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from a single era and testing on the same era</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])

# train_function emits complete <tr> rows itself.
table = train_function(table, f"{DATAPATH}df_train.csv", f"{DATAPATH}df_test.csv")

table = f"{table}</tbody></table>\n"
print(table)

# Numerai data

In [None]:
#numerai data IMPORT
# Training split: drop the trailing 36 columns, then keep only the last 25
# eras (in order of first appearance).
traindf = pd.read_parquet("494_v4_1_train.parquet")
traindf = traindf.loc[:, traindf.columns[:-36]]
train_eras = traindf['era'].unique()
traindf = traindf.loc[traindf['era'].isin(train_eras[-25:])]

# Validation split: eras are recorded before filtering; rows with any missing
# value are dropped first, and only then are the trailing 36 columns removed.
valdf = pd.read_parquet("494_v4_1_validation.parquet")
val_eras = valdf['era'].unique()
valdf = valdf.dropna()
valdf = valdf.loc[:, valdf.columns[:-36]]

In [None]:
# Display the 'target' column of the trimmed training frame.
traindf['target']

In [None]:
# Features: every column from the third up to (not including) the last.
trainX = traindf[traindf.columns[2:-1]]
valX = valdf[valdf.columns[2:-1]]
# Targets: the last column, scaled by 4.
trainY = 4 * traindf[traindf.columns[-1]]
valY = 4 * valdf[valdf.columns[-1]]

In [None]:
# Display the (rows, columns) shape of the validation feature frame.
valX.shape

In [None]:
def train_function(table, traindf):
    """Append score cells for models fit on the leading 80% of every era of ``traindf``.

    Each era is split positionally 80/20. The training parts of all eras are
    pooled, and so are the hold-out parts. Features (columns ``2:-1``) are
    reduced to 25 PCA components fitted on the pooled training part; targets
    (last column) are scaled by 4.

    Args:
        table: HTML accumulated so far (the caller opens the row).
        traindf: Frame with an 'era' column.

    Returns:
        ``table`` followed by a ``<td>train</td><td>test</td>`` pair for each
        of the eight models and a closing ``</tr>``.
    """
    trainf, testf = pd.DataFrame(), pd.DataFrame()
    for era in traindf['era'].unique():
        era_index = traindf[traindf['era'] == era].index.to_list()
        cut = int(0.8 * len(era_index))
        trainf = pd.concat((trainf, traindf.loc[era_index[:cut]]))
        testf = pd.concat((testf, traindf.loc[era_index[cut:]]))

    feature_cols = traindf.columns[2:-1]
    target_col = traindf.columns[-1]
    trainY = 4 * trainf[target_col]
    testY = 4 * testf[target_col]

    # The projection is fitted on the training part only and reused for the hold-out.
    pca = PCA(n_components=25)
    trainX = pca.fit_transform(trainf[feature_cols])
    testX = pca.transform(testf[feature_cols])
    print("pca")

    def append_scores(html, trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score).
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            html += f"<td>{train_acc:.2f}</td><td>{test_acc:.2f}</td>"
        return html

    ds_train = MyDS(trainX, trainY, task='regression')
    ds_test = MyDS(testX, testY, task='regression')
    table = append_scores(table, (train_xgbr, train_gbr, train_rfr), ds_train, ds_test)
    print("reg")

    ds_train = MyDS(trainX, trainY, task='classification')
    ds_test = MyDS(testX, testY, task='classification')
    table = append_scores(
        table,
        (train_xgbc, train_gbc, train_rfc, train_ripper, train_mlp),
        ds_train,
        ds_test,
    )

    return table + "</tr>\n"

from tqdm import tqdm
print("table")
# Report: a single row for the pooled Numerai training frame.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from all eras and testing on data with all different eras.</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "Ripper", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 8,
    "</tr>",
])
# The row is opened here; train_function appends the cells and the closing </tr>.
table = train_function(table + '<tr><td colspan ="2">train</td>', traindf)
table = f"{table}</tbody></table>\n"
print(table)

In [None]:
def train_function(table, trainX, trainY, testX, testY):
    """Append score cells for models fit on ``(trainX, trainY)`` and scored on ``(testX, testY)``.

    Args:
        table: HTML accumulated so far (the caller opens the row).
        trainX: Training features, passed straight to ``MyDS``.
        trainY: Training targets, passed straight to ``MyDS``.
        testX: Evaluation features, passed straight to ``MyDS``.
        testY: Evaluation targets, passed straight to ``MyDS``.

    Returns:
        ``table`` followed by a ``<td>train</td><td>test</td>`` pair for each
        of the three regressors and four classifiers (no Ripper), and a
        closing ``</tr>``.
    """
    plan = (
        ('regression', (train_xgbr, train_gbr, train_rfr)),
        ('classification', (train_xgbc, train_gbc, train_rfc, train_mlp)),
    )
    cells = []
    for task, trainers in plan:
        ds_train = MyDS(trainX, trainY, task=task)
        ds_test = MyDS(testX, testY, task=task)
        for fit in trainers:
            # Each trainer returns (model, train score, test score).
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            cells.append(f"<td>{train_acc:.2f}</td><td>{test_acc:.2f}</td>")

    cells.append("</tr>\n")
    return table + "".join(cells)
from tqdm import tqdm
print("table")
# Report: a single row, fitted on the training features and scored on the validation features.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from all eras and testing on data with all different eras.</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model; no Ripper column here.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 7,
    "</tr>",
])

# The row is opened here; train_function appends the cells and the closing </tr>.
table = train_function(
    table + "<tr><td>train</td><td>validation</td>", trainX, trainY, valX, valY
)

table = f"{table}</tbody></table>\n"
print(table)

In [None]:
def train_function(table, traindf):
    """Append one table row per era: fit on the first 80% of the era, score on the rest.

    For every era, the features (columns ``2:-1``) are reduced to 25 PCA
    components fitted on that era's training part, and the targets (last
    column) are scaled by 4.

    Args:
        table: HTML accumulated so far.
        traindf: Frame with an 'era' column.

    Returns:
        ``table`` extended with one complete ``<tr>...</tr>`` row per era,
        holding the three regressors and four classifiers (no Ripper).
    """
    feature_cols = traindf.columns[2:-1]
    target_col = traindf.columns[-1]

    def score_cells(trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score).
        cells = ""
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            cells += f"<td>{train_acc:.2f}</td><td>{test_acc:.2f}</td>"
        return cells

    for era in traindf['era'].unique():
        era_rows = traindf[traindf['era'] == era]
        cut = int(0.8 * len(era_rows))
        fit_rows, held_rows = era_rows.iloc[:cut], era_rows.iloc[cut:]

        trainY = 4 * fit_rows[target_col].values
        testY = 4 * held_rows[target_col].values

        # The projection is fitted on the era's training part and reused for its hold-out.
        pca = PCA(n_components=25)
        trainX = pca.fit_transform(fit_rows[feature_cols].values)
        testX = pca.transform(held_rows[feature_cols].values)
        print("pca")

        ds_train = MyDS(trainX, trainY, task='regression')
        ds_test = MyDS(testX, testY, task='regression')
        row = f'<tr><td colspan = "2">{era}</td>'
        row += score_cells((train_xgbr, train_gbr, train_rfr), ds_train, ds_test)
        print("reg")

        ds_train = MyDS(trainX, trainY, task='classification')
        ds_test = MyDS(testX, testY, task='classification')
        row += score_cells((train_xgbc, train_gbc, train_rfc, train_mlp), ds_train, ds_test)

        table += row + "</tr>\n"

    return table
from tqdm import tqdm
print("table")
# Report: one row per Numerai era, fitted and scored within that era.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from a single era and test on same era.</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model; no Ripper column here.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 7,
    "</tr>",
])

# train_function emits complete <tr> rows itself.
table = train_function(table, traindf)

table = f"{table}</tbody></table>\n"
print(table)
# Train on a single era, test on the same era.

In [None]:
def train_function(table, traindf, testdf):
    """Append one table row per era pair: fit on one era of ``traindf``, score on one era of ``testdf``.

    Eras of the two frames are paired positionally in order of first
    appearance; ``zip`` stops at the shorter list. For every pair, the
    features (columns ``2:-1``) are reduced to 25 PCA components fitted on
    the training era, and the targets (last column) are scaled by 4.

    Args:
        table: HTML accumulated so far.
        traindf: Frame with an 'era' column, supplying the training eras.
        testdf: Frame with an 'era' column, supplying the evaluation eras.

    Returns:
        ``table`` extended with one complete ``<tr>...</tr>`` row per era
        pair, holding the three regressors and four classifiers (no Ripper).
    """
    def score_cells(trainers, ds_train, ds_test):
        # Each trainer returns (model, train score, test score).
        cells = ""
        for fit in trainers:
            _, train_acc, test_acc = fit(ds_train, ds_test, verbose=False)
            cells += f"<td>{train_acc:.2f}</td><td>{test_acc:.2f}</td>"
        return cells

    era_pairs = zip(traindf['era'].unique(), testdf['era'].unique())
    for train_era, test_era in era_pairs:
        fit_rows = traindf[traindf['era'] == train_era]
        eval_rows = testdf[testdf['era'] == test_era]

        trainY = 4 * fit_rows[traindf.columns[-1]].values
        testY = 4 * eval_rows[testdf.columns[-1]].values

        # The projection is fitted on the training era and reused for the evaluation era.
        pca = PCA(n_components=25)
        trainX = pca.fit_transform(fit_rows[traindf.columns[2:-1]].values)
        testX = pca.transform(eval_rows[testdf.columns[2:-1]].values)
        print("pca")

        ds_train = MyDS(trainX, trainY, task='regression')
        ds_test = MyDS(testX, testY, task='regression')
        row = f"<tr><td>{train_era}</td><td>{test_era}</td>"
        row += score_cells((train_xgbr, train_gbr, train_rfr), ds_train, ds_test)
        print("reg")

        ds_train = MyDS(trainX, trainY, task='classification')
        ds_test = MyDS(testX, testY, task='classification')
        row += score_cells((train_xgbc, train_gbc, train_rfc, train_mlp), ds_train, ds_test)

        table += row + "</tr>\n"

    return table
from tqdm import tqdm
print("table")
# Report: one row per (training era, validation era) pair.
table = "".join([
    "<table>\n",
    "<thead><caption>Training on data from a single era and test on different era.</caption>\n",
    '<tr><th colspan="1">Train Dataset</th><th colspan="1">Test Dataset</th>',
    # One two-column header (train score, test score) per model; no Ripper column here.
    *(f'<th colspan="2">{model}</th>'
      for model in ("XgbR", "GBR", "RFR", "XgbC", "GBC", "RFC", "MLP C")),
    "</tr>\n",
    "</thead>\n<tbody>\n",
    "<tr><td></td><td></td>",
    "<td>Train</td><td>Test</td>" * 7,
    "</tr>",
])

# train_function emits complete <tr> rows itself.
table = train_function(table, traindf, valdf)

table = f"{table}</tbody></table>\n"
print(table)
# Train on a single era, test on a different era.