In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
from pyarrow.parquet import ParquetFile
import pyarrow as pa
from tqdm.auto import tqdm

In [None]:
# Number of rows requested per training batch.
n = 100000

# Training split: file location, parquet handle and batch iterator.
tr_data_path = os.path.join('data', 'numerai_training_data.parquet')
tr_pf = ParquetFile(tr_data_path)
tr_itr = tr_pf.iter_batches(batch_size=n)

# Validation split: same setup, read in batches of 500 rows.
vl_data_path = os.path.join('data', 'numerai_validation_data.parquet')
vl_pf = ParquetFile(vl_data_path)
vl_itr = vl_pf.iter_batches(batch_size=500)

# Initial era bounds.
MIN_ERA = MAX_ERA = 1

In [None]:
def _batch_to_frame(batch):
    """Convert one pyarrow record batch into a pandas DataFrame.

    The 'era' column is cast to uint16 and the 'data_type' column is dropped.
    """
    frame = pa.Table.from_batches([batch]).to_pandas()
    return frame.astype({'era': 'uint16'}).drop('data_type', axis=1)


# Pull the next batch from each iterator; `rows` ends up holding the
# validation batch, as it is reused for both reads.
rows = next(tr_itr)
df = _batch_to_frame(rows)

rows = next(vl_itr)
val_df = _batch_to_frame(rows)


In [None]:
# Slide the era window: the lower bound takes over the previous upper bound and
# the upper bound becomes the largest era present in the loaded `df`.
# Note: this cell is not idempotent - re-running it without loading new data
# sets MIN_ERA to the current maximum era.
MIN_ERA, MAX_ERA = MAX_ERA, df['era'].max()
(MIN_ERA, MAX_ERA)

In [None]:
# Count the rows of every era in a single pass instead of re-filtering the
# whole frame once per era. `reindex` keeps eras in the range that have no rows
# as zero counts. The upper bound MAX_ERA is excluded, exactly as before.
# The counts get their own names so the batch-size variable `n` is no longer
# overwritten by this cell.
era_range = range(MIN_ERA, MAX_ERA)
era_counts = df['era'].astype(int).value_counts()
era_sizes = era_counts.reindex(era_range, fill_value=0).tolist()

fig, ax = plt.subplots()
ax.set_xlabel('Era')
ax.set_ylabel('Number of entries')
ax.plot(era_range, era_sizes)

In [None]:
# cr = df.corr()

In [None]:
# top5={}
# for c in df.columns:
#     if 'target' not in c:
#         v = cr.loc[c,'target']
#         if len(top5) < 5:
#             top5[c] = v
#         else:
#             for k in top5:
#                 if v > top5[k]:
#                     top5.pop(k)
#                     top5[c] = v
#                     break
# print(top5)

In [None]:
# df[list(top5.keys()) + ['target']]

In [None]:
# Split the column labels by name: anything containing 'feature' is a model
# input, anything containing 'target' is a label column. 'era' matches neither
# pattern and is therefore in neither list.
feature_cols = [name for name in df.columns if 'feature' in name]
target_cols = [name for name in df.columns if 'target' in name]

In [None]:
# Number of columns whose name contains 'feature'.
len(feature_cols)

In [None]:
# Show the feature and target column counts next to the total column count,
# followed by whether the two counts add up to that total.
f"{len(feature_cols)} + {len(target_cols)} = {len(df.columns)} {len(feature_cols) + len(target_cols) == len(df.columns)}"

In [None]:
import torch
import torch.nn as nn

In [None]:
# Display the training DataFrame.
df

In [None]:
# Column labels of the validation DataFrame.
val_df.columns

In [None]:
# Model inputs are the feature columns; the label is the 'target' column.
training_df, training_target = df[feature_cols], df['target']
valid_df, valid_target = val_df[feature_cols], val_df['target']

In [None]:
# Display the feature-only training frame.
training_df

In [None]:
# Build the tensors explicitly as float32. Without a dtype the tensors inherit
# whatever dtype pandas holds (e.g. an integer or float64 column), which does
# not match the float32 parameters of `nn.Linear` and the float32 predictions
# fed to the loss. The cast is a no-op when the data is already float32.
X = torch.tensor(training_df.values, dtype=torch.float32)
Y = torch.tensor(training_target.values, dtype=torch.float32)

X_val = torch.tensor(valid_df.values, dtype=torch.float32)
Y_val = torch.tensor(valid_target.values, dtype=torch.float32)

In [None]:
# Shape of the training feature tensor.
X.shape

In [None]:
# Shape of the training target tensor.
Y.shape

In [None]:
# model.eval()

# for i in tqdm(range(X.shape[0])):
#     x, y = X[i].cuda(), Y[i].cuda()
#     pred = model(x)
#     loss = criterion(pred.squeeze(), y)
#     running_loss += loss.item()
# losses.append(running_loss/X.shape[0])

In [None]:
class Net(nn.Module):
    """Small fully connected network producing one output value per input.

    Three linear layers with a ReLU after each of the first two, mapping a
    vector of ``in_features`` values to a single value.

    Args:
        in_features: Size of each input vector. Defaults to 1050, the width
            that was previously hard-coded.
        hidden_sizes: Widths of the two hidden layers. Defaults to
            ``(500, 125)``, the previously hard-coded values.
    """

    def __init__(self, in_features=1050, hidden_sizes=(500, 125)):
        super().__init__()
        first_hidden, second_hidden = hidden_sizes
        # The layer order is unchanged, so state-dict keys remain
        # ``lin.0.*``, ``lin.2.*`` and ``lin.4.*``.
        self.lin = nn.Sequential(
            nn.Linear(in_features, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, 1),
        )

    def forward(self, x):
        """Apply the layer stack to ``x``; its last dimension must be ``in_features``."""
        return self.lin(x)

In [None]:
# Train on the GPU when one is available and fall back to the CPU otherwise,
# so the cell no longer crashes on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Net()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)#, weight_decay=0.1)
criterion = nn.MSELoss()
# NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases -
# confirm against the installed version.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True)


# Mean training loss of each epoch.
losses = []

# Single epoch; every row is its own optimisation step (batch size 1).
for _ in range(1):
    running_loss = 0
    for i in tqdm(range(X.shape[0])):
        optimizer.zero_grad()
        x, y = X[i].to(device), Y[i].to(device)
        pred = model(x)
        # squeeze() removes the size-1 output dimension so the prediction has
        # the same shape as the scalar target.
        loss = criterion(pred.squeeze(), y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    losses.append(running_loss/X.shape[0])
    print(losses[-1])
    # The plateau scheduler is driven by the epoch's mean training loss.
    scheduler.step(losses[-1])


In [None]:
# torch.set_printoptions(threshold=10000)
# with open('params.m', 'w') as f:
#     for name, param in model.named_parameters():
#         print(name, param)

In [None]:
model.eval()

# Evaluate on whichever device the model's parameters live on, rather than
# assuming CUDA.
device = next(model.parameters()).device

# Per-row validation loss; note this replaces the training `losses` list.
losses = []
with torch.no_grad():
    for i in tqdm(range(X_val.shape[0])):
        x, y = X_val[i].to(device), Y_val[i].to(device)
        pred = model(x)
        loss = criterion(pred.squeeze(), y)
        losses.append(loss.item())

In [None]:
# Mean of the values in `losses` (the per-row validation losses when the cells
# are run in order).
np.mean(losses)

In [None]:
# Loss curve: one point per entry in `losses`, plotted in list order.
sample_positions = range(len(losses))
plt.plot(sample_positions, losses)
plt.show()

In [None]:
# Histogram of the values in `losses`.
sns.histplot(losses)

In [None]:
# Mean of the values in `losses`.
np.mean(losses)

In [None]:
# For every training target value, take the denominator of its exact
# integer-ratio representation.
targets = Y.cpu().detach().numpy()
K = [value.as_integer_ratio()[1] for value in targets]

In [None]:
# Histogram of the denominators collected in K.
sns.histplot(K)