<a href="https://colab.research.google.com/github/djliden/numerai/blob/main/notebooks/fastai_tabular.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tabular Learner with Fastai

The goal of this notebook is to implement a tabular deep learning model using fastai and submit to this week's competition. I eventually plan to build this out into a more structured and modular PyTorch project, but for now I want to make sure that this works and will generate reasonable results.

There's a lot to explore -- such as the choice of loss function and [evaluation metrics](https://forum.numer.ai/t/model-evaluation-metrics/337) -- but we're keeping it simple for now.

## Dependencies and Data Download

In [1]:
# install
!pip install --upgrade python-dotenv fastai numerapi

# import dependencies
import os
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import numpy as np
import numerapi
from fastai.tabular.all import *

# Secrets setup: load a .env file (when one is found) and read the API keys
# from the environment
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
public_key = os.getenv("NUMERAI_PUBLIC_KEY")
private_key = os.getenv("NUMERAI_PRIVATE_KEY")

# API client built from those keys, then download and unzip the current dataset
napi = numerapi.NumerAPI(
    public_id=public_key,
    secret_key=private_key,
    verbosity="info",
)
napi.download_current_dataset(dest_path="../input/", unzip=True)

Collecting numerapi
  Downloading numerapi-2.4.5-py3-none-any.whl (25 kB)
Collecting torchvision<0.9,>=0.8
  Using cached torchvision-0.8.2-cp39-cp39-manylinux1_x86_64.whl (12.7 MB)
Collecting torch<1.8,>=1.7.0
  Using cached torch-1.7.1-cp39-cp39-manylinux1_x86_64.whl (776.8 MB)
^C
[31mERROR: Operation cancelled by user[0m


KeyboardInterrupt: 

## Data Setup

In [None]:
filename = "../input/numerai_dataset_252/numerai_tournament_data.csv"
chunksize = 100000
# The tournament file is read chunk by chunk and only the rows labelled
# 'validation' are kept. The context manager closes the underlying file
# handle as soon as the last chunk has been consumed.
with pd.read_csv(filename, chunksize=chunksize) as iter_csv:
    val_df = pd.concat([chunk[chunk['data_type'] == 'validation'] for chunk in iter_csv])
val_df.head()

In [None]:
# The training CSV is read in a single call (no chunking)
training_data = pd.read_csv(
    filepath_or_buffer="../input/numerai_dataset_252/numerai_training_data.csv"
)
training_data.head(n=5)

In [None]:
# Stack the validation rows under the training rows and renumber the index 0..n-1.
# Rows already labelled 'validation' are removed first, so re-running this cell
# does not append a second copy of val_df to training_data.
# NOTE(review): assumes the training CSV itself holds no 'validation' rows — confirm.
train_rows = training_data[training_data['data_type'] != 'validation']
training_data = pd.concat([train_rows, val_df], ignore_index=True)

In [None]:
# Feature columns are the ones whose name starts with 'feature'; the label is 'target'
is_feature = training_data.columns.str.startswith('feature')
feature_cols = training_data.columns[is_feature]
target_cols = ['target']

In [None]:
# Index labels of the rows on each side of the split, chosen by data_type
train_idx = training_data.index[training_data.data_type == 'train'].tolist()
test_idx = training_data.index[training_data.data_type == 'validation'].tolist()
# splits holds its own copies of the two index lists
splits = (train_idx[:], test_idx[:])

In [None]:
categorical = ['era']  # defined but not passed on: cat_names is None below
data = TabularPandas(
    training_data,
    cat_names=None,
    cont_names=feature_cols.tolist(),
    y_names=target_cols,
    splits=splits,
)
(len(data.train), len(data.valid))

In [None]:
# Build the dataloaders from the TabularPandas object, using its default arguments
dls = data.dataloaders()

In [None]:
# Tabular model with layers=[200, 100], trained on a flattened MSE loss and
# reporting the Pearson correlation as its metric
learn = tabular_learner(
    dls,
    layers=[200, 100],
    loss_func=MSELossFlat(),
    metrics=[PearsonCorrCoef()],
)
# learn.lr_find()

In [None]:
# One epoch of one-cycle training with wd=2
learn.fit_one_cycle(n_epoch=1, wd=2)

In [None]:
# Plot the recorded losses with skip_start=500 — presumably this leaves the
# first 500 recorded points out of the plot; TODO confirm against the fastai docs
learn.recorder.plot_loss(skip_start = 500)

In [None]:
from scipy.stats import spearmanr
  
def sharpe(df: pd.DataFrame) -> np.float32:
    """
    Sharpe ratio of the per-era Spearman correlations.

    :param df: A Pandas DataFrame containing the columns "era", "target" and "prediction"
    :return: Mean of the per-era correlations divided by their standard deviation.
    """
    # One Spearman correlation (target vs. prediction) for every era
    era_corrs = df.groupby("era").apply(
        lambda era_df: spearmanr(era_df["target"], era_df["prediction"])[0]
    )
    mean_corr = era_corrs.mean()
    spread = era_corrs.std()
    return mean_corr / spread

# Would like to make this a metric that can be tracked with
# fastai but am not sure how.
# NOTE(review): get_preds() without arguments presumably scores the validation
# set — confirm; the 'era' column below is taken from dls.valid_ds to match.
prediction, target = learn.get_preds()
prediction = prediction.numpy().squeeze()
target = target.numpy().squeeze()
# A notebook cell only renders its last expression, so intermediate results
# are shown explicitly with display().
display(prediction, target)

era = dls.valid_ds.items['era']
eval_df = pd.DataFrame({'prediction':prediction, 'target':target, 'era':era}).reset_index()
# Spearman correlation over all rows at once (not grouped by era)
display(spearmanr(eval_df.target, eval_df.prediction))

# Per-era Sharpe ratio; rendered as the cell output
sharpe(eval_df)

In [None]:
class FastSubmission:
    '''Generate, save and upload numerai submissions from a fastai learner.

    :param dls: dataloaders object; its ``test_dl`` builds the prediction dataloader.
    :param learner: learner whose ``get_preds`` produces the predictions.
    :param filename: path of the CSV file to predict on.
    :param chunk: if True the CSV is read and scored ``chunksize`` rows at a
        time, otherwise it is loaded in a single read.
    :param chunksize: number of rows per chunk when ``chunk`` is True.
    :param debug: stored on the instance; no method of this class reads it.
    :param numerapi: API client; ``submit`` calls its ``upload_predictions``.
    :param outpath: directory that receives ``predictions.csv``.
    '''

    def __init__(self, dls, learner, filename, #test_features,
                 chunk:bool = True, chunksize:int = 60000,
                 debug:bool = False, numerapi=napi,
                 outpath = Path("../output/")):
        self.dls = dls
        self.learn = learner
        # Honour the argument (this used to be hard-coded to True, which made
        # the single-read branch of get_predictions unreachable).
        self.chunk = chunk
        self.chunksize = chunksize
        self.filename = filename
        self.debug = debug
        self.napi = numerapi
        # Accept plain strings as well as Path objects
        self.outpath = Path(outpath)
        # Filled in by get_predictions
        self.predictions = None

    def get_preds_and_ids(self, data_subset):
        '''Score one DataFrame of rows.

        :param data_subset: DataFrame with an "id" column plus the columns the
            dataloaders expect. A "target" column, if present, is excluded
            from the frame handed to the model.
        :return: tuple ``(preds_out, ids_out)``: a flat list of predictions and
            the "id" column of ``data_subset``.
        '''
        # Drop the label on a copy: the caller's frame is left untouched and a
        # file without a 'target' column no longer raises KeyError.
        features = data_subset.drop(columns='target', errors='ignore')
        test_dl = self.dls.test_dl(features)
        preds_out, _ = self.learn.get_preds(dl=test_dl, inner=True)
        # Flatten the nested per-row lists into one flat list
        # (assumes every row of the output is itself a sequence — TODO confirm)
        preds_out = [item for sublist in preds_out.tolist() for item in sublist]

        ids_out = data_subset["id"]
        return (preds_out, ids_out)

    def _iter_frames(self):
        '''Yield the input CSV as one or more DataFrames, depending on ``self.chunk``.'''
        if self.chunk:
            # The context manager closes the file once all chunks are consumed
            with pd.read_csv(self.filename, chunksize=self.chunksize) as reader:
                yield from reader
        else:
            yield pd.read_csv(self.filename)

    def get_predictions(self):
        '''Predict on ``self.filename`` and assemble the submission frame.

        :return: DataFrame with the columns "id" and "prediction_kazutsugi";
            the same frame is stored as ``self.predictions``.
        '''
        preds_out = []
        ids_out = []
        # Both modes go through the same accumulation. The single-read branch
        # used to extend() the result list with the (preds, ids) tuple itself,
        # which mixed predictions and ids together.
        for frame in self._iter_frames():
            frame_preds, frame_ids = self.get_preds_and_ids(frame)
            preds_out.extend(frame_preds)
            ids_out.extend(frame_ids)

        predictions_df = pd.DataFrame({
            'id': ids_out,
            'prediction_kazutsugi': preds_out
        })

        self.predictions = predictions_df
        return predictions_df

    def save_predictions(self):
        '''Write ``self.predictions`` to ``predictions.csv`` inside ``self.outpath``.

        Prints a reminder and does nothing else when no predictions exist yet.
        '''
        if getattr(self, "predictions", None) is None:
            print("No predictions to save.\nRemember to run get_predictions first")
            return
        print("Saving Predictions...\n")
        # `~path.exists()` is bitwise NOT on a bool (-1 or -2, both truthy), so
        # the old check always ran mkdir() and raised FileExistsError whenever
        # the directory was already there. exist_ok makes the call safe.
        self.outpath.mkdir(parents=True, exist_ok=True)
        self.predictions.to_csv(self.outpath / "predictions.csv", index=False)

    def submit(self):
        '''Save the current predictions and upload them.

        Prints a reminder and returns None when no predictions exist yet.

        :return: whatever ``upload_predictions`` returns.
        '''
        if getattr(self, "predictions", None) is None:
            print("No predictions to submit.\nRemember to run get_predictions first!")
            return None
        # Always write the current predictions before uploading. The previous
        # `~exists()` test was always truthy, so this matches what actually
        # happened, and it avoids uploading a stale file from an earlier run.
        self.save_predictions()
        print("Submitting Predictions...\n")
        return self.napi.upload_predictions(self.outpath / "predictions.csv",
                                            model_id=os.environ.get("NUMERAI_MODEL_ID"))

In [None]:
# Submission helper for the trained learner, set up to go through the
# tournament CSV in chunks of 100000 rows
subtest = FastSubmission(
    dls=dls,
    learner=learn,
    filename="../input/numerai_dataset_252/numerai_tournament_data.csv",
    chunk=True,
    chunksize=100000,
)

In [None]:
# Score the tournament file; the returned frame (columns "id" and
# "prediction_kazutsugi") is also kept on subtest.predictions
subtest.get_predictions()

In [None]:
# Upload the predictions through the numerapi client held by subtest;
# the model id is read from the NUMERAI_MODEL_ID environment variable
subtest.submit()

In [None]:
# ids_out / preds_out only exist as locals inside FastSubmission.get_predictions,
# so referring to them here raised NameError on a fresh kernel. The frame that
# method assembles (columns "id" and "prediction_kazutsugi") is kept on the
# submission object, so it is read from there instead.
predictions_df = subtest.predictions
predictions_df.head()

In [None]:
# NOTE(review): `testsub` is not defined anywhere in the visible notebook, so this
# cell presumably fails on a fresh kernel — confirm, then define it or drop the cell.
# NOTE(review): assigning to `data` replaces the TabularPandas object created
# earlier under the same name.
data = pd.DataFrame(testsub, columns =['prediction_kazutsugi', 'ids'])
data.head()
#len(testsub[0][1])

In [None]:
# Manual chunk-by-chunk prediction over the tournament file
ids, preds = [], []

filename = "../input/numerai_dataset_252/numerai_tournament_data.csv"
chunksize = 100000
with pd.read_csv(filename, chunksize=chunksize) as reader:
    for chunk in reader:
        # target is blanked; only the feature columns are handed to the model
        chunk.target = None
        test_dl = dls.test_dl(chunk[feature_cols])
        out, _ = learn.get_preds(dl=test_dl, inner=True)
        out = out.tolist()
        # ids and the (still nested) per-row predictions are collected in step
        ids += list(chunk["id"])
        preds += out

In [None]:
# Number of entries collected in preds by the loop above
len(preds)

In [None]:
# Flatten the per-row lists into one flat list of predictions. Entries that are
# no longer lists pass through unchanged, so running this cell a second time is
# harmless (the plain nested comprehension raised TypeError on a re-run).
preds = [
    value
    for entry in preds
    for value in (entry if isinstance(entry, list) else [entry])
]

In [None]:
# Sanity check: first ten predictions and ids, plus both lengths
# (the lengths must match for the DataFrame built from them)
preds[0:10], ids[0:10], len(preds), len(ids)

In [None]:
# predictions must have an `id` column and a `prediction_kazutsugi` column
predictions_df = pd.DataFrame(dict(id=ids, prediction_kazutsugi=preds))
predictions_df.head()

In [None]:
# to_csv does not create missing directories, so make sure ../output exists first
os.makedirs("../output", exist_ok=True)
predictions_df.to_csv("../output/predictions.csv", index=False)
# Upload predictions
submission_id = napi.upload_predictions("../output/predictions.csv",
                                        model_id=os.environ.get("NUMERAI_MODEL_ID"))