In [None]:
#hide
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
#default_exp download

<IPython.core.display.Javascript object>

# Download

> Downloader classes for fetching training and inference data (currently Numerai Classic via NumerAPI).

In [None]:
#hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import os
import json
import shutil
from numerapi import NumerAPI, SignalsAPI
from pathlib import Path, PosixPath
from abc import ABC
from rich.tree import Tree
from rich.console import Console
from rich import print as rich_print

## 1. Base

`BaseDownloader` is a simple object which implements logic common to all downloaders.

To implement a new Downloader, you should inherit from `BaseDownloader` and be sure to implement at least `download_training_data` and `download_inference_data`.

In [None]:
#export
class BaseDownloader(ABC):
    """
    Abstract base class for downloaders.

    Holds the base directory and the file-system helpers shared by all
    downloaders. Subclasses are expected to override `download_training_data`
    and `download_inference_data`; the versions defined here only raise
    NotImplementedError.

    :param directory_path: Base directory where data will be saved.
    The directory (including missing parents) is created if it does not exist.
    """
    def __init__(self, directory_path: str):
        self.dir = Path(directory_path)
        if not self.dir.is_dir():
            rich_print(f"No existing directory found at '[blue]{self.dir}[/blue]'. Creating directory...")
            self.dir.mkdir(parents=True, exist_ok=True)

    def download_training_data(self, *args, **kwargs):
        """ Download all necessary files needed for training. """
        raise NotImplementedError(f"No method for downloading training data is implemented in '{self.__class__.__name__}'")

    def download_inference_data(self, *args, **kwargs):
        """ Download minimal amount of files needed for weekly inference. """
        raise NotImplementedError(f"No method for downloading inference data is implemented in '{self.__class__.__name__}'.")

    def remove_base_directory(self):
        """ Remove download directory with all contents. """
        abs_path = self.dir.resolve()
        # The closing quote belongs inside the [red] markup so the whole quoted name is styled.
        rich_print(f":warning: [red]Deleting directory for '{self.__class__.__name__}'[/red] :warning:\nPath: '{abs_path}'")
        shutil.rmtree(abs_path)

    def _append_folder(self, folder: str) -> Path:
        """
        Return base directory appended with 'folder'.
        Create directory if it does not exist.

        :param folder: Path relative to the base directory. An empty string yields the base directory itself.
        :return: Path of the (now existing) directory.
        """
        # Named `target_dir` rather than `dir` to avoid shadowing the builtin.
        target_dir = self.dir / folder
        target_dir.mkdir(parents=True, exist_ok=True)
        return target_dir

    @staticmethod
    def _load_json(file_path: str, verbose = False, *args, **kwargs) -> dict:
        """
        Load JSON from file and return as dictionary.

        :param file_path: Path of the JSON file to read.
        :param verbose: Print the loaded data when True.
        *args, **kwargs are passed to `json.load`.
        :return: The parsed JSON content.
        """
        # Read as UTF-8 explicitly instead of relying on the platform default encoding.
        with open(file_path, encoding="utf-8") as json_file:
            json_data = json.load(json_file, *args, **kwargs)
        if verbose:
            rich_print(json_data)
        return json_data

    @property
    def get_all_files(self) -> list:
        """ Return all contents in directory. """
        return list(self.dir.iterdir())

    @property
    def is_empty(self) -> bool:
        """ Check if directory is empty."""
        return not self.get_all_files

    def __call__(self, *args, **kwargs):
        """
        The most common use case will be to get weekly inference data. So calling the class itself returns inference data.

        All *args, **kwargs are forwarded to `download_inference_data` and its
        result (if any) is returned to the caller.
        """
        return self.download_inference_data(*args, **kwargs)

<IPython.core.display.Javascript object>

In [None]:
test_dir = "test_base_1234321234321/"

# Test building class
base_down = BaseDownloader(directory_path=test_dir)
try:
    # Check against `Path` rather than `PosixPath` so the assertion also holds
    # on Windows, where `Path(...)` instantiates a `WindowsPath`.
    assert isinstance(base_down.dir, Path)
    assert base_down.dir.is_dir()

    # Test properties
    (base_down.dir / "test.txt").write_text("test")
    rich_print(f"Directory contents:\n{base_down.get_all_files}")
    assert not base_down.is_empty
finally:
    # Remove contents, even when one of the assertions above fails
    base_down.remove_base_directory()
assert not os.path.exists(test_dir)

<IPython.core.display.Javascript object>

## 2. Numerai Classic

In [None]:
#export
class NumeraiClassicDownloader(BaseDownloader):
    """
    Downloading from NumerAPI for Numerai Classic data.

    :param directory_path: Main folder to download data in.
    All *args, **kwargs will be passed to NumerAPI initialization.
    """
    def __init__(self, directory_path: str, *args, **kwargs):
        super().__init__(directory_path=directory_path)
        self.napi = NumerAPI(*args, **kwargs)
        # Fetched once at construction; used in the log message of `download_inference_data`.
        self.current_round = self.napi.get_current_round()

    def download_training_data(self, folder: str = "", version: int = 2, int8: bool = False):
        """
        Get Numerai classic training and validation data.
        :param folder: Specify folder to create folder within directory root. Saves in directory root by default.
        :param version: Numerai version (1=classic, 2=super massive dataset (parquet))
        :param int8: Integer version of data
        :raises NotImplementedError: If `version` is not supported.
        """
        if int8:
            version_mapping = {1: ['numerai_training_data_int8.csv', 'numerai_validation_data_int8.csv'],
                               2: ['numerai_training_data_int8.parquet', 'numerai_validation_data_int8.parquet']
                               }
        else:
            version_mapping = {1: ['numerai_training_data.csv', 'numerai_validation_data.csv'],
                               2: ['numerai_training_data.parquet', 'numerai_validation_data.parquet']
                               }
        # Validate the version before touching the file system, so an
        # unsupported version does not leave an empty folder behind.
        train_val_files = self._get_version_files(version_mapping, version)
        dest_dir = self._append_folder(folder)
        for file in train_val_files:
            self.download_single_dataset(filename=file,
                                         dest_path=str(dest_dir.joinpath(file)))

    def download_inference_data(self, folder: str = "", version: int = 2, int8: bool = False, round_num: int = None):
        """
        Get Numerai classic inference data.
        :param folder: Specify folder to create folder within directory root. Saves in directory root by default.
        :param version: Numerai version (1=classic, 2=super massive dataset (parquet))
        :param int8: Integer version of data
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        :raises NotImplementedError: If `version` is not supported.
        """
        if int8:
            version_mapping = {1: ['numerai_tournament_data_int8.csv'],
                               2: ['numerai_tournament_data_int8.parquet']
                               }
        else:
            version_mapping = {1: ['numerai_tournament_data.csv'],
                               2: ['numerai_tournament_data.parquet']
                               }
        # Validate the version before creating the target folder.
        inference_files = self._get_version_files(version_mapping, version)
        dest_dir = self._append_folder(folder)
        rich_print(f":file_folder: [green]Downloading inference data for round[/green] '{round_num if round_num else self.current_round}'.")
        for file in inference_files:
            self.download_single_dataset(filename=file,
                                         dest_path=str(dest_dir.joinpath(file)),
                                         round_num=round_num)

    def download_single_dataset(self, filename: str, dest_path: str, round_num: int = None):
        """
        Download one of the available datasets through NumerAPI.

        :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets())
        :param dest_path: Full path where file will be saved.
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        :raises AssertionError: If `filename` is not listed by NumerAPI for the requested round.
        """
        # Check availability for the round that is actually requested, and
        # query the listing only once so the error message matches the check.
        available_datasets = self.napi.list_datasets(round_num=round_num)
        if filename not in available_datasets:
            # Raised explicitly (instead of an `assert` statement) so the check
            # is not stripped when Python runs with -O; exception type is unchanged.
            raise AssertionError(f"Dataset '{filename}' not available in NumerAPI. Available datasets are {available_datasets}.")
        rich_print(f":file_folder: [green]Downloading[/green] '{filename}' :file_folder:")
        self.napi.download_dataset(filename=filename,
                                   dest_path=dest_path,
                                   round_num=round_num)

    def download_example_data(self, folder: str = "", version: int = 2, round_num: int = None):
        """
        Download all example prediction data in specified folder for given version.

        :param folder: Specify folder to create folder within directory root. Saves in directory root by default.
        :param version: Numerai version (1=classic, 2=super massive dataset (parquet))
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        :raises NotImplementedError: If `version` is not supported.
        """
        version_mapping = {1: ['example_predictions.csv', 'example_validation_predictions.csv'],
                           2: ['example_predictions.parquet', 'example_validation_predictions.parquet']
                           }
        # Validate the version before creating the target folder.
        example_files = self._get_version_files(version_mapping, version)
        dest_dir = self._append_folder(folder)
        for file in example_files:
            self.download_single_dataset(filename=file,
                                         dest_path=str(dest_dir.joinpath(file)),
                                         round_num=round_num)

    def get_classic_features(self, folder: str = "", *args, **kwargs) -> dict:
        """
        Download feature overview (stats and feature sets) through NumerAPI and load.
        :param folder: Specify folder to create folder within directory root. Saves in directory root by default.
        *args, **kwargs will be passed to the JSON loader
        :return: Content of the downloaded 'features.json'.
        """
        dest_dir = self._append_folder(folder)
        filename = "features.json"
        dest_path = str(dest_dir.joinpath(filename))
        self.download_single_dataset(filename=filename,
                                     dest_path=dest_path)
        # Load in json
        return self._load_json(dest_path, *args, **kwargs)

    @staticmethod
    def _get_version_files(version_mapping: dict, version: int) -> list:
        """
        Check if version is supported and return files corresponding to version mapping.

        :param version_mapping: Maps each supported version to its list of file names.
        :param version: Requested version.
        :raises NotImplementedError: If `version` is not a key of `version_mapping`.
        """
        # Membership test instead of catching KeyError keeps the traceback free
        # of an unrelated chained exception.
        if version not in version_mapping:
            raise NotImplementedError(f"Version '{version}' is not implemented. Available versions are {list(version_mapping.keys())}")
        return version_mapping[version]

<IPython.core.display.Javascript object>

## Example usage

In [None]:
#hide_input
# Render the documentation of both download entry points, training first.
for documented_method in (
    NumeraiClassicDownloader.download_training_data,
    NumeraiClassicDownloader.download_inference_data,
):
    show_doc(documented_method)

<h4 id="NumeraiClassicDownloader.download_training_data" class="doc_header"><code>NumeraiClassicDownloader.download_training_data</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>NumeraiClassicDownloader.download_training_data</code>(**`folder`**:`str`=*`''`*, **`version`**:`int`=*`2`*, **`int8`**:`bool`=*`False`*, **\*`args`**, **\*\*`kwargs`**)

Get Numerai classic training and validation data.
:param folder: Specify folder to create folder within directory root. Saves in directory root by default.
:param version: Numerai version (1=classic, 2=super massive dataset (parquet)
:param int8: Integer version of data
*args, **kwargs are passed to NumerAPI downloader

<h4 id="NumeraiClassicDownloader.download_inference_data" class="doc_header"><code>NumeraiClassicDownloader.download_inference_data</code><a href="__main__.py#L38" class="source_link" style="float:right">[source]</a></h4>

> <code>NumeraiClassicDownloader.download_inference_data</code>(**`folder`**:`str`=*`''`*, **`version`**:`int`=*`2`*, **`int8`**:`bool`=*`False`*, **`round_num`**:`int`=*`None`*)

Get Numerai classic inference data.
:param folder: Specify folder to create folder within directory root. Saves in directory root by default.
:param version: Numerai version (1=classic, 2=super massive dataset (parquet)
:param int8: Integer version of data
:param round_num: Numerai tournament round number. Downloads latest round by default.

<IPython.core.display.Javascript object>

### Training

In [None]:
#slow
# initialization
# Only the directory name is assigned in this cell; it is reused further down
# to render the example directory tree.
# NOTE(review): the downloader calls below are commented out, presumably to
# avoid large downloads when the notebook is executed - uncomment to run them.
train_base_directory = "test_numerai_classic_train_1234321"
# numer_classic_downloader = NumeraiClassicDownloader(train_base_directory)

# Download training and validation data
# numer_classic_downloader.download_training_data("train_val", version=2, int8=False)

# Save example predictions in new folder and get feature overview
# numer_classic_downloader.download_example_data("example")
# feature_stats = numer_classic_downloader.get_classic_features()

<IPython.core.display.Javascript object>

In [None]:
#hide
# Remove contents (for tests)
# numer_classic_downloader.remove_base_directory()

__For the training example the directory structure will be:__

In [None]:
#hide_input
# Draw the folder layout that the training example produces.
console = Console(record=True, width=100)

tree = Tree(
    f":file_folder: {train_base_directory} (base_directory)",
    guide_style="bold bright_black",
)

# Entries directly under the base directory, in display order.
folder_tree = tree.add(":page_facing_up: features.json")
train_val_tree = tree.add(":file_folder: train_val")
example_tree = tree.add(":file_folder: example")

# Files inside the 'train_val' folder.
for leaf_label in (
    ':page_facing_up: numerai_training_data.parquet',
    ':page_facing_up: numerai_validation_data.parquet',
):
    train_val_tree.add(leaf_label)

# Files inside the 'example' folder.
for leaf_label in (
    ':page_facing_up: example_predictions.parquet',
    ':page_facing_up: example_validation_predictions.parquet',
):
    example_tree.add(leaf_label)

console.print(tree)

<IPython.core.display.Javascript object>

### Inference

In [None]:
#slow
# initialization
# Only the directory name is assigned in this cell; it is reused further down
# to render the example directory tree.
# NOTE(review): the downloader calls below are commented out, presumably to
# avoid large downloads when the notebook is executed - uncomment to run them.
inference_base_directory = "test_numerai_classic_inference_1234321"
# numer_classic_downloader = NumeraiClassicDownloader(inference_base_directory)

# Download tournament (inference) data
# numer_classic_downloader.download_inference_data("inference", version=2, int8=False)

# Remove folder when done with inference
# numer_classic_downloader.remove_base_directory()

__For the inference example the directory structure will be:__

In [None]:
#hide_input
# Draw the folder layout that the inference example produces.
console = Console(record=True, width=100)

tree = Tree(
    f":file_folder: {inference_base_directory} (base_directory)",
    guide_style="bold bright_black",
)

# Single sub-folder holding the tournament data file.
inference_tree = tree.add(":file_folder: inference")
inference_tree.add(
    ':page_facing_up: numerai_tournament_data.parquet'
)

console.print(tree)

### Additional tests

In [None]:
#slow
test_dir_classic = "test_numerai_classic_1234321"
numer_classic_downloader = NumeraiClassicDownloader(test_dir_classic)
try:
    # Downloading example data
    numer_classic_downloader.download_example_data("test1/", version=1)
    numer_classic_downloader.download_example_data("test2/", version=2, round_num=290)

    # Features
    feature_stats_test = numer_classic_downloader.get_classic_features()
    assert isinstance(feature_stats_test, dict)
finally:
    # Remove contents, even when a download or assertion above fails
    numer_classic_downloader.remove_base_directory()
assert not os.path.exists(test_dir_classic)

2022-01-04 22:44:19,536 INFO numerapi.utils: starting download
test_numerai_classic_1234321/test1/example_predictions.csv: 51.2MB [01:43, 495kB/s]                             


2022-01-04 22:46:04,826 INFO numerapi.utils: starting download
test_numerai_classic_1234321/test1/example_validation_predictions.csv: 19.6MB [00:05, 3.85MB/s]                            


2022-01-04 22:46:11,597 INFO numerapi.utils: starting download
test_numerai_classic_1234321/test2/example_predictions.parquet: 33.5MB [01:35, 352kB/s]                             


2022-01-04 22:47:48,664 INFO numerapi.utils: starting download
test_numerai_classic_1234321/test2/example_validation_predictions.parquet: 13.0MB [00:28, 451kB/s]                             


2022-01-04 22:48:19,437 INFO numerapi.utils: starting download
test_numerai_classic_1234321/features.json: 441kB [00:00, 471kB/s]                             


<IPython.core.display.Javascript object>

## 3. Yahoo Finance

<IPython.core.display.Javascript object>

## 4. FinnHub

<IPython.core.display.Javascript object>

## 5. Bloomberg?

<IPython.core.display.Javascript object>

In [None]:
#hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04a_model.ipynb.
Converted 04b_modelpipeline.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_prediction_dataset.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_auth.ipynb.
Converted 09_submission.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>