In [None]:
# hide
%load_ext nb_black

<IPython.core.display.Javascript object>

In [None]:
# export
import logging
import math
import os
import shlex
import time
from pathlib import Path
from typing import Collection

import fastparquet
import prefect
from prefect import task, Flow
from prefect.engine.signals import SKIP
from prefect.tasks.shell import ShellTask
from spell.metrics import send_metric

from neuralmusic.midi import parse_midi_file

# Module-level logger registered under the name "data.etl".
log = logging.getLogger("data.etl")

In [None]:
# default_exp data.etl

<IPython.core.display.Javascript object>

# ETL to process MIDI files

> Turning a bunch of MIDI files into parquet data

This ETL takes either a `.tar.gz` archive or a folder full of arbitrary MIDI files and outputs a set of Parquet files containing dataframes with (pitch, duration, velocity) triplets for each song.

First we need some global variables and reporting tools to log the progress. This ETL may run locally or on Spell.

In [None]:
# export
# Running counters for the current ETL run. They are incremented by
# `process_and_write` and zeroed by `init_stats`.
total_songs = 0  # every MIDI file visited, whether it parsed or not
malformed_songs = 0  # files for which `parse_midi_file` returned None
valid_songs = 0  # files that parsed into a dataframe
valid_rows = 0  # sum of dataframe lengths over all valid songs

# Timestamp (from time.time()) of the last `init_stats` call; None until then.
started_at = None


def init_stats():
    """
    Starts a fresh reporting window: every counter goes back to zero and the
    start timestamp is set to the current time.
    """
    global total_songs, malformed_songs, valid_songs, valid_rows, started_at

    total_songs = malformed_songs = valid_songs = valid_rows = 0
    started_at = time.time()


def report(logger):
    """
    Sends the current run metrics through `send_metric`.

    Five metrics are emitted: the total, malformed and valid song counters,
    plus songs-per-second and rows-per-second computed over the wall-clock
    time elapsed since `init_stats` was called.

    `logger` is accepted so callers can keep passing it, but it is not used:
    nothing is written to it.

    If `init_stats` has not been called yet (`started_at` is still None), or
    no measurable time has elapsed, both rates are reported as 0.0 instead of
    raising.
    """
    # `started_at` is None until `init_stats` runs in this process.
    elapsed = 0.0 if started_at is None else time.time() - started_at

    def per_second(count):
        # Avoid dividing by zero (or by a negative span if the clock moved).
        return count / elapsed if elapsed > 0 else 0.0

    send_metric("Total Songs", total_songs)
    send_metric("Malformed Songs", malformed_songs)
    send_metric("Songs", valid_songs)
    send_metric("Total Songs / second", per_second(total_songs))
    send_metric("Rows / second", per_second(valid_rows))

## Untar'ing the file

The first step is to untar the file containing the MIDI files.

In [None]:
# export
@task
def untar_cmd(file_path: str, outdir: str) -> str:
    """
    Builds the shell command that untars a .tar.gz file onto a directory
    (the command creates the directory if it does not exist).

    Both paths are shell-quoted, so paths containing spaces or shell
    metacharacters are passed to `mkdir` and `tar` as single arguments.

    Raises SKIP when `outdir` already exists, in which case no command is
    produced.
    """
    if os.path.exists(outdir):
        raise SKIP("Output directory already exists.")
    # str() keeps path-like arguments working, as they did with the f-string.
    quoted_outdir = shlex.quote(str(outdir))
    quoted_file = shlex.quote(str(file_path))
    return f"mkdir -p {quoted_outdir} && tar -zxf {quoted_file} -C {quoted_outdir}"


# ShellTask instance named "untar_task"; `build_etl` calls it with the command
# string produced by `untar_cmd`.
untar = ShellTask(name="untar_task")

## Partitioning the files in minibatches

Since the tar.gz file may contain a huge number of MIDI files, we'll partition those files into minibatches that we can process in parallel.

In [None]:
# export
@task(skip_on_upstream_skip=False)
def partition_files(
    data_path: str, partition_size: int = 100, min_partitions: int = 4
) -> list:
    """
    Partitions the midi files in data_path into chunks.

    Every `*.mid` file found recursively under `data_path` is collected in
    sorted order, so the partitioning is the same on every run, and split
    into consecutive chunks of at most `partition_size` files. If that would
    produce fewer than `min_partitions` chunks, the chunk size is reduced to
    `ceil(n / min_partitions)`.

    A `partition_size` of None is treated as the default of 100.

    Returns a list of lists of `Path` objects, or an empty list when no MIDI
    file is found.
    """
    if partition_size is None:
        # Callers may forward an unset config value; fall back to the default.
        partition_size = 100

    logger = prefect.context.get("logger")
    midi_files = sorted(Path(data_path).glob("**/*.mid"))
    n = len(midi_files)
    if n == 0:
        # Without this guard the chunk size below becomes 0 and range()
        # raises because its step must not be zero.
        logger.warning(f"No MIDI files found under {data_path}")
        return []

    if (n / partition_size) < min_partitions:
        partition_size = math.ceil(n / min_partitions)
    logger.info(
        f"Processing {n} MIDI files partitioned into groups of {partition_size}"
    )
    return [midi_files[i : i + partition_size] for i in range(0, n, partition_size)]

## Processing a minibatch

For each minibatch, we'll go through its MIDI files, parse them, and write them to a separate Parquet file.

In [None]:
# export
@task
def process_and_write(mini_batch: Collection[str], outdir: str) -> str:
    """
    Parses a mini batch of MIDI files and writes the results to a parquet file.
    The filename is determined by `map_index`: `<outdir>/out_<map_index>.parq`.

    Each file that parses is written to that parquet file: the first write
    creates or overwrites it and later ones append. Files for which
    `parse_midi_file` returns None are counted as malformed and logged. The
    module-level counters are updated and `report` is called after every file.

    Returns the path of the parquet file. The path is returned even when no
    file in the mini batch could be parsed, in which case nothing was written.
    """
    frame_no = prefect.context.get("map_index")
    Path(outdir).mkdir(parents=True, exist_ok=True)
    outfile = f"{outdir}/out_{frame_no}.parq"

    logger = prefect.context.get("logger")

    # False for the first successful write, so the file starts fresh.
    should_append = False

    global total_songs
    global valid_songs
    global valid_rows
    global malformed_songs

    for file in mini_batch:
        df = parse_midi_file(file)
        if df is not None:
            valid_songs += 1
            valid_rows += len(df)

            fastparquet.write(outfile, df, compression="SNAPPY", append=should_append)
            # Drop the reference before the next file is parsed.
            del df
            should_append = True
        else:
            malformed_songs += 1
            logger.warning(f"[Minibatch {frame_no}] {file} could not be processed.")

        total_songs += 1
        report(logger)

    return outfile

## Merging the parquet files

Once we have all the minibatches in separate parquet files, merging them into a single dataset is trivial.

In [None]:
# export
@task
def combine_parquet_files(files: Collection[str]) -> None:
    """
    Combines N parquet files with the same schema into a single dataset.

    Delegates to `fastparquet.writer.merge` with schema verification turned
    off.

    Paths that do not exist on disk are dropped first, because
    `process_and_write` returns its output path even when none of its MIDI
    files could be parsed and nothing was written. If no file is left, a
    warning is logged and the task raises SKIP instead of calling `merge`
    with an empty list.
    """
    existing = [f for f in (files or []) if f and os.path.exists(f)]
    if not existing:
        logger = prefect.context.get("logger")
        logger.warning("No parquet files were produced; nothing to combine.")
        raise SKIP("No parquet files to combine.")
    fastparquet.writer.merge(existing, verify_schema=False)

## Putting everything together

Now we can build the ETL flow!

In [None]:
# export


def build_etl(cfg):
    """
    Builds the ETL flow.

    Attributes read from `cfg` (a missing attribute is treated as unset):
      - tar_gz_path: archive of MIDI files, untarred into "data" when given.
      - midi_path: folder of MIDI files, used only when tar_gz_path is unset.
      - outdir: directory passed to `process_and_write` (required).
      - partition_size: optional; when unset, `partition_files` uses its own
        default.

    Raises AssertionError when neither input path is configured, when outdir
    is missing, or when the configured input path does not exist.

    Returns the Prefect flow: untar (optional) -> partition -> process each
    mini batch -> combine.
    """
    # getattr with a default works for configs that return None for missing
    # keys and for configs that raise AttributeError on them.
    tar_gz_cfg = getattr(cfg, "tar_gz_path", None)
    midi_cfg = getattr(cfg, "midi_path", None)
    outdir = getattr(cfg, "outdir", None)
    partition_size = getattr(cfg, "partition_size", None)

    assert (
        tar_gz_cfg or midi_cfg
    ), "Config not found: data.etl.tar_gz_path or data.etl.midi_path"
    assert outdir, "Config not found: data.etl.outdir"

    # Validate the input path before any task is added to a flow.
    if tar_gz_cfg:
        tar_gz_path = Path(tar_gz_cfg).resolve()
        assert tar_gz_path.exists(), f"{tar_gz_path} does not exist"
    else:
        assert Path(midi_cfg).resolve().exists(), f"{midi_cfg} does not exist"

    # Only forward partition_size when it is configured.
    partition_kwargs = {}
    if partition_size is not None:
        partition_kwargs["partition_size"] = partition_size

    with Flow("Neuralmusic Data ETL") as flow:
        if tar_gz_cfg:
            command = untar_cmd(str(tar_gz_path), "data")
            untarred = untar(command=command)
            midi_path = "data"
            upstream = [untarred]
        else:
            midi_path = midi_cfg
            upstream = []

        mini_batches = partition_files(
            midi_path, upstream_tasks=upstream, **partition_kwargs
        )

        partitions = process_and_write.map(mini_batches, outdir=outdir)

        combine_parquet_files(partitions)

    return flow

## Testing the ETL

The ETL accepts a `tar.gz` file input containing MIDI files:

In [None]:
# test
from testing import test_eq, path
from omegaconf import OmegaConf
import fastparquet

tmp_path = "/tmp/neuralmusic_etl"

targz_path = path("data/midi.tar.gz")

dot_list = [f"tar_gz_path={targz_path}", f"outdir={tmp_path}", "partition_size=1"]
etl_cfg = OmegaConf.from_dotlist(dot_list)
flow = build_etl(etl_cfg)

# init_stats() zeroes the counters and stamps `started_at`, so the flow can
# run right after it without setting `started_at` by hand.
init_stats()
state = flow.run()

# flow.run() hands back the final state rather than raising when a task
# fails, so check it before looking at the counters.
assert state.is_successful(), f"ETL flow did not succeed: {state}"

test_eq(4, total_songs)
test_eq(0, malformed_songs)
test_eq(4, valid_songs)
test_eq(4, valid_rows)


df = fastparquet.ParquetFile(tmp_path, verify=True).to_pandas()
test_eq(4, len(df))

# TODO: figure out order!
# test_eq(["7.11.2", "7", "7"], pitches[0:3])
# test_eq([1.75, 0.5, 0.5], durations[0:3])
# test_eq([110, 110, 110], velocities[0:3])

[2019-12-21 16:07:51,597] INFO - prefect.FlowRunner | Beginning Flow run for 'Neuralmusic Data ETL'
[2019-12-21 16:07:51,600] INFO - prefect.FlowRunner | Starting flow run.
[2019-12-21 16:07:51,626] INFO - prefect.TaskRunner | Task 'untar_cmd': Starting task run...
[2019-12-21 16:07:51,643] INFO - prefect.TaskRunner | Task 'untar_cmd': finished task run for task with final state: 'Skipped'
[2019-12-21 16:07:51,665] INFO - prefect.TaskRunner | Task 'untar_task': Starting task run...
[2019-12-21 16:07:51,677] INFO - prefect.TaskRunner | Task 'untar_task': finished task run for task with final state: 'Skipped'
[2019-12-21 16:07:51,699] INFO - prefect.TaskRunner | Task 'partition_files': Starting task run...
[2019-12-21 16:07:51,702] INFO - prefect.Task: partition_files | Processing 4 MIDI files partitioned into groups of 1
[2019-12-21 16:07:51,719] INFO - prefect.TaskRunner | Task 'partition_files': finished task run for task with final state: 'Success'
[2019-12-21 16:07:51,734] INFO - pr

It also accepts a path to a folder with MIDI files:

In [None]:
# test
from testing import test_eq, path
from omegaconf import OmegaConf
import fastparquet

tmp_path = "/tmp/neuralmusic_etl"

midi_path = path("data")

dot_list = [f"midi_path={midi_path}", f"outdir={tmp_path}", "partition_size=1"]
etl_cfg = OmegaConf.from_dotlist(dot_list)
flow = build_etl(etl_cfg)

# init_stats() zeroes the counters and stamps `started_at`, so the flow can
# run right after it without setting `started_at` by hand.
init_stats()
state = flow.run()

# flow.run() hands back the final state rather than raising when a task
# fails, so check it before looking at the counters.
assert state.is_successful(), f"ETL flow did not succeed: {state}"

test_eq(4, total_songs)
test_eq(0, malformed_songs)
test_eq(4, valid_songs)
test_eq(4, valid_rows)


df = fastparquet.ParquetFile(tmp_path, verify=True).to_pandas()
test_eq(4, len(df))

# TODO: figure out order!
# test_eq(["7.11.2", "7", "7"], pitches[0:3])
# test_eq([1.75, 0.5, 0.5], durations[0:3])
# test_eq([110, 110, 110], velocities[0:3])

[2019-12-21 16:08:00,888] INFO - prefect.FlowRunner | Beginning Flow run for 'Neuralmusic Data ETL'
[2019-12-21 16:08:00,905] INFO - prefect.FlowRunner | Starting flow run.
[2019-12-21 16:08:00,926] INFO - prefect.TaskRunner | Task 'partition_files': Starting task run...
[2019-12-21 16:08:00,934] INFO - prefect.Task: partition_files | Processing 4 MIDI files partitioned into groups of 1
[2019-12-21 16:08:00,966] INFO - prefect.TaskRunner | Task 'partition_files': finished task run for task with final state: 'Success'
[2019-12-21 16:08:00,984] INFO - prefect.TaskRunner | Task 'process_and_write': Starting task run...
[2019-12-21 16:08:00,999] INFO - prefect.TaskRunner | Task 'process_and_write[0]': Starting task run...
[2019-12-21 16:08:01,764] INFO - prefect.TaskRunner | Task 'process_and_write[0]': finished task run for task with final state: 'Success'
[2019-12-21 16:08:01,772] INFO - prefect.TaskRunner | Task 'process_and_write[1]': Starting task run...
[2019-12-21 16:08:02,979] INFO