# Faking 10 Files

1. Always build the output with 10 chunks (partitions)
1. If the actual number is less than 10, then have zero items in some of the chunks
1. If more than 10 files, try to combine them with uproot5 and dask or similar.

## Faker

Initialize with the proper number of files we'll eventually generate. Mostly copied from the `with_histo` notebook.

In [41]:
import random
from typing import List, Optional

import awkward as ak
import dask
import dask_awkward as dak

def make_input_layer(name: str, inputs: List[str], npartitions=10):
    '''Create an AwkwardInput layer with `inputs` chunks. Each chunk
    has some random numbers in it (100 of them). This will always generate
    10 partitions. If `len(inputs)` is less than 10, then some of the partitions
    will be empty. If `len(inputs)` is greater than 10, then we will
    combine some of them.

    Args:
        name (str): Name of the input layer (for the graph)
        inputs (List[str]): Names of each partition

    Returns:
        AwkwardInputLayer: Input Layer
    '''
    def generate_data(block):
        '''Generate the data for a particular partition. We have the job of
        figuring out how many files per partition.

        Args:
            block (int): The block number
        '''
        if len(inputs) <= npartitions:
            if block < len(inputs):
                return generate_partition_data(inputs[block])
            else:
                return ak.from_iter([])
        else:
            num_per = int(len(inputs) / npartitions)
            if block < (len(inputs) - num_per*npartitions):
                num_per += 1
            
            block_index = num_per * block
            data = [generate_partition_data(inputs[block_index + i]) for i in range(0, num_per)]
            return ak.concatenate(data)

    def generate_partition_data(block):
        '''Generate the partition data for a single block.

        Args:
            block (_type_): _description_

        Returns:
            _type_: _description_
        '''
        print(f'In generate_data: {block}')
        return ak.from_iter([random.uniform(0, 10) for i in range(0, 100)])

    # Build the metadata for this array we will be returning. Each partition
    # will be of this form.
    sample_array = ak.from_iter([1.0, 2.1, 3.2, 4.3, 5.4])
    metadata = dak.core.typetracer_array(sample_array)

    # Next, create the input layer that will be used to generate the data.
    # Always setup 10 partitions
    dsk = dak.layers.AwkwardInputLayer(
            name=name,
            columns=None,
            inputs=list(range(0, npartitions)),
            io_func=generate_data,
            meta=metadata,
            behavior=None,
        )

    return dsk

def generate_sx_daq(query: str, inputs: Optional[List[str]] = None, n_partitions = 10) -> dak.Array:
    '''Build a dask-awkward array of fake data split into `n_partitions` partitions.

    Args:
        query (str): The query text. It is not used when faking the data.
        inputs (Optional[List[str]]): Names of the inputs to spread over the
            partitions. Defaults to `['0', '1']` when not given.
        n_partitions (int): Number of partitions in the returned array.
            Defaults to 10.

    Returns:
        dak.Array: Lazy array backed by the input layer from `make_input_layer`
    '''
    # Build the default per call so that no list object is shared between
    # calls (the list is captured by the layer's data generating function).
    if inputs is None:
        inputs = ['0', '1']

    # NOTE(review): the layer name is hard-coded, so every array built here
    # uses the same graph key - confirm that is fine if several arrays are
    # computed together.
    name = 'unique-name'
    input_layer = make_input_layer(name, inputs, npartitions=n_partitions)

    # Create the high level graph that will hold all of this, and the actual array object
    hlg = dask.highlevelgraph.HighLevelGraph.from_collections(name, input_layer)
    ar = dak.core.new_array_object(hlg, name, meta=input_layer._meta, npartitions=n_partitions)

    return ar

## Make a histogram with exactly 10 partitions

This is the easy case!

In [42]:
import dask_histogram as dh
import mplhep as hep

# Ten inputs for the default ten partitions.
input_names = [f'b_{i}' for i in range(10)]
x = generate_sx_daq("(query)", inputs=input_names)

# One regular axis: 20 bins between 0 and 10.
value_axis = dh.axis.Regular(20, 0, 10)
h = dh.factory(x, axes=(value_axis,))
r = h.compute()
#_ = hep.histplot(r)

# 10 inputs with 100 generated values each.
assert r.sum() == 1000

In generate_data: b_0
In generate_data: b_1
In generate_data: b_2
In generate_data: b_3
In generate_data: b_4
In generate_data: b_5
In generate_data: b_6
In generate_data: b_7
In generate_data: b_8
In generate_data: b_9


## Make a histogram with 5 partitions

So 5 of the 10 partitions should be empty.

In [43]:
import dask_histogram as dh
import mplhep as hep

# Only five inputs for the default ten partitions.
input_names = [f'b_{i}' for i in range(5)]
x = generate_sx_daq("(query)", inputs=input_names)

# One regular axis: 20 bins between 0 and 10.
value_axis = dh.axis.Regular(20, 0, 10)
h = dh.factory(x, axes=(value_axis,))
r = h.compute()
#_ = hep.histplot(r)

# 5 inputs with 100 generated values each.
assert r.sum() == 500

In generate_data: b_0In generate_data: b_1
In generate_data: b_2
In generate_data: b_3
In generate_data: b_4



## Make a histogram with 15 partitions

This means that some of the partitions should have more than one input file in them.

In [45]:
import dask_histogram as dh
import mplhep as hep

# Fifteen inputs for the default ten partitions.
input_names = [f'b_{i}' for i in range(15)]
x = generate_sx_daq("(query)", inputs=input_names)

# One regular axis: 20 bins between 0 and 10.
value_axis = dh.axis.Regular(20, 0, 10)
h = dh.factory(x, axes=(value_axis,))
r = h.compute()

# This only checks the total number of entries (15 inputs, 100 values each);
# it does not check which inputs were read.
assert r.sum() == 1500

In generate_data: b_0In generate_data: b_2
In generate_data: b_3
In generate_data: b_4
In generate_data: b_5
In generate_data: b_6
In generate_data: b_7
In generate_data: b_8
In generate_data: b_9
In generate_data: b_5
In generate_data: b_6
In generate_data: b_7
In generate_data: b_8
In generate_data: b_9

In generate_data: b_1
