In [1]:
import mmap
import os
from pathlib import Path

import polars as pl

import src.main.python.utils

In [2]:
# Hooray for local Python imports
pretty_print_solution = src.main.python.utils.pretty_print_solution
update_in_place = src.main.python.utils.update_in_place
batch_indices = src.main.python.utils.batch_indices
process_batch = src.main.python.utils.process_batch
consolidate_accumulators = src.main.python.utils.consolidate_accumulators
process_batch_from_indices = src.main.python.utils.process_batch_from_indices
Accumulator = src.main.python.utils.Accumulator
PathLike = src.main.python.utils.PathLike

In [3]:
here = Path(os.path.abspath("")).resolve()
data_dir = here / "data"
n_lines = 1_000_000
data_path = data_dir / f"measurements_{n_lines}.txt"
# data_path = data_dir / "measurements.txt"

# How fast can we iterate through the file?

Execution times on real data set:
- 2min 21s
- 1min 18s

In [4]:
%%time
with open(data_path) as fp:
    for line in fp:
        pass

CPU times: user 83.9 ms, sys: 37.1 ms, total: 121 ms
Wall time: 125 ms


In [5]:
%%time
with open(data_path, "r+b") as fp, mmap.mmap(fp.fileno(), 0) as mm:
    while (line := mm.readline()):
        pass

CPU times: user 46.2 ms, sys: 0 ns, total: 46.2 ms
Wall time: 53.4 ms


# Polars

Execution times on real data set:
- 53.7s

In [6]:
%%time
records = (
    pl.scan_csv(data_path, has_header=False, separator=";", new_columns=["station", "temperature"])
    .group_by("station")
    .agg(
        pl.col("temperature").min().name.suffix("_min"),
        pl.col("temperature").mean().name.suffix("_avg"), 
        pl.col("temperature").max().name.suffix("_max")
    )
    .sort(pl.col("station"))
    .collect(streaming=True)
    .to_dicts()
)
#pretty_print_solution({d["station"]: [d["temperature_min"], d["temperature_avg"], d["temperature_max"], 1] for d in records})

CPU times: user 295 ms, sys: 89.9 ms, total: 385 ms
Wall time: 83.6 ms


# Naive solution

Execution times on real data set:
- 15min 4s
- 15min 57s

In [7]:
%%time
d = {}  # station_name => [min, avg, max, count]
with open(data_path, "rb") as fp:
    for line in fp:
        update_in_place(line, d)
# pretty_print_solution(d)

CPU times: user 530 ms, sys: 13.6 ms, total: 544 ms
Wall time: 585 ms


In [8]:
%%time
d = {}  # station_name => [min, avg, max, count]
with open(data_path, "r+b") as fp, mmap.mmap(fp.fileno(), 0) as mm:
    while (line := mm.readline()):
        update_in_place(line, d)
# pretty_print_solution(d)

CPU times: user 485 ms, sys: 0 ns, total: 485 ms
Wall time: 524 ms


# Batching

Execution time in real data set:
- 13min 9s

In [9]:
%%time
from typing import Iterator


def iter_process_batches(filepath: PathLike, indices: tuple[int]) -> Iterator[Accumulator]:
    with open(filepath, "rb") as fp:
        for start, end in zip(indices, indices[1:]):
            fp.seek(start)
            yield process_batch(fp.read(end - start))

indices = batch_indices(data_path, 8)
consolidated = consolidate_accumulators(iter_process_batches(data_path, indices))
# pretty_print_solution(consolidated)

CPU times: user 548 ms, sys: 10.2 ms, total: 559 ms
Wall time: 604 ms


# Parallelization

Execution time on real dataset:
- 14min 2s

In [10]:
%%time
if __name__ == "__main__":
    indices = batch_indices(data_path, 8)
    with ProcessPoolExecutor(max_workers=8) as executor:
        results = executor.map(
            process_batch_from_indices, 
            [(data_path, start, end) for start, end in zip(indices, indices[1:])]
        )
    result = consolidate_accumulators(*results)
    # pretty_print_solution(result)

NameError: name 'ProcessPoolExecutor' is not defined