In [1]:
from concurrent.futures import ProcessPoolExecutor
from itertools import islice
import mmap
import os
from pathlib import Path

import polars as pl

import src.main.python.utils

In [2]:
# Hooray for local Python imports
pretty_print_solution = src.main.python.utils.pretty_print_solution
update_in_place = src.main.python.utils.update_in_place
batch_indices = src.main.python.utils.batch_indices
process_batch = src.main.python.utils.process_batch
consolidate_accumulators = src.main.python.utils.consolidate_accumulators
process_batch_from_indices = src.main.python.utils.process_batch_from_indices
process_batch_from_indices_mmap = src.main.python.utils.process_batch_from_indices_mmap
Accumulator = src.main.python.utils.Accumulator
PathLike = src.main.python.utils.PathLike

In [3]:
here = Path(os.path.abspath("")).resolve()
data_dir = here / "data"
n_lines = 1_000_000
data_path = data_dir / f"measurements_{n_lines}.txt"
data_path = data_dir / "measurements.txt"

# How fast can we iterate through the file?

Execution times on real data set:
- 2min 21s
- 1min 18s

In [4]:
%%time
with open(data_path) as fp:
    for line in fp:
        pass

CPU times: user 83.9 ms, sys: 37.1 ms, total: 121 ms
Wall time: 125 ms


In [5]:
%%time
with open(data_path, "r+b") as fp, mmap.mmap(fp.fileno(), 0) as mm:
    while (line := mm.readline()):
        pass

CPU times: user 46.2 ms, sys: 0 ns, total: 46.2 ms
Wall time: 53.4 ms


# Polars

Execution times on real data set:
- 53.7s

In [11]:
%%time
records = (
    pl.scan_csv(data_path, has_header=False, separator=";", new_columns=["station", "temperature"])
    .group_by("station")
    .agg(
        pl.col("temperature").min().name.suffix("_min"),
        pl.col("temperature").mean().name.suffix("_avg"), 
        pl.col("temperature").max().name.suffix("_max")
    )
    .sort(pl.col("station"))
    .collect(streaming=True)
    .to_dicts()
)
print(len(records))
pretty_print_solution({bytes(d["station"], "utf-8"): [d["temperature_min"], d["temperature_avg"], d["temperature_max"], 1] for d in records})

8920
{A Coruña=-99.9/-0.11912247864439159/99.9, Aarhus=-99.9/-0.14951429895806068/99.9, Aarsâl=-99.9/0.22981449999999984/99.9, Aartselaar=-99.9/0.32523953960187113/99.9, Aasiaat=-99.9/0.11573705776534075/99.9, Abaeté=-99.9/-0.07344627115082569/99.9, Abakaliki=-99.9/-0.13766748234916462/99.9, Abasolo=-99.9/-0.37161345125668344/99.9, Abbeville=-99.9/0.017868656716418463/99.9, Abbiategrasso=-99.9/0.14296439381870327/99.9, Abbigeri=-99.9/0.2879349603282073/99.9, Abdullahnagar=-99.9/-0.05839346752753485/99.9, Abdurahmoni Jomí=-99.9/0.07516243472714929/99.9, Abelardo Luz=-99.9/-0.2254991797046938/99.9, Abergavenny=-99.9/-0.33348954326082364/99.9, Abergele=-99.9/-0.2636271264876603/99.9, Abertawe=-99.9/-0.07055687026330673/99.9, Abertillery=-99.9/-0.23252246591464487/99.9, Abhwar=-99.9/0.0969644267194424/99.9, Abinsk=-99.9/0.09639398865143538/99.9, Abjīj=-99.9/-0.09321193527636297/99.9, Ablu=-99.9/0.21373520866216913/99.9, Aboso=-99.9/-0.4366198170762253/99.9, Abqaiq=-99.9/0.18925574589649982

# Naive solution

Execution times on real data set:
- 15min 4s
- 15min 57s

In [7]:
%%time
d = {}  # station_name => [min, avg, max, count]
with open(data_path, "rb") as fp:
    for line in fp:
        update_in_place(line, d)
# pretty_print_solution(d)

CPU times: user 530 ms, sys: 13.6 ms, total: 544 ms
Wall time: 585 ms


In [8]:
%%time
d = {}  # station_name => [min, avg, max, count]
with open(data_path, "r+b") as fp, mmap.mmap(fp.fileno(), 0) as mm:
    while (line := mm.readline()):
        update_in_place(line, d)
# pretty_print_solution(d)

CPU times: user 485 ms, sys: 0 ns, total: 485 ms
Wall time: 524 ms


# Batching

Execution time in real data set:
- 13min 9s

In [9]:
%%time
from typing import Iterator


def iter_process_batches(filepath: PathLike, indices: tuple[int]) -> Iterator[Accumulator]:
    with open(filepath, "rb") as fp:
        for start, end in zip(indices, indices[1:]):
            fp.seek(start)
            yield process_batch(fp.read(end - start))

indices = batch_indices(data_path, 8)
consolidated = consolidate_accumulators(iter_process_batches(data_path, indices))
# pretty_print_solution(consolidated)

CPU times: user 548 ms, sys: 10.2 ms, total: 559 ms
Wall time: 604 ms


# Parallelization

In [5]:
def batched(iterable, n):
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch

Execution time on real dataset:
- 3min 48s

In [6]:
%%time
indices = batch_indices(data_path, 1500)
index_tuples = zip(indices, indices[1:])
results = []
n_workers = 10
with ProcessPoolExecutor(max_workers=n_workers) as executor:
    for i, index_batch in enumerate(batched(index_tuples, n_workers)):
        # if i > 0: break
        result = executor.map(
            process_batch_from_indices,
            [(data_path, start, end) for start, end in index_batch],
        )
        results.extend(result)
final_result = consolidate_accumulators(results)
print(len(final_result))
pretty_print_solution(final_result)

8920
{A Coruña=-99.9/-0.11912247864439188/99.9, Aarhus=-99.9/-0.1495142989580607/99.9, Aarsâl=-99.9/0.2298145000000001/99.9, Aartselaar=-99.9/0.32523953960187213/99.9, Aasiaat=-99.9/0.1157370577653407/99.9, Abaeté=-99.9/-0.073446271150825/99.9, Abakaliki=-99.9/-0.13766748234916573/99.9, Abasolo=-99.9/-0.3716134512566832/99.9, Abbeville=-99.9/0.017868656716417863/99.9, Abbiategrasso=-99.9/0.14296439381870324/99.9, Abbigeri=-99.9/0.2879349603282079/99.9, Abdullahnagar=-99.9/-0.05839346752753511/99.9, Abdurahmoni Jomí=-99.9/0.07516243472714967/99.9, Abelardo Luz=-99.9/-0.22549917970469374/99.9, Abergavenny=-99.9/-0.33348954326082414/99.9, Abergele=-99.9/-0.263627126487661/99.9, Abertawe=-99.9/-0.07055687026330795/99.9, Abertillery=-99.9/-0.2325224659146449/99.9, Abhwar=-99.9/0.09696442671944173/99.9, Abinsk=-99.9/0.09639398865143556/99.9, Abjīj=-99.9/-0.0932119352763621/99.9, Ablu=-99.9/0.21373520866216927/99.9, Aboso=-99.9/-0.4366198170762256/99.9, Abqaiq=-99.9/0.18925574589649896/99.9, 

Execution time on real dataset:
- 3min 48s

In [7]:
%%time
indices = batch_indices(data_path, 1500)
index_tuples = zip(indices, indices[1:])
results = []
n_workers = 10
with (
        ProcessPoolExecutor(max_workers=n_workers) as executor,
        open(data_path, "r+b") as fp,
        mmap.mmap(fp.fileno(), 0) as mm
):
    for i, index_batch in enumerate(batched(index_tuples, n_workers)):
        # if i > 0: break
        result = executor.map(
            process_batch,
            [mm[start:(end-start)] for start, end in index_batch],
        )
        results.extend(result)
final_result = consolidate_accumulators(results)
print(len(final_result))
pretty_print_solution(final_result)

8920
{A Coruña=-92.0/-11.328000000000005/98.2, Aarhus=-99.6/8.045161290322582/98.1, Aarsâl=-98.6/-4.349624060150377/98.1, Aartselaar=-99.9/6.6696202531645525/99.0, Aasiaat=-99.3/-4.840875912408761/97.5, Abaeté=-94.6/-1.3254545454545443/99.8, Abakaliki=-97.8/-0.5322580645161281/90.3, Abasolo=-98.2/-1.8423728813559324/99.7, Abbeville=-99.0/-0.6014285714285705/98.0, Abbiategrasso=-95.1/9.432432432432435/98.1, Abbigeri=-99.1/-8.14864864864865/99.7, Abdullahnagar=-97.0/-2.3282051282051275/98.5, Abdurahmoni Jomí=-98.7/-10.81830985915493/97.9, Abelardo Luz=-99.1/-5.766265060240963/98.8, Abergavenny=-98.2/0.9785714285714276/99.5, Abergele=-94.3/-5.033870967741936/90.3, Abertawe=-94.4/3.1157894736842118/99.1, Abertillery=-99.4/-7.972058823529412/94.4, Abhwar=-96.7/-9.390540540540542/98.2, Abinsk=-96.3/-5.491891891891894/96.1, Abjīj=-98.2/1.8984375/99.0, Ablu=-99.8/-5.230263157894736/96.6, Aboso=-97.3/-6.068253968253968/94.1, Abqaiq=-95.9/12.916417910447759/99.6, Abra Pampa=-92.0/12.144444444444