In [1]:
!pip install -q webdataset

In [2]:
import os
import json
import multiprocessing
from pathlib import Path

import polars as pl
import webdataset as wds
from tqdm import tqdm
import joblib
import pandas as pd
from webdataset import TarWriter

from graphnet.data.constants import FEATURES, TRUTH


[1;34mgraphnet[0m: [32mINFO    [0m 2023-03-14 20:48:45 - get_logger - Writing log to [1mlogs/graphnet_20230314-204845.log[0m


In [3]:
meta_data_path = Path("../../raw/icecube-neutrinos-in-deep-ice/train_meta.parquet")
geometry_path = Path("../../raw/icecube-neutrinos-in-deep-ice/sensor_geometry.csv")
input_data_path = "../../raw/icecube-neutrinos-in-deep-ice/train/batch_{batch_id}.parquet"
shard_dir = Path("./webdatasets")
shard_dir.mkdir(exist_ok=True)

shard_filename = str(shard_dir / 'shards-%04d.tar')

shard_size = int(1 * 1024**3)

In [3]:
meta_data = pl.read_parquet(meta_data_path)
geometry_table = pl.read_csv(geometry_path, dtypes={"sensor_id": pl.Int16})

In [4]:
def get_features_truth(event_id):
    truth = meta_data_batch.filter(pl.col("event_id") == event_id)[TRUTH.KAGGLE].to_numpy()
    features = (
        df_batch.filter(pl.col("event_id") == event_id)
            .join(geometry_table, on="sensor_id", how="left")[FEATURES.KAGGLE]
            .to_numpy()
    )
    return features, truth

In [5]:
def write_one_batch(batch_id, meta_data, geometry_table,):
    pattern = "webdatasets/batch-%03d.tar"
    print(f"Start saving batch {batch_id}")
    meta_data_batch = meta_data[meta_data.batch_id == batch_id]
    event_ids = meta_data_batch["event_id"].unique()
    df_batch = pd.read_parquet(input_data_path.format(batch_id=batch_id))
    df_batch[["x", "y", "z"]] = geometry_table.loc[df_batch.sensor_id.values, ["x", "y", "z"]].values

    fname = pattern % batch_id
    stream = TarWriter(fname)
    
    
    for event_id in tqdm(event_ids, desc=f"Batch {batch_id}"):
        df_event = df_batch[df_batch.index == event_id].copy()
        write_samples_into_single_shard(stream, meta_data_batch, event_id, df_event)

    print(f"Finished saving batch {batch_id}")
    stream.close()


def write_samples_into_single_shard(stream, meta_data_batch, event_id, df_batch):
    truth = meta_data_batch[meta_data_batch.event_id == event_id][TRUTH.KAGGLE].values
    features = df_batch.loc[event_id, FEATURES.KAGGLE].values

    data = {
            "__key__": str(event_id),
            "pickle": (
                features, truth,
            )
        }

    size = stream.write(data)
    return size

In [None]:

with wds.ShardWriter(shard_filename, maxsize=shard_size,) as sink:
    for batch_id in batch_ids:
        meta_data_batch = meta_data.filter(pl.col("batch_id") == batch_id)
        event_ids = meta_data_batch["event_id"].unique()
        df_batch = pl.read_parquet(input_data_path.format(batch_id=batch_id))

        for event_id in tqdm(event_ids, desc="fBatch {batch_id}"):
            features, truth = get_features_truth(event_id)
            sink.write({
                "__key__": str(event_id),
                "pickle": (
                    features, truth,
                )
            })
            
        del df_batch, meta_data_batch


In [7]:
batch_ids = range(101, 151)

meta_data = pd.read_parquet(meta_data_path)
geometry_table = pd.read_csv(geometry_path) # dtypes={"sensor_id": pl.Int16}
geometry_table = geometry_table.set_index("sensor_id")


In [8]:
for batch_id in batch_ids:
    write_one_batch(batch_id, meta_data, geometry_table)

Start saving batch 101


Batch 101: 100%|██████████████████████████████████████████| 200000/200000 [1:39:29<00:00, 33.50it/s]


Finished saving batch 101
Start saving batch 102


Batch 102: 100%|██████████████████████████████████████████| 200000/200000 [1:40:49<00:00, 33.06it/s]


Finished saving batch 102
Start saving batch 103


Batch 103: 100%|██████████████████████████████████████████| 200000/200000 [1:43:57<00:00, 32.06it/s]


Finished saving batch 103
Start saving batch 104


Batch 104: 100%|██████████████████████████████████████████| 200000/200000 [1:41:04<00:00, 32.98it/s]


Finished saving batch 104
Start saving batch 105


Batch 105: 100%|██████████████████████████████████████████| 200000/200000 [1:38:37<00:00, 33.80it/s]


Finished saving batch 105
Start saving batch 106


Batch 106: 100%|██████████████████████████████████████████| 200000/200000 [1:41:08<00:00, 32.96it/s]


Finished saving batch 106
Start saving batch 107


Batch 107: 100%|██████████████████████████████████████████| 200000/200000 [1:40:00<00:00, 33.33it/s]


Finished saving batch 107
Start saving batch 108


Batch 108: 100%|██████████████████████████████████████████| 200000/200000 [1:41:03<00:00, 32.99it/s]


Finished saving batch 108
Start saving batch 109


Batch 109: 100%|██████████████████████████████████████████| 200000/200000 [1:40:38<00:00, 33.12it/s]


Finished saving batch 109
Start saving batch 110


Batch 110: 100%|██████████████████████████████████████████| 200000/200000 [1:41:52<00:00, 32.72it/s]


Finished saving batch 110
Start saving batch 111


Batch 111: 100%|██████████████████████████████████████████| 200000/200000 [1:37:40<00:00, 34.13it/s]


Finished saving batch 111
Start saving batch 112


Batch 112: 100%|██████████████████████████████████████████| 200000/200000 [1:45:04<00:00, 31.72it/s]


Finished saving batch 112
Start saving batch 113


Batch 113: 100%|██████████████████████████████████████████| 200000/200000 [1:42:35<00:00, 32.49it/s]


Finished saving batch 113
Start saving batch 114


Batch 114: 100%|██████████████████████████████████████████| 200000/200000 [1:39:09<00:00, 33.62it/s]


Finished saving batch 114
Start saving batch 115


Batch 115: 100%|██████████████████████████████████████████| 200000/200000 [1:44:46<00:00, 31.81it/s]


Finished saving batch 115
Start saving batch 116


Batch 116: 100%|██████████████████████████████████████████| 200000/200000 [1:41:50<00:00, 32.73it/s]


Finished saving batch 116
Start saving batch 117


Batch 117: 100%|██████████████████████████████████████████| 200000/200000 [1:41:11<00:00, 32.94it/s]


Finished saving batch 117
Start saving batch 118


Batch 118: 100%|██████████████████████████████████████████| 200000/200000 [1:41:14<00:00, 32.92it/s]


Finished saving batch 118
Start saving batch 119


Batch 119: 100%|██████████████████████████████████████████| 200000/200000 [1:41:25<00:00, 32.86it/s]


Finished saving batch 119
Start saving batch 120


Batch 120: 100%|██████████████████████████████████████████| 200000/200000 [1:41:45<00:00, 32.76it/s]


Finished saving batch 120
Start saving batch 121


Batch 121: 100%|██████████████████████████████████████████| 200000/200000 [1:35:18<00:00, 34.97it/s]


Finished saving batch 121
Start saving batch 122


Batch 122: 100%|██████████████████████████████████████████| 200000/200000 [1:41:29<00:00, 32.84it/s]


Finished saving batch 122
Start saving batch 123


Batch 123: 100%|██████████████████████████████████████████| 200000/200000 [1:41:42<00:00, 32.77it/s]


Finished saving batch 123
Start saving batch 124


Batch 124: 100%|██████████████████████████████████████████| 200000/200000 [1:43:20<00:00, 32.26it/s]


Finished saving batch 124
Start saving batch 125


Batch 125: 100%|██████████████████████████████████████████| 200000/200000 [1:44:38<00:00, 31.86it/s]


Finished saving batch 125
Start saving batch 126


Batch 126: 100%|██████████████████████████████████████████| 200000/200000 [1:41:56<00:00, 32.70it/s]


Finished saving batch 126
Start saving batch 127


Batch 127: 100%|██████████████████████████████████████████| 200000/200000 [1:41:42<00:00, 32.77it/s]


Finished saving batch 127
Start saving batch 128


Batch 128: 100%|██████████████████████████████████████████| 200000/200000 [1:41:31<00:00, 32.83it/s]


Finished saving batch 128
Start saving batch 129


Batch 129: 100%|██████████████████████████████████████████| 200000/200000 [1:40:44<00:00, 33.09it/s]


Finished saving batch 129
Start saving batch 130


Batch 130: 100%|██████████████████████████████████████████| 200000/200000 [1:40:40<00:00, 33.11it/s]


Finished saving batch 130
Start saving batch 131


Batch 131: 100%|██████████████████████████████████████████| 200000/200000 [1:41:38<00:00, 32.80it/s]


Finished saving batch 131
Start saving batch 132


Batch 132: 100%|██████████████████████████████████████████| 200000/200000 [1:43:38<00:00, 32.16it/s]


Finished saving batch 132
Start saving batch 133


Batch 133: 100%|██████████████████████████████████████████| 200000/200000 [1:43:03<00:00, 32.34it/s]


Finished saving batch 133
Start saving batch 134


Batch 134: 100%|██████████████████████████████████████████| 200000/200000 [1:41:31<00:00, 32.83it/s]


Finished saving batch 134
Start saving batch 135


Batch 135: 100%|██████████████████████████████████████████| 200000/200000 [1:36:38<00:00, 34.49it/s]


Finished saving batch 135
Start saving batch 136


Batch 136: 100%|██████████████████████████████████████████| 200000/200000 [1:36:07<00:00, 34.68it/s]


Finished saving batch 136
Start saving batch 137


Batch 137: 100%|██████████████████████████████████████████| 200000/200000 [1:39:15<00:00, 33.58it/s]


Finished saving batch 137
Start saving batch 138


Batch 138: 100%|██████████████████████████████████████████| 200000/200000 [1:39:38<00:00, 33.45it/s]


Finished saving batch 138
Start saving batch 139


Batch 139: 100%|██████████████████████████████████████████| 200000/200000 [1:45:01<00:00, 31.74it/s]


Finished saving batch 139
Start saving batch 140


Batch 140: 100%|██████████████████████████████████████████| 200000/200000 [1:45:14<00:00, 31.67it/s]


Finished saving batch 140
Start saving batch 141


Batch 141: 100%|██████████████████████████████████████████| 200000/200000 [1:40:23<00:00, 33.20it/s]


Finished saving batch 141
Start saving batch 142


Batch 142: 100%|██████████████████████████████████████████| 200000/200000 [1:40:34<00:00, 33.14it/s]


Finished saving batch 142
Start saving batch 143


Batch 143: 100%|██████████████████████████████████████████| 200000/200000 [1:40:07<00:00, 33.29it/s]


Finished saving batch 143
Start saving batch 144


Batch 144: 100%|██████████████████████████████████████████| 200000/200000 [1:42:31<00:00, 32.51it/s]


Finished saving batch 144
Start saving batch 145


Batch 145: 100%|██████████████████████████████████████████| 200000/200000 [1:42:16<00:00, 32.59it/s]


Finished saving batch 145
Start saving batch 146


Batch 146: 100%|██████████████████████████████████████████| 200000/200000 [1:39:23<00:00, 33.54it/s]


Finished saving batch 146
Start saving batch 147


Batch 147: 100%|██████████████████████████████████████████| 200000/200000 [1:40:18<00:00, 33.23it/s]


Finished saving batch 147
Start saving batch 148


Batch 148: 100%|██████████████████████████████████████████| 200000/200000 [1:40:31<00:00, 33.16it/s]


Finished saving batch 148
Start saving batch 149


Batch 149: 100%|██████████████████████████████████████████| 200000/200000 [1:39:46<00:00, 33.41it/s]


Finished saving batch 149
Start saving batch 150


Batch 150: 100%|██████████████████████████████████████████| 200000/200000 [1:45:22<00:00, 31.63it/s]

Finished saving batch 150



