In [None]:
import re
import subprocess
from typing import List

import pandas as pd
import glob
from pathlib import Path
from tqdm import tqdm

In [None]:
# Directory that receives the downloaded challenge files and the combined
# dataset. It is created on first run and left untouched if it already exists.
DATA_PATH = Path("..") / "data"
DATA_PATH.mkdir(exist_ok=True)

In [None]:
# Mirror the public PhysioNet Challenge 2019 training set into DATA_PATH.
# `--no-sign-request` skips credential signing, so no AWS account is needed.
# Requires the AWS CLI on PATH; otherwise Popen raises FileNotFoundError.
download_command = [
    "aws", "s3", "sync",
    "--no-sign-request",
    "s3://physionet-open/challenge-2019/1.0.0/training/",
    str(DATA_PATH)
]

# The context manager guarantees the stdout pipe is closed and the child is
# reaped even if streaming the output is interrupted part-way through.
with subprocess.Popen(
    download_command,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # interleave errors with normal progress output
    text=True,
    bufsize=1  # line-buffered so progress appears as it is produced
) as process:
    # Echo the CLI output live instead of waiting for the sync to finish.
    for line in process.stdout:
        print(line, end="")
    return_code = process.wait()

# Fail loudly so later cells never run against a partial download.
if return_code != 0:
    raise subprocess.CalledProcessError(return_code, download_command)

In [None]:
# Collect every per-patient .psv file from the training_set* sub-directories.
# glob returns paths in arbitrary filesystem order, so sort them to make the
# row order of the combined dataset reproducible across runs and machines.
patient_file_paths = sorted(glob.glob(str(DATA_PATH / "training_set*/*.psv")))

In [None]:
# Load every patient file, tag its rows with the numeric ID taken from the
# file name, and stack everything into one frame.
#
# The ID is the run of digits immediately before ".psv". This matches names
# such as "p000001.psv" as well as the "patient_file123.psv" form that the
# previous, stricter pattern required.
patient_id_pattern = re.compile(r'(\d+)\.psv$')

dfs: List[pd.DataFrame] = []
skipped_files: List[str] = []
for patient_file in tqdm(patient_file_paths, desc="Loading .psv files"):
    # Check the name first so unrecognised files are never parsed needlessly.
    match = patient_id_pattern.search(patient_file)
    if not match:
        skipped_files.append(patient_file)
        continue

    patient_id = int(match.group(1))
    df = pd.read_csv(patient_file, sep='|')
    df["Patient_ID"] = patient_id
    dfs.append(df)

# Surface skipped files instead of dropping them silently.
if skipped_files:
    print(
        f"Skipped {len(skipped_files)} file(s) with unrecognised names, "
        f"e.g. {skipped_files[0]}"
    )

# pd.concat([]) raises an opaque "No objects to concatenate"; raise the same
# exception type with a message that points at the actual cause.
if not dfs:
    raise ValueError(
        f"No patient files were loaded from {len(patient_file_paths)} "
        "candidate path(s); check that the download completed and that the "
        "file names end in '<digits>.psv'."
    )

combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Persist the combined frame as Parquet next to the raw files.
# The message is built from the real output path so it cannot drift from
# where the file is actually written.
output_path = DATA_PATH / "dataset.parquet"
combined_df.to_parquet(output_path, engine="pyarrow")
print(f"Saved combined dataset to {output_path}")