In [1]:
from pathlib import Path

import pyarrow as pa
import pyarrow.csv
import pyarrow.compute
import pyarrow.parquet as pq
from tqdm.auto import tqdm

In [2]:
# Column types applied when parsing each study's location CSV.
schema_in = pa.schema([
    pa.field("timestamp", pa.timestamp("us")),
    pa.field("location_lat", pa.float64()),
    pa.field("location_long", pa.float64()),
    pa.field("individual_id", pa.int64()),
    pa.field("deployment_id", pa.int64()),
    pa.field("tag_id", pa.int64()),
    pa.field("visible", pa.bool_()),
    pa.field("sensor_type", pa.string()),
])

# Layout of the combined output file: the input columns plus an int64
# `study_id` at position 3 (between `location_long` and `individual_id`).
schema_out = schema_in.insert(3, pa.field("study_id", pa.int64()))

In [3]:
# Convert from CSV to one big parquet file
#
# Every `studies/<study dir>/location.csv` is parsed with the `schema_in`
# column types, reduced to visible GPS rows, tagged with the study id taken
# from its directory name, and appended to `locations_gps.parquet`.

# (path, error) for every CSV that Arrow could not parse; reported after the loop
skipped_location_files = []

with pq.ParquetWriter("locations_gps.parquet", schema=schema_out) as writer:
    for path in tqdm(sorted(Path("studies").glob("*/location.csv"))):
        # Read the CSV file using pyarrow
        try:
            table = pa.csv.read_csv(path, convert_options=pa.csv.ConvertOptions(column_types=schema_in))
        except pa.ArrowInvalid as err:
            # Best effort: an unparsable file is skipped, but remembered so the
            # omission is visible instead of silently shrinking the dataset.
            skipped_location_files.append((path, err))
            continue

        # Keep only the expected columns, in schema order. Extra or reordered
        # CSV columns would otherwise make the table differ from `schema_out`
        # when written, and nulls in unrelated columns would drop valid rows.
        table = table.select(schema_in.names)

        # Clean up the data, limit to GPS
        table = table.drop_null()
        is_visible_gps = pa.compute.and_(
            table["visible"],
            pa.compute.equal(table["sensor_type"], "gps"),
        )
        table = table.filter(is_visible_gps)
        table = table.sort_by([("individual_id", "ascending"),
                               ("deployment_id", "ascending"),
                               ("timestamp", "ascending")])

        # Add the study_id: the integer before the first "-" in the directory name
        study_id = int(path.parent.name.split("-")[0])
        # Index 3 is the position of `study_id` in `schema_out`.
        study_id_column = pa.repeat(pa.scalar(study_id, pa.int64()), len(table))
        table = table.add_column(3, "study_id", study_id_column)

        # Append to the output file
        writer.write_table(table)

# Report what was left out so the result can be judged for completeness.
if skipped_location_files:
    print(f"Skipped {len(skipped_location_files)} unreadable location file(s):")
    for skipped_path, skipped_error in skipped_location_files:
        print(f"  {skipped_path}: {skipped_error}")

  0%|          | 0/476 [00:00<?, ?it/s]

In [4]:
# Convert metadata about individuals to one big parquet file
#
# Every `studies/<study dir>/individual.csv` is parsed with the `schema_indiv`
# column types, reduced to exactly those columns, and appended to
# `individuals.parquet`.

schema_indiv = pa.schema([
    ("id", pa.int64()),
    ("earliest_date_born", pa.timestamp("us")),
    ("latest_date_born", pa.timestamp("us")),
    ("exact_date_of_birth", pa.timestamp("us")),
    ("local_identifier", pa.string()),
    ("sex", pa.string()),
    ("taxon_canonical_name", pa.string()),
])

# (path, error) for every CSV that Arrow could not parse; reported after the loop
skipped_individual_files = []

with pq.ParquetWriter("individuals.parquet", schema=schema_indiv) as writer:
    for path in tqdm(sorted(Path("studies").glob("*/individual.csv"))):
        # Read the CSV file using pyarrow
        try:
            table = pa.csv.read_csv(path, convert_options=pa.csv.ConvertOptions(column_types=schema_indiv))
        except pa.ArrowInvalid as err:
            # Best effort: an unparsable file is skipped, but remembered so the
            # omission is visible instead of silently shrinking the dataset.
            skipped_individual_files.append((path, err))
            continue

        # Keep only wanted columns, in the order the writer's schema expects
        table = table.select(schema_indiv.names)

        # Append to the output file
        writer.write_table(table)

# Report what was left out so the result can be judged for completeness.
if skipped_individual_files:
    print(f"Skipped {len(skipped_individual_files)} unreadable individual file(s):")
    for skipped_path, skipped_error in skipped_individual_files:
        print(f"  {skipped_path}: {skipped_error}")

  0%|          | 0/476 [00:00<?, ?it/s]