# Reformat dataframes to be memory-efficient

**NOTE**: You need at least 160 GB of RAM to reformat the data.

In [2]:
from tqdm.auto import tqdm
import pyarrow as pa
import pandas as pd
import toml
import os

## Create `analysis_configuration.toml` from `analysis_configuration.TEMPLATE`, replacing `/BASEDIR` with the path to the Zenodo data download

In [2]:
# Directory that holds the downloaded data; edit this to match your machine.
# NOTE(review): DATA_DIR ends with "/"; if the template writes paths as
# "/BASEDIR/<subdir>", the generated paths will contain "//" -- confirm this
# is acceptable on your platform.
DATA_DIR = os.path.expanduser("~/zenodo_download/")

TEMPLATE_PATH = "../analysis_configuration.TEMPLATE"
CONFIG_PATH = "../analysis_configuration.toml"

# Read the template and substitute every "/BASEDIR" placeholder with DATA_DIR.
with open(TEMPLATE_PATH, "r") as template_file:
    config_str = template_file.read().replace("/BASEDIR", DATA_DIR)

# Write the filled-in configuration, overwriting any existing file.
with open(CONFIG_PATH, "w") as config_file:
    config_file.write(config_str)

In [3]:
# Parse the generated configuration file into `analysis_config`.
with open("../analysis_configuration.toml", "r") as config_file:
    analysis_config = toml.load(config_file)

In [4]:
# Pull the configuration sections used by this notebook into named variables.
raw_dirs, proc_dirs, closed_loop_cfg, dlight_cfg = (
    analysis_config[section]
    for section in (
        "raw_data",
        "intermediate_results",
        "closed_loop_behavior",
        "dlight_common",
    )
)

In [5]:
# Map each output parquet path to the keyword arguments used when writing it.
closed_loop_dir = raw_dirs["closed_loop_behavior"]
dlight_dir = raw_dirs["dlight"]

# Both closed-loop outputs are written with the same options.
closed_loop_opts = {
    "partition_cols": closed_loop_cfg["partition_cols"],
    "compression": "brotli",
}
dlight_opts = {
    "partition_cols": ["area", "mouse_id", "uuid"],
    "compression": "brotli",
}

files_to_reformat = {
    os.path.join(closed_loop_dir, "closed_loop_behavior.parquet"): dict(
        closed_loop_opts
    ),
    os.path.join(
        closed_loop_dir, "closed_loop_behavior_with_simulated_triggers.parquet"
    ): dict(closed_loop_opts),
    os.path.join(dlight_dir, "dlight_photometry_processed_full.parquet"): dlight_opts,
}

In [6]:
# Display the mapping to check the resolved output paths and write options.
files_to_reformat

{'/n/groups/datta/win/dopamine-data-release/zenodo-test/optoda_raw_data/closed_loop_behavior.parquet': {'partition_cols': ['experiment_type',
   'area',
   'mouse_id']},
 '/n/groups/datta/win/dopamine-data-release/zenodo-test/optoda_raw_data/closed_loop_behavior_with_simulated_triggers.parquet': {'partition_cols': ['experiment_type',
   'area',
   'mouse_id']},
 '/n/groups/datta/win/dopamine-data-release/zenodo-test/dlight_raw_data/dlight_photometry_processed_full.parquet': {'partition_cols': ['area',
   'mouse_id',
   'uuid']}}

In [7]:
# Number of threads pyarrow may use for parallel operations;
# set this to the number of CPUs your machine has.
N_CPUS = 4
pa.set_cpu_count(N_CPUS)

In [8]:
# Rewrite each downloaded "*_transfer" parquet file to its final path, one
# file at a time.
# NOTE(review): when `partition_cols` is given the target is written as a
# directory dataset; re-running this cell over an existing output may add
# files next to the old ones instead of replacing them -- confirm, and remove
# the old output before re-running if so.
for new_file, kwargs in tqdm(files_to_reformat.items()):
    # The source file sits next to the target, with "_transfer" inserted
    # before the extension.
    fname, ext = os.path.splitext(new_file)
    old_file = f"{fname}_transfer{ext}"
    print(f"Reformating {old_file} to {new_file}")
    _reformat_df = pd.read_parquet(old_file)
    # `kwargs` carries the per-file write options defined in `files_to_reformat`.
    _reformat_df.to_parquet(new_file, **kwargs)
    # Drop the reference before the next read: otherwise the previous frame
    # stays alive while the next one is loaded (two large frames in memory at
    # once), and the last frame lingers in the kernel after the loop.
    del _reformat_df

  0%|          | 0/3 [00:00<?, ?it/s]

Reformating /n/groups/datta/win/dopamine-data-release/zenodo-test/optoda_raw_data/closed_loop_behavior_transfer.parquet to /n/groups/datta/win/dopamine-data-release/zenodo-test/optoda_raw_data/closed_loop_behavior.parquet
Reformating /n/groups/datta/win/dopamine-data-release/zenodo-test/optoda_raw_data/closed_loop_behavior_with_simulated_triggers_transfer.parquet to /n/groups/datta/win/dopamine-data-release/zenodo-test/optoda_raw_data/closed_loop_behavior_with_simulated_triggers.parquet
Reformating /n/groups/datta/win/dopamine-data-release/zenodo-test/dlight_raw_data/dlight_photometry_processed_full_transfer.parquet to /n/groups/datta/win/dopamine-data-release/zenodo-test/dlight_raw_data/dlight_photometry_processed_full.parquet
