# Data preparation

Creates interim data files for Enroll-HD alone (`enroll.*`) and for Enroll-HD combined with the Registry and ad hoc visits (`enroll-all.*`).

* Created: Tue Jul 21 14:04:16 2020
* Author: greed

In [1]:
from pyprojroot import here
import pandas

In [2]:
from hem.enroll import Enroll

In [3]:
# Directory holding the raw input CSV files, resolved from the project root.
raw_path = here().joinpath("data", "raw")

In [4]:
# Directory that receives the interim output files, resolved from the project root.
interim_path = here().joinpath("data", "interim")

In [5]:
# Columns passed as `subject_cols` when the visit tables are built.
subject_cols = [
    "sex",
    "region",
    "hddiagn",
    "race",
    "caghigh",
    "dssage",
]

## Load subject and study data

In [None]:
%%time
# Create the Enroll loader, then read the subject and study tables
# from the raw data directory.
enroll = Enroll()
enroll.load_subjects(raw_path.joinpath("profile.csv"))
enroll.load_studies(raw_path.joinpath("participation.csv"))

## With ENROLL

In [7]:
# Base output path; the .feather and .txt outputs reuse it with a new suffix.
out_file = interim_path.joinpath("enroll.csv")

In [None]:
%%time
# Read the visit records from the raw enroll.csv file.
enroll.load_visits(raw_path.joinpath("enroll.csv"))

In [None]:
%%time
# Build the visit table, passing `subject_cols` as the subject columns and
# age_0 / hdcat_0 as the study columns.
df = enroll.get_visits(subject_cols=subject_cols, study_cols=["age_0", "hdcat_0"])

In [None]:
# Sanity check: report the row count, column count, and number of distinct
# `subjid` values in the visit table.
print(f'{df.shape[0]} rows, {df.shape[1]} columns, {df["subjid"].nunique()} subjects')

In [11]:
# Order the rows by `subjid`, then `visdy`, and renumber the index from zero
# so the old index is discarded.
# NOTE: a follow-up .convert_dtypes() call was left disabled here.
df = df.sort_values(["subjid", "visdy"])
df = df.reset_index(drop=True)

In [None]:
%%time
# Write the table in Feather format next to the CSV (same path, .feather suffix).
df.to_feather(out_file.with_suffix(".feather"))

In [None]:
%%time
# Write the table as CSV with "\n" line endings and without the index column.
# `lineterminator` replaces the `line_terminator` keyword, which pandas
# deprecated in 1.5 and removed in 2.0 (the old spelling raises TypeError there).
df.to_csv(out_file, lineterminator="\n", index=False)

In [None]:
# Save the per-column summary produced by DataFrame.info() as a .txt file
# next to the data files.
# `show_counts` replaces `null_counts`, which pandas deprecated in 1.2 and
# removed in 2.0. The encoding is set explicitly so the file does not depend
# on the platform default.
with out_file.with_suffix(".txt").open("w", encoding="utf-8") as f:
    df.info(verbose=True, show_counts=True, buf=f)

## Plus REGISTRY and ADHOC

In [15]:
# Base output path for the combined data set; the .feather and .txt outputs
# reuse it with a new suffix.
out_file = interim_path.joinpath("enroll-all.csv")

In [None]:
%%time
# Read the registry and adhoc visit files as well.
# NOTE(review): presumably these are added to the visits loaded earlier
# rather than replacing them - confirm against Enroll.load_visits.
enroll.load_visits(raw_path.joinpath("registry.csv"))
enroll.load_visits(raw_path.joinpath("adhoc.csv"))

In [None]:
%%time
# Rebuild the visit table after the extra visit files were loaded, using the
# same subject and study columns as before.
df = enroll.get_visits(subject_cols=subject_cols, study_cols=["age_0", "hdcat_0"])

In [None]:
# Sanity check: report the row count, column count, and number of distinct
# `subjid` values in the rebuilt visit table.
print(f'{df.shape[0]} rows, {df.shape[1]} columns, {df["subjid"].nunique()} subjects')

In [19]:
# Order the rows by `subjid`, then `visdy`, and renumber the index from zero
# so the old index is discarded.
# NOTE: a follow-up .convert_dtypes() call was left disabled here.
df = df.sort_values(["subjid", "visdy"])
df = df.reset_index(drop=True)

In [None]:
%%time
# Write the table in Feather format next to the CSV (same path, .feather suffix).
df.to_feather(out_file.with_suffix(".feather"))

In [None]:
%%time
# Write the table as CSV with "\n" line endings and without the index column.
# `lineterminator` replaces the `line_terminator` keyword, which pandas
# deprecated in 1.5 and removed in 2.0 (the old spelling raises TypeError there).
df.to_csv(out_file, lineterminator="\n", index=False)

In [None]:
# Save the per-column summary produced by DataFrame.info() as a .txt file
# next to the data files.
# `show_counts` replaces `null_counts`, which pandas deprecated in 1.2 and
# removed in 2.0. The encoding is set explicitly so the file does not depend
# on the platform default.
with out_file.with_suffix(".txt").open("w", encoding="utf-8") as f:
    df.info(verbose=True, show_counts=True, buf=f)