# Data Preprocessing

This Python notebook extracts the data needed for analysis from several source files that contain personally identifiable information (PII). The outputs are data files in `data/main` that contain only the values used for analysis and the participant IDs, with no PII.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Prerequisite: copy mira2_masterdata.xlsx manually from the group share
# into the RAW data folder before running this cell.
raw_biomarkers = pd.read_excel('../data/RAW/mira2_masterdata.xlsx')

# The birth date is used to calculate the age at each sampling time, so a
# sampling date together with that age could be used to work out the date
# of birth. All of the date columns are therefore removed and only the
# already-calculated ages are kept.
biomarkers = raw_biomarkers.drop(
    columns=[
        'date_birth',
        'date_blood',
        'date_feces',
        'date_urine',
        'date_anthro',
    ]
)

# Write the de-identified table without the pandas row index.
biomarkers.to_csv('../data/main/biomarkers.csv', index=False)

In [None]:
# Read only an explicit allow-list of REDCap fields to avoid PII: the two
# IDs, the questionnaire timestamp and every column whose name starts
# with 'diet_'.
df = pd.read_csv(
    '../data/RAW/redcap/child.csv',
    usecols=lambda c: (
        c in [
            'id_child',
            'id_family',
            'mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp',
        ]
        or c.startswith('diet_')
    ),
)

# --- Fixing known input errors ---

# Normalise the casing of both ID columns (str.capitalize upper-cases the
# first character and lower-cases the rest).
df['id_family'] = df['id_family'].str.capitalize()
df['id_child'] = df['id_child'].str.capitalize()

# Rows where the family ID starts with 'M' and the child ID starts with
# 'P' are treated as having the two IDs entered in each other's field.
m = df['id_family'].str.startswith('M') & df['id_child'].str.startswith('P')

# Swap the two values on those rows. The right-hand side is converted to a
# bare array so the assignment is positional; assigning a DataFrame would
# be re-aligned on column labels and leave the values where they were.
df.loc[m, ['id_family', 'id_child']] = (
    df.loc[m, ['id_child', 'id_family']].to_numpy()
)

# Write the cleaned table without the pandas row index.
df.to_csv('../data/main/redcap_child_diet.csv', index=False)

In [None]:
# Full list of food intakes, read from the 'Tutkimusraportti' sheet.
food_intake_records = pd.read_excel(
    '../data/RAW/aromi_product_ingredient.xlsx',
    sheet_name='Tutkimusraportti',
)

# Remove the columns that must not reach the output file. The Finnish
# names translate roughly to: date of birth, group tag, food diary group
# and food diary name.
# NOTE(review): presumably these are dropped as PII / identifying free
# text -- confirm that no other column in this sheet is identifying.
food_intake_records = food_intake_records.drop(
    [
        'Syntymäaika',
        'Ryhmätagi',
        'Ruokapäiväkirjaryhmä',
        'Ruokapäiväkirjan nimi',
    ],
    axis='columns',
)

# Write the remaining columns without the pandas row index.
food_intake_records.to_csv('../data/main/food_intake_records.csv', index=False)

In [None]:
import shutil

# Copy families.csv into data/main unchanged; because the destination is a
# directory, the file keeps its original name there.
# NOTE(review): unlike the other inputs, this file is copied without any
# column filtering -- presumably it contains no PII; confirm.
shutil.copy2(
    src='../data/RAW/families.csv',
    dst='../data/main/',
)