# Data Preprocessing

This Python notebook contains the code needed to extract the data for analysis from several source files that contain personally identifiable information (PII). The outputs are data files in `data/main` that contain only the data values for analysis and the participant IDs, but no PII.

In [None]:
import pandas as pd
import numpy as np

In [None]:
from pathlib import Path
#making sure the output folder exists; missing parents are created and
#an already existing folder is not an error
Path("../data/main").mkdir(exist_ok=True, parents=True)

In [None]:
#reading only specific fields from redcap data to avoid PII:
#the ids, the form timestamp and every column whose name starts with diet_
#using redcap record_id as pandas index; it is selected by name so that the
#result does not depend on record_id being the first column of the export
rc_c = pd.read_csv(
    '../data/RAW/redcap/child.csv',
    usecols=lambda c: c in ['record_id','id_child','id_family','mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp'] or c.startswith('diet_'),
    index_col='record_id'
)

In [None]:
#fixing known input errors

#normalising the case of both id columns (str.capitalize: first character
#upper case, remaining characters lower case)
rc_c['id_family'] = rc_c['id_family'].str.capitalize()
rc_c['id_child'] = rc_c['id_child'].str.capitalize()

In [None]:
#helper function to update value of given column on row indexed by redcap record_id
def set_col_for_record(record_id, col, val, *, df=None):
    """Overwrite a single cell in place and print a line describing the change.

    Args:
        record_id: index label of the row to change (the redcap record_id).
        col: name of the column to change.
        val: new value for the cell.
        df: DataFrame to modify in place. Defaults to the notebook-level
            ``rc_c`` table. It must have ``id_family`` and ``id_child``
            columns, because both are included in the printed line.

    Raises:
        KeyError: if ``record_id`` or ``col`` does not exist. The lookup
            happens before the assignment, so nothing is modified then.
    """
    if df is None:
        df = rc_c

    #at points to a single cell by index and col name
    old_val = df.at[record_id, col]
    idf = df.at[record_id, 'id_family']
    idc = df.at[record_id, 'id_child']

    df.at[record_id, col] = val

    #printed so that the affected participant can be checked by eye
    print(f'{record_id}, {idf}, {idc}, {col}: {old_val} --> {val}')

In [None]:
# M3339: the form data marks this record as not completed, but the data is
# valid and the input date was confirmed directly from the participant
set_col_for_record(
    record_id=70,
    col='mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp',
    val='2022-06-21 12:00:00',
)


In [None]:
# helper functions that define valid IDs and return the rows with invalid IDs

def is_valid_id(s):
    """Return True if *s* is a 5-character string ending in four ASCII digits.

    The first character is not restricted here; the family/child specific
    checks test the prefix. Non-string values (for example ``None`` or a
    float NaN) are never valid.
    """
    if not isinstance(s, str) or len(s) != 5:
        return False
    # str.isdigit() also accepts non-ASCII digit characters (superscripts,
    # digits of other scripts), so compare against the ASCII digits explicitly
    return all(ch in '0123456789' for ch in s[1:])

def is_valid_id_family(s):
    """Return True if *s* passes is_valid_id and starts with 'P'."""
    if not is_valid_id(s):
        return False
    return s.startswith('P')

def is_valid_id_child(s):
    """Return True if *s* passes is_valid_id and starts with 'M'."""
    if not is_valid_id(s):
        return False
    return s.startswith('M')

def erroneous_ids():
    """Return the rows of ``rc_c`` that are completed but have a bad ID.

    A row counts as completed when its form timestamp is anything other
    than the literal '[not completed]'. A row has a bad ID when its
    id_family fails is_valid_id_family or its id_child fails
    is_valid_id_child.
    """
    timestamps = rc_c['mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp']
    is_completed = timestamps != '[not completed]'

    family_ok = rc_c['id_family'].apply(is_valid_id_family)
    child_ok = rc_c['id_child'].apply(is_valid_id_child)

    # "family invalid or child invalid" is the same as "not both valid"
    both_ok = family_ok & child_ok
    return rc_c.loc[is_completed & ~both_ok]
    

In [None]:
#listing the completed rows whose family or child id fails validation
erroneous_ids()

In [None]:
#M3309: the leading P was missing from family_id
set_col_for_record(record_id=65, col='id_family', val='P1001')

In [None]:
#M3310: the leading P was missing from family_id
set_col_for_record(record_id=66, col='id_family', val='P1001')

In [None]:
#M3313: the leading P of family_id had been typed as a zero
set_col_for_record(record_id=57, col='id_family', val='P1002')

In [None]:
#M3400: family_id was incorrect
set_col_for_record(record_id=78, col='id_family', val='P1030')

In [None]:
#M3416: family_id had the letter i in place of the digit 1
set_col_for_record(record_id=13, col='id_family', val='P1035')

In [None]:
#family_id was M3495, corrected to P1060
set_col_for_record(record_id=37, col='id_family', val='P1060')

In [None]:
#family_id was M3501, corrected to P1062
set_col_for_record(record_id=11, col='id_family', val='P1062')

In [None]:
#swapping values where family- and child ids are swapped
#a row is treated as swapped when id_family starts with M and id_child with P
m = (
    rc_c['id_family'].str.startswith('M')
    & rc_c['id_child'].str.startswith('P')
)

#the raw values are written back with the two columns in the opposite order;
#.values drops the column labels so the assignment is purely positional
rc_c.loc[m, ['id_child', 'id_family']] = (
    rc_c.loc[m, ['id_family', 'id_child']].values)

In [None]:
#re-running the check after the manual corrections and the id swap
erroneous_ids()

In [None]:
#writing the cleaned table to the output folder
#index=False leaves the index (the redcap record_id) out of the file
rc_c.to_csv('../data/main/redcap_child_diet.csv', index=False)

In [None]:
#full list of food intakes
#columns removed before the data is written out
#(presumably the ones that could identify a participant - confirm)
_dropped_columns = [
    'Syntymäaika',
    'Ryhmätagi',
    'Ruokapäiväkirjaryhmä',
    'Ruokapäiväkirjan nimi',
]

food_intake_records = pd.read_excel(
    '../data/RAW/aromi_product_ingredient.xlsx',
    sheet_name='Tutkimusraportti',
)
food_intake_records = food_intake_records.drop(columns=_dropped_columns)

#the default integer index carries no information, so it is not written
food_intake_records.to_csv('../data/main/food_intake_records.csv', index=False)

In [None]:
import shutil
#copying the families table into the output folder
#NOTE(review): the file is copied unchanged - confirm that it contains no PII
shutil.copy2(src='../data/RAW/families.csv', dst='../data/main/')