# Data Preprocessing

This Python notebook contains the code needed to extract data for analysis from several source files containing personally identifiable information (PII). The outputs are data files in `data/main` that contain only data values for analysis and participant IDs, but no PII.

In [None]:
import pandas as pd

In [None]:
from pathlib import Path

# Root of the data directory, relative to this notebook. Kept as a plain string
# because later cells build file paths from it by string concatenation.
DATA_PATH = '../../data'
# Create the output directory (and any missing parents) if it is not there yet.
Path(DATA_PATH, 'main').mkdir(parents=True, exist_ok=True)

# Signups and informed consent

We have one manually maintained spreadsheet for participant contact information.

In [None]:
def clean_id_string(series):
    """Normalise a Series of id strings: upper-case them, then trim surrounding whitespace.

    Missing values stay missing, because the ``.str`` accessor passes them through.
    """
    upper_cased = series.str.upper()
    return upper_cased.str.strip()

In [None]:
import mysecrets 

import io
import msoffcrypto

# Decrypt the password-protected workbook into an in-memory buffer.
decrypted_workbook = io.BytesIO()
with open(DATA_PATH+'/RAW/consent_protected.xlsx', 'rb') as file:
    office_file = msoffcrypto.OfficeFile(file)
    office_file.load_key(password=mysecrets.excel_pw)
    office_file.decrypt(decrypted_workbook)

# Only the columns listed here are read from the 'consent' sheet.
# ('daycare_municipal' used to be listed twice; one entry is enough.)
consent = pd.read_excel(
    decrypted_workbook,
    sheet_name='consent',
    usecols=[
        'id',
        'id_family',
        'diet_reported',
        'daycare',
        'daycare_group',
        'daycare_municipal',
        'discontinued',
        'consent_date',
        'dob'
    ]
)
# Normalise ids so they compare equal to the ids cleaned from the other sources.
consent.id = clean_id_string(consent.id)
consent.id_family = clean_id_string(consent.id_family)


# Second sheet of the same decrypted workbook. The 'reserch_steps' spelling is
# used both as the sheet name and as the variable name in later cells, so it is
# left unchanged.
reserch_steps = pd.read_excel(
    decrypted_workbook,
    sheet_name='reserch_steps',
    usecols=[
        'id',
        'date_blood',
        'urine_sample',
        'fecal_sample',
    ]
)
reserch_steps.id = clean_id_string(reserch_steps.id)

Generating pseudonymization IDs for daycare centers and the groups within them.

Using an alphabetized list of names to keep generated id mappings fairly stable. They will only change if new inputs are made manually to the source file, which is not expected.

Id stability should not be necessary, though. Analyses using the generated master only need these for grouping. They should not depend on ids being the same in subsequent versions of the file.

Daycare centers have unique names. Groups are made globally unique by concatenating center and group name.

In [None]:
# Distinct daycare names in alphabetical order; missing names are left out.
keys = sorted(consent['daycare'].dropna().unique())

# Each name is mapped to its position in the sorted list.
dc_id_map = {name: position for position, name in enumerate(keys)}

# Rows without a daycare name get no id (NaN).
consent['dc_id'] = consent['daycare'].map(dc_id_map)

In [None]:
# Center name and group name are concatenated to get a globally unique group key.
dc_dcg = consent['daycare'] + consent['daycare_group']

# Distinct keys in alphabetical order; rows where the concatenation is missing are left out.
keys = sorted(dc_dcg.dropna().unique())

# Each key is mapped to its position in the sorted list.
dc_dcg_id_map = {group_key: position for position, group_key in enumerate(keys)}

consent['dc_group_id'] = dc_dcg.map(dc_dcg_id_map)

In [None]:
#converting to bool column: True only where the source value equals 1
consent['daycare_municipal'] = consent['daycare_municipal'].eq(1)

In [None]:
#exporting id mapping for later verification
# The sheets pair the original daycare / group names with their generated ids;
# the file is written under RAW, not under main.

# sheet name -> mapping written to that sheet
id_mappings = {
    'dc_id': dc_id_map,
    'dc_group_id': dc_dcg_id_map,
}

with pd.ExcelWriter(DATA_PATH+'/RAW/daycare_id_mapping.xlsx') as writer:
    for sheet, mapping in id_mappings.items():
        # one row per name, with the name as the row index
        mapping_frame = pd.DataFrame.from_dict(mapping, orient='index')
        mapping_frame.to_excel(writer, sheet_name=sheet)

# Redcap questionnaire

Participants filled in background questionnaires in REDCap, one for each child participant and one for caregivers participating in food records and biometric sampling.

## Children

In [None]:
#using redcap record_id (first csv column) as pandas index
rc_c = pd.read_csv(DATA_PATH+'/RAW/redcap/child.csv', index_col=0)

# Both id columns get the same normalisation as the consent sheet ids.
for id_column in ('id_child', 'id_family'):
    rc_c[id_column] = clean_id_string(rc_c[id_column])

In [None]:
# Unfinished forms carry the literal '[not completed]' instead of a timestamp;
# those entries are blanked so that the rest parses as datetimes (blank -> NaT).
completion_stamp = rc_c['mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp']
rc_c['date_q'] = pd.to_datetime(
    completion_stamp.where(completion_stamp != '[not completed]')
)

In [None]:
#helper function to update value of given column on row indexed by redcap record_id
def set_col_for_record(record_id, col, val):
    """Overwrite one cell of ``rc_c`` and print a one-line log of the change.

    record_id -- index label of the row in ``rc_c`` (the redcap record_id)
    col       -- name of the column to change
    val       -- new value for that cell

    The printed line shows the record id, the family and child ids as they were
    before the change, the column name, and the old and new values.
    """
    # .at addresses exactly one cell by index label and column name;
    # everything that is printed is read before the cell is overwritten
    previous = rc_c.at[record_id, col]
    family_id = rc_c.at[record_id, 'id_family']
    child_id = rc_c.at[record_id, 'id_child']

    rc_c.at[record_id, col] = val

    print(f'{record_id}, {family_id}, {child_id}, {col}: {previous} --> {val}')

In [None]:
# M3339 (record 70) shows as not completed in form data, but the data is valid
# and the input date was confirmed directly with the participant
set_col_for_record(70, 'date_q', '2022-06-21')


In [None]:
#dropping incomplete answers: rows that have no questionnaire date
required_columns = ['date_q']
rc_c = rc_c.dropna(axis=0, how='any', subset=required_columns)

In [None]:
#fixing known input errors

#capitalizing all ids: first character upper-case, every later character lower-case
for id_column in ('id_family', 'id_child'):
    rc_c[id_column] = rc_c[id_column].str.capitalize()

In [None]:
# helper functions to define valid IDs and returning invalid rows

def is_valid_id(s):
    """True when ``s`` is a string of exactly 5 characters whose last 4 are digits."""
    if not isinstance(s, str):
        return False
    if len(s) != 5:
        return False
    return s[-4:].isdigit()

def is_valid_id_family(s):
    """True when ``s`` is a valid id that starts with 'P'."""
    if not is_valid_id(s):
        return False
    return s.startswith('P')

def is_valid_id_child(s):
    """True when ``s`` is a valid id that starts with 'M'."""
    if not is_valid_id(s):
        return False
    return s.startswith('M')

def erroneous_ids():
    """Return id_family / id_child of the ``rc_c`` rows that need attention.

    A row is returned when it has a questionnaire date and at least one of its
    two ids fails validation.
    """
    family_ok = rc_c['id_family'].apply(is_valid_id_family)
    child_ok = rc_c['id_child'].apply(is_valid_id_child)
    has_date = rc_c['date_q'].notna()

    # "family invalid or child invalid" is the same as "not (both valid)"
    needs_attention = has_date & ~(family_ok & child_ok)
    return rc_c.loc[needs_attention, ['id_family', 'id_child']]
    

In [None]:
# Listing the questionnaire rows whose family or child id fails validation.
erroneous_ids()

In [None]:
#P was missing from family_id for M3309 (record 65)
set_col_for_record(65, 'id_family', 'P1001')

In [None]:
#P was missing from family_id for M3310 (record 66)
set_col_for_record(66, 'id_family', 'P1001')

In [None]:
#P was replaced with zero in family_id for M3313 (record 57)
set_col_for_record(57, 'id_family', 'P1002')

In [None]:
#family_id incorrect M3400 (record 78)
set_col_for_record(78, 'id_family', 'P1030')

In [None]:
#family_id had i for 1 M3416 (record 13)
set_col_for_record(13, 'id_family', 'P1035')

In [None]:
#family_id was M3495 (record 37)
set_col_for_record(37, 'id_family', 'P1060')

In [None]:
#family_id was M3501 (record 11)
set_col_for_record(11, 'id_family', 'P1062')

In [None]:
#swapping values where family- and child ids are swapped
# (family id starts with 'M' and child id starts with 'P')
# na=False counts a missing id as "no match", so the mask is strictly boolean
# and can always be used with .loc
m = (
    rc_c['id_family'].str.startswith('M', na=False)
    & rc_c['id_child'].str.startswith('P', na=False)
)

# .values drops the column labels, so the two columns are assigned by position
# (child -> family, family -> child) instead of being re-aligned by name
rc_c.loc[m, ['id_family', 'id_child']] = (
    rc_c.loc[m, ['id_child', 'id_family']].values)

In [None]:
# Re-running the check after the fixes; any row still listed has an id that fails validation.
erroneous_ids()

# Diet classification of participants

Study participants have filled in REDCap survey forms that differ for children and their caregivers. On both forms there are multiple-choice questions that specify more clearly the diet participants adhere to.

This code contains all interpretations of survey answers to group participants to diet categories.


## Form for Children
Same row of check boxes was presented for both home and daycare diets:


- Ei erityisruokavaliota tai välttämisruokavaliota
- Laktoositon tai vähälaktoosinen ruokavalio
- Gluteeniton ruokavalio (vältetään vehnää,ruista ja ohraa)
- Ruokavalio, joka ei sisällä punaista lihaa
- Vegaaninen ruokavalio (ei sisällä mitään eläinperäisiä tuotteita)
- Kasvisruokavalio, joka sisältää yhtä tai useampaa seuraavista eläinkunnan tuotteista: kalaa, kananmunaa ja/tai maitotuotteita
- Ruokarajoituksia uskonnollisista syistä
- Muu ruokavalio

The code below encodes selections as `1`s and unchecked boxes as `0`s. For each form, we get an eight-character string of `0`s and `1`s. All combinations appearing in answers are mapped to a diet category according to the research group's interpretation.

In [None]:
# Tick pattern -> diet id.
# Each key is an eight-character string of 0/1, one character per check box,
# in check box order (for example the 5th character is the "vegan" box and the
# 6th the "vegetarian" box). Patterns that are not listed here have no id.
diet_id = {
    '00001000': 0, #vegan
    '00101000': 1, #vegan, no gluten
    '00000100': 2, #vegetarian
    '00001100': 3, #contradictory answer: vegan and vegetarian
    '10000100': 4, #contradictory answer: no restrictions and vegetarian
    '01000100': 5, #lactose free vegetarian
    '00010100': 6, #vegetarian, no red meat
    '01000000': 7, #lactose free
    '10000001': 8, #no restrictions, other
    '10000000': 9, #no restrictions
    '00000001': 10, #other
    '00100000': 11, #gluten free
}
# Diet id -> descriptive label (keeps lactose / gluten / other qualifiers).
# Both contradictory patterns (ids 3 and 4) are labelled 'vegetarian'.
diet_desc = {    
    0: 'vegan',
    1: 'vegan, gluten free',
    2: 'vegetarian',
    3: 'vegetarian', 
    4: 'vegetarian',
    5: 'vegetarian, lactose free', 
    6: 'vegetarian',
    7: 'mixed diet, lactose free',
    8: 'mixed diet, other',
    9: 'mixed diet',
    10: 'other',
    11: 'mixed diet, gluten free',
}
# Diet id -> one of three groups: 'vegan', 'vegetarian' or 'mixed diet'.
# Note that id 10 ('other') is grouped as 'mixed diet'.
diet_group = {    
    0: 'vegan',
    1: 'vegan',
    2: 'vegetarian',
    3: 'vegetarian', 
    4: 'vegetarian',
    5: 'vegetarian', 
    6: 'vegetarian',
    7: 'mixed diet',
    8: 'mixed diet',
    9: 'mixed diet',
    10: 'mixed diet',
    11: 'mixed diet',
}

In [None]:
# One column per check box, numbered 1..8, separately for daycare and home.
diet_dc_cols = [f'diet_dc___{i}' for i in range(1,9)]
diet_home_cols = [f'diet_home___{i}' for i in range(1,9)]

# Join the eight values of each row into one string such as '00001000'.
for place, tick_columns in (('dc', diet_dc_cols), ('home', diet_home_cols)):
    rc_c[f'diet_{place}_ticks'] = rc_c[tick_columns].astype(str).apply(''.join, axis=1)

# Translate tick strings to diet ids, and ids to description labels.
# Tick patterns missing from diet_id end up as NaN in both columns.
for place in ('dc', 'home'):
    rc_c[f'diet_{place}_id'] = rc_c[f'diet_{place}_ticks'].map(diet_id)
    rc_c[f'diet_{place}_desc'] = rc_c[f'diet_{place}_id'].map(diet_desc)

In [None]:
#if an option is selected for either home or daycare, we consider it selected

#combining two strings of 0/1 selections to one where if either one has 1 in a position, output is 1, else 0
def string_bitwise_or(s1,s2):
    """Position-wise OR of two equally long strings of binary digits.

    The result has the same length as the inputs and has '1' wherever either
    input has '1'. Strings of different length fail the assertion.
    """
    width = len(s1)
    assert width == len(s2)
    merged = int(s1, 2) | int(s2, 2)
    # bin() prefixes '0b' and drops leading zeros; remove the prefix and pad back
    return bin(merged)[2:].zfill(width)

def combine_diets(row):
    """OR together the daycare and home tick strings of one questionnaire row."""
    return string_bitwise_or(row['diet_dc_ticks'], row['diet_home_ticks'])

# Combined tick string per child -> diet id -> diet group.
# A combined pattern that is not a key of diet_id gives NaN as the group.
combined_ticks = rc_c.apply(combine_diets, axis=1)
rc_c['diet_group'] = combined_ticks.map(diet_id).map(diet_group)

## Caregivers

In [None]:
#reading adults redcap questionnaire, first csv column becomes the index
rc_cg = pd.read_csv(DATA_PATH+'/RAW/redcap/caregiver.csv', index_col=0)

# Unfinished forms carry the literal '[not completed]' instead of a timestamp;
# those entries are blanked so that the rest parses as datetimes (blank -> NaT).
caregiver_stamp = rc_cg['mira2_huoltajan_taustatieto_ja_ruoankyttkysely_timestamp']
rc_cg['date_q'] = pd.to_datetime(
    caregiver_stamp.where(caregiver_stamp != '[not completed]')
)

#removing rows that do not have completed timestamps
rc_cg = rc_cg.dropna(axis=0, how='any', subset=['date_q'])

In [None]:
#function to normalise ID inputs
def fix_cg_id(s):
    """Normalise one caregiver id as typed into a questionnaire.

    Surrounding whitespace is removed, the text is upper-cased, and a leading
    'M' is added when the id does not already start with one.
    Missing values (None / NaN) are returned unchanged.
    """
    if pd.isna(s):
        # nothing to normalise; keep the missing marker as it is
        return s
    # strip first: with leading whitespace the startswith check below would
    # fail and a second 'M' would be glued in front of the id
    s = s.strip().upper()
    if not s.startswith('M'):
        s = 'M' + s
    return s

#apply fix to every value of the caregiver id column
rc_cg['id_cg'] = rc_cg['id_cg'].map(fix_cg_id)

Fixing known input errors in children's questionnaire

In [None]:
#fixing known input errors, in C

#mother's ID entered into the father's column, emptying it
entered_mother_id = rc_c['id_biological_father'].eq('M3586')
rc_c.loc[entered_mother_id, 'id_biological_father'] = None

#did not remember their ID ('En muista'), adding it after checking
id_not_remembered = rc_c['id_biological_father'].eq('En muista')
rc_c.loc[id_not_remembered, 'id_biological_father'] = 'M3599'

#mother participated, but did not enter ID because they missed blood labs, adding their ID
# NOTE(review): .at creates a new row if label 68 is no longer in the index — confirm record 68 survives the earlier filtering
rc_c.at[68, 'id_biological_mother'] = 'M3639'

Reading adults diet from the children's questionnaire, from several different columns.

For each of biological mother, biological father, caregiver, we find their participant ID and diet multiple choice answers in a different set of columns.

In [None]:
#selecting groups of columns from rc_c, childrens questionnaire, and making them uniform

# uniform column names for the eight diet multiple choice answers: diet_1 .. diet_8
diet_cols = [f'diet_{i}' for i in range(1,9)]

def slice_and_rename(cols):
    """Return the given ``rc_c`` columns under uniform names.

    ``cols`` lists one id column followed by eight tick columns; in the result
    they are called 'id_cg', 'diet_1', ..., 'diet_8' (same order).
    """
    uniform_names = ['id_cg'] + diet_cols
    return rc_c[cols].rename(columns=dict(zip(cols, uniform_names)))

#selecting three groups of columns into dataframes and concatenating row-wise
# (id column, prefix of its eight tick columns) for each adult on the child form
adult_column_groups = [
    ('id_biological_mother', 'diet_biol_m'),
    ('id_biological_father', 'diet_biol_f'),
    ('id_cg', 'cg_diet'),
]

# rows with any missing value (for instance no id given) are dropped
diet_cg = pd.concat(
    [
        slice_and_rename([id_col] + [f'{prefix}___{i}' for i in range(1,9)])
        for id_col, prefix in adult_column_groups
    ],
    axis=0
).dropna()

diet_cg['id_cg'] = diet_cg['id_cg'].apply(fix_cg_id)

# Same caregiver ID can appear repeatedly when more than one of their children participate
# If an answer was selected in any instance of answers, we consider it:
# columns contain 0 or 1, so max() per caregiver gives 1 if it was ever selected
diet_cg = diet_cg.groupby('id_cg').max()

# tick string -> diet id -> description label and diet group
tick_strings = diet_cg[diet_cols].astype(str).apply(''.join, axis=1)
diet_cg['diet_ticks'] = tick_strings
diet_cg['diet_id'] = diet_cg['diet_ticks'].map(diet_id)
diet_cg['diet_home'] = diet_cg['diet_id'].map(diet_desc)
diet_cg['diet_group'] = diet_cg['diet_id'].map(diet_group)

# Samples

Participants gave fecal and urine samples for lab testing. Age at sample time is calculated for analysis and to hide DoB. Existence of date is used to track which participants gave samples.

In [None]:
# Fecal sample dates: only the participant id and the sample date are read.
# NOTE(review): unlike the consent ids, these ids are not passed through
# clean_id_string — confirm the sheet already uses the same id format.
f_samples = pd.read_excel(
    DATA_PATH+'/RAW/fecal_and_urine_samples.xlsx',
    sheet_name='feces',
    usecols=['id', 'fecal_date'],
)

In [None]:
# Urine sample dates: only the participant id and the sample date are read.
# NOTE(review): unlike the consent ids, these ids are not passed through
# clean_id_string — confirm the sheet already uses the same id format.
u_samples = pd.read_excel(
    DATA_PATH+'/RAW/fecal_and_urine_samples.xlsx',
    sheet_name='urines',
    usecols=['id', 'urine_date'],
)

In [None]:
#time inputs in spreadsheet are all kinds of off
from datetime import datetime, time
from math import isnan

def clean_time(t):
    """Turn one raw time-of-day cell from the spreadsheet into a ``datetime.time``.

    Accepted inputs:
      * ``datetime.time``  -- returned unchanged
      * float NaN          -- empty cell, returns None
      * other floats       -- a time typed as hours.minutes, e.g. 9.3 for "9.30"
      * strings            -- hours, minutes and optional seconds separated by
                              '.' or ':', possibly with stray spaces or dots

    Raises TypeError for any other type and ValueError (from strptime) when the
    text cannot be parsed as H:M:S.
    """
    if isinstance(t, time):
        #correct inputs are parsed when reading excel, keeping those
        return t
    if isinstance(t, float):
        #empty cells come in as NaN
        if isnan(t):
            return None
        #short inputs get parsed as float
        if round(t, 2) == t:
            # A cell typed as "9.30" arrives as the float 9.3; str() would give
            # '9.3' and be read as 09:03. Two fixed decimals restore the
            # minutes as they were typed (09:30).
            t = f'{t:.2f}'
        else:
            # more than two decimals is not an hours.minutes input; keep the
            # plain text form and let the parsing below decide
            t = str(t)
    if isinstance(t, str):
        #remaining problems should be strings
        #removing any spaces and replacing . with :
        t = t.strip(' .').replace(' ','').replace('.',':')
        if t.count(':')<2:
            #in some cases seconds are missing, so adding 00
            t += ':00'
        return datetime.strptime(t, "%H:%M:%S").time()
    raise TypeError(f'Not prepared to process {type(t)}')

# Blood sample collection times: only the id and the raw time column are read.
b_samples = pd.read_excel(
    DATA_PATH+'/RAW/blood_samples.xlsx',
    sheet_name='blood_sample_collection',
    usecols=['id', 'blood_collected_time'],
)

# Cleaned time of day goes into its own column; the raw input is kept next to it.
raw_collection_times = b_samples['blood_collected_time']
b_samples['blood_time_of_day'] = raw_collection_times.apply(clean_time)

# Merging

Merging dataframes on participant ID.

For privacy, no date columns are written out to master, age at sample times is calculated.

In [None]:
#starting from informed consent, using left joins so anything not in there will not be included
participants = pd.merge(
    consent,
    reserch_steps,
    how='left',
    left_on='id',
    right_on='id',
)
# row count shown so that unexpected row duplication by the join is noticed
len(participants)

In [None]:
#children's redcap
# columns present on both sides get '_consent' / '_redcap' appended to their names
participants = pd.merge(
    participants,
    rc_c,
    how='left',
    left_on='id',
    right_on='id_child',
    suffixes=('_consent', '_redcap'),
)
# row count shown so that unexpected row duplication by the join is noticed
len(participants)

In [None]:
def merge_cols(df,c):
    """Combine the merge leftovers ``<c>_x`` and ``<c>_y`` into one column ``<c>``.

    Values come from ``<c>_x`` and, where that is missing, from ``<c>_y``.
    Rows that have a value in both columns are displayed for inspection.
    ``df`` is modified in place; nothing is returned.
    """
    left_name, right_name = f'{c}_x', f'{c}_y'
    # rows where both sides carry a value
    both_present = df[[left_name, right_name]].dropna()
    display(both_present)
    df[c] = df[left_name].combine_first(df[right_name])

In [None]:
#caregivers redcap
participants = pd.merge(
    participants,
    rc_cg,
    how='left',
    left_on='id',
    right_on='id_cg',
)

#there are columns that appear by same name on both questionnaires but
#only appear in one or the other for child or caregiver, combining those here
for shared_column in ('date_q', 'sex'):
    merge_cols(participants, shared_column)

# row count shown so that unexpected row duplication by the join is noticed
len(participants)

In [None]:
#caregivers diet
# diet_cg is indexed by caregiver id, so the join runs against its index
participants = pd.merge(
    participants,
    diet_cg,
    how='left',
    left_on='id',
    right_index=True,
)

# diet_group comes from the child questionnaire on one side and from diet_cg on the other
merge_cols(participants, 'diet_group')

# row count shown so that unexpected row duplication by the join is noticed
len(participants)

In [None]:
# adding fecal sample dates
participants = pd.merge(participants, f_samples, how='left', on='id')
# row count shown so that unexpected row duplication by the join is noticed
len(participants)

In [None]:
# adding urine sample dates
participants = pd.merge(participants, u_samples, how='left', on='id')
# row count shown so that unexpected row duplication by the join is noticed
len(participants)

In [None]:
# adding blood sample collection times
participants = pd.merge(participants, b_samples, how='left', on='id')
# row count shown so that unexpected row duplication by the join is noticed
len(participants)

In [None]:
# Ages in whole days between date of birth and each event date.
date_of_birth = participants['dob']

ages = pd.DataFrame({
    # time of day is cut off from the questionnaire timestamp before subtracting
    'age_q': (participants['date_q'].dt.normalize() - date_of_birth).dt.days,
    'age_informed_consent': (participants['consent_date'] - date_of_birth).dt.days,
    # blood dates that cannot be parsed become NaT, so the age is missing for them
    'age_blood': (pd.to_datetime(participants['date_blood'], errors='coerce') - date_of_birth).dt.days,
    'age_feces': (participants['fecal_date'] - date_of_birth).dt.days,
    'age_urine': (participants['urine_date'] - date_of_birth).dt.days,
})

# the age columns are appended to the right of the existing columns
participants = pd.concat([participants, ages], axis=1)

## ID Checks

In [None]:
#checking for mismatched family ids
# only rows that have a family id from the questionnaire can mismatch
has_redcap_family = participants['id_family_redcap'].notna()
differs_from_consent = participants['id_family_consent'].ne(participants['id_family_redcap'])
participants.loc[
    has_redcap_family & differs_from_consent,
    ['id', 'id_family_consent', 'id_family_redcap']
]

In [None]:
#number of family ids missing in redcap data
missing_in_redcap = participants['id_family_redcap'].isna()
missing_in_redcap.sum()

In [None]:
#using family_id from consent data as the family id of the master
participants['id_family'] = participants['id_family_consent']

# Data file outputs

## Participants master
Combining person-level data to a single table without PII.

For data hygiene, only explicitly selected columns are written out to the master data file in `data/main`.

In [None]:
# one column name per line, to help choose what goes into the master file
for column_name in participants.columns:
    print(column_name)

In [None]:
# Column name -> description for the master file.
# The keys decide which columns of `participants` are written out (and in which
# order); keys and descriptions together are written to the 'dictionary' sheet.
master_dictionary = {
    'id': 'unique identifier of study participant',
    'id_family': 'unique identifier participating family',
    'age_informed_consent': 'age of participant at time reserch group received informed consent',
    'diet_reported': 'diet reported at enrollment',
    'daycare_municipal': 'boolean denoting whether daycare center attended is municipally operated',
    'discontinued': 'boolean denoting whether participant discontinued the study before completion',
    'dc_id': 'unique identifier of daycare center',
    'dc_group_id': 'unique identifier of group at daycare center',
    'age_blood': 'age of participant at time of blood sampling',
    'blood_time_of_day': 'time of day when blood sample was taken',
    'age_feces': 'age of participant at time of fecal sampling',
    'age_urine': 'age of participant at time of urine sampling',
    'sex': 'sex of participant (1=F, 2=M)',
    'age_q': 'age of participant at time of completing questionnaire',
    'diet_dc_desc': 'description label for diet group at daycare based on questionnaire',
    'diet_home_desc': 'description label for diet group at home based on questionnaire',
    'diet_group': 'diet grouping the study, based on questionnaire answers'
}

In [None]:
# names of the columns that go into the master file, in dictionary order
master_columns = list(master_dictionary)

#turning the dictionary to a dataframe for writing out to excel:
# one row per variable, indexed by 'variable', with a single 'description' column
dictionary_sheet = pd.DataFrame.from_dict(
    master_dictionary,
    orient='index',
    columns=['description']
).rename_axis('variable')

with pd.ExcelWriter(DATA_PATH+'/main/participants_master.xlsx') as writer:
    #selecting named columns from dataframe participants, written without the row index
    participants[master_columns].to_excel(writer, sheet_name='participants', index=False)
    dictionary_sheet.to_excel(writer, sheet_name='dictionary')

## Cleaned-up questionnaire data
Exporting a pseudonymised, but otherwise unprocessed copy of the questionnaire.

In [None]:
# Columns left out of the exported copy of the children's questionnaire.
# NOTE(review): every other rc_c column is exported as it is — confirm that none
# of the remaining columns holds identifying information.
excluded_columns = [
    'birthday',
    'birth_date',
    'name_cg',
    'cg_who',
    'neuvolakortti',
    'additional_information',
    'mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp',
    'date_q',
]

rc_c_export = rc_c.drop(columns=excluded_columns)
rc_c_export.to_excel(DATA_PATH+'/main/redcap_children_clean.xlsx')