# Data Preprocessing

This Python notebook contains the code needed to extract data for analysis from several source files containing personally identifiable information (PII). The outputs are data files in `data/main` that contain only data values for analysis and participant IDs, but no PII.

In [None]:
import pandas as pd

In [None]:
from pathlib import Path

# Create the output directory (and any missing parents) up front; an already
# existing directory is left untouched.
output_dir = Path("../data/main")
output_dir.mkdir(parents=True, exist_ok=True)

# Signups and informed consent

We have one manually maintained spreadsheet for participant contact information.

In [None]:
# NOTE(review): `secrets` has to resolve to a project-local module that defines
# `excel_pw`; the standard-library module of the same name has no such attribute.
import secrets

import io
import msoffcrypto

# Decrypt the password-protected workbook into an in-memory buffer, so that no
# decrypted copy is written to disk.
decrypted_workbook = io.BytesIO()
with open('../data/RAW/consent_protected.xlsx', 'rb') as file:
    office_file = msoffcrypto.OfficeFile(file)
    office_file.load_key(password=secrets.excel_pw)
    office_file.decrypt(decrypted_workbook)

# The buffer position is at the end after writing; rewind before parsing.
decrypted_workbook.seek(0)

# Parse the workbook once and read both sheets from the same parsed file.
# Only explicitly listed columns are loaded, everything else stays behind.
with pd.ExcelFile(decrypted_workbook) as workbook:
    consent = pd.read_excel(
        workbook,
        sheet_name='consent',
        usecols=[
            'id',
            'id_family',
            'diet_reported',
            'daycare',
            'daycare_group',
            'daycare_municipal',
            'discontinued',
            'consent_date',
        ]
    )

    reserch_steps = pd.read_excel(
        workbook,
        sheet_name='reserch_steps',
        usecols=[
            'id',
            'date_blood',
            'urine_sample',
            'fecal_sample',
        ]
    )

Generating pseudonymization IDs for daycare centers and the groups within them.

Using an alphabetized list of names to keep generated id mappings fairly stable. They will only change if new inputs are made manually to the source file, which is not expected.

ID stability should not be necessary, though. Analyses using the generated master file only need these IDs for grouping. They should not depend on the IDs being the same in subsequent versions of the file.

Daycare centers have unique names. Groups are made globally unique by concatenating center and group name.

In [None]:
# Number the distinct daycare names in alphabetical order; rows without a
# daycare name get no id (NaN).
keys = sorted(consent['daycare'].dropna().unique())

id_map = {name: number for number, name in enumerate(keys)}

consent['dc_id'] = consent['daycare'].map(id_map)

In [None]:
# Build one key per (center, group) pair. The two names are joined with a
# separator, because plain concatenation is ambiguous: "AB" + "C" and
# "A" + "BC" would otherwise collapse into the same key and share one id.
# The ASCII unit separator is used as it is not expected to occur in names
# typed into the spreadsheet.
# A missing center or group name makes the whole key missing (NaN).
dc_dcg = consent.daycare + '\x1f' + consent.daycare_group

# Number the distinct keys in alphabetical order.
keys = sorted(dc_dcg.dropna().unique())

id_map = {key: number for number, key in enumerate(keys)}

consent['dc_group_id'] = dc_dcg.map(id_map)

In [None]:
# Turn the flag into a boolean column: True where the value equals 1,
# False for every other value, including missing ones.
consent['daycare_municipal'] = consent['daycare_municipal'].eq(1)

# Redcap questionnaire

Participants filled in background questionnaires in REDCap: one for each child participant and one for caregivers participating in food records and biometric sampling.

## Children

In [None]:
# Read only an explicit allow-list of REDCap fields (plus every `diet_*`
# checkbox column), so that PII columns are never loaded.
# The first selected column becomes the pandas index.
rc_c = pd.read_csv(
    '../data/RAW/redcap/child.csv',
    usecols=lambda c: c in ['record_id','id_child','id_family','mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp','birthday','sex'] or c.startswith('diet_'),
    index_col=0
)

# The manual corrections further down address rows by REDCap record_id.
# `index_col=0` is positional, so fail loudly if the export ever puts a
# different column first instead of silently editing the wrong rows.
assert rc_c.index.name == 'record_id', (
    f"expected 'record_id' as first column of child.csv, got {rc_c.index.name!r}"
)

In [None]:
# The form timestamp holds the literal text '[not completed]' for unfinished
# forms; blank those out and parse the remaining values as datetimes.
completed_at = rc_c['mira2_lapsen_taustatieto_ja_ruoankyttkysely_timestamp']
rc_c['date_q'] = pd.to_datetime(
    completed_at.where(completed_at != '[not completed]')
)

In [None]:
# Helper for manual data corrections: changes one cell of `rc_c`, addressed by
# row label (REDCap record_id) and column name, and logs the change.
def set_col_for_record(record_id, col, val):
    """Overwrite a single cell of ``rc_c`` and print an audit line.

    record_id -- index label of the row to change
    col       -- name of the column to change
    val       -- new value for the cell

    The printed line has the form
    ``record_id, id_family, id_child, col: old --> new``. All three values are
    read before the assignment, so when an id column itself is corrected the
    line shows the id as it was before the change.
    """
    # `.at` addresses exactly one cell by index label and column name.
    previous, family_id, child_id = (
        rc_c.at[record_id, name] for name in (col, 'id_family', 'id_child')
    )

    rc_c.at[record_id, col] = val

    print(f'{record_id}, {family_id}, {child_id}, {col}: {previous} --> {val}')

In [None]:
# Override the birthday stored for record 39, then parse the whole column
# into datetimes.
set_col_for_record(record_id=39, col='birthday', val='2015-08-12')

rc_c['birthday'] = pd.to_datetime(rc_c['birthday'])

In [None]:
# The form of M3339 is flagged as not completed, but the answers are valid and
# the fill-in date was confirmed directly with the participant.
set_col_for_record(record_id=70, col='date_q', val='2022-06-21')


In [None]:
# Drop incomplete answers: keep only rows that have a questionnaire date.
# The explicit copy makes `rc_c` an independent frame, so the column
# assignments in the following cells do not write into a slice of the
# unfiltered data (avoids SettingWithCopyWarning).
rc_c = rc_c.loc[rc_c.date_q.notna()].copy()

In [None]:
# Fixing known input errors.

# Normalise the case of both id columns: first character upper case, the
# rest lower case.
for id_col in ('id_family', 'id_child'):
    rc_c[id_col] = rc_c[id_col].str.capitalize()

In [None]:
# Helper functions that define what a valid participant id looks like.

def is_valid_id(s):
    """Return True if `s` is a string of one character followed by four ASCII digits.

    Non-string values (e.g. NaN for a missing id) are invalid. Only the digits
    0-9 are accepted: `str.isdigit()` alone would also accept superscripts and
    digits from other scripts, which are not valid in an id.
    """
    if not isinstance(s, str) or len(s) != 5:
        return False
    digits = s[1:]
    return digits.isascii() and digits.isdigit()

def is_valid_id_family(s):
    """Return True if `s` is a valid id that starts with 'P' (family id)."""
    return is_valid_id(s) and s.startswith('P')

def is_valid_id_child(s):
    """Return True if `s` is a valid id that starts with 'M' (child id)."""
    return is_valid_id(s) and s.startswith('M')

def erroneous_ids():
    """Return the id columns of the rows in ``rc_c`` that still need fixing.

    A row is reported when it has a questionnaire date and its family id or
    its child id fails validation. The result contains only the columns
    ``id_family`` and ``id_child``.
    """
    family_ok = rc_c['id_family'].apply(is_valid_id_family)
    child_ok = rc_c['id_child'].apply(is_valid_id_child)

    # "family invalid or child invalid" is the same as "not both valid"
    needs_fix = rc_c['date_q'].notna() & ~(family_ok & child_ok)

    return rc_c.loc[needs_fix, ['id_family', 'id_child']]
    

In [None]:
# Show the rows whose family or child id fails validation, before the manual fixes below.
erroneous_ids()

In [None]:
# M3309: the leading 'P' was missing from the family id.
set_col_for_record(record_id=65, col='id_family', val='P1001')

In [None]:
# M3310: the leading 'P' was missing from the family id.
set_col_for_record(record_id=66, col='id_family', val='P1001')

In [None]:
# M3313: the leading 'P' of the family id had been entered as a zero.
set_col_for_record(record_id=57, col='id_family', val='P1002')

In [None]:
# M3400: the family id was incorrect.
set_col_for_record(record_id=78, col='id_family', val='P1030')

In [None]:
# M3416: the family id had the letter 'i' in place of the digit 1.
set_col_for_record(record_id=13, col='id_family', val='P1035')

In [None]:
# The family id had been entered as M3495.
set_col_for_record(record_id=37, col='id_family', val='P1060')

In [None]:
# The family id had been entered as M3501.
set_col_for_record(record_id=11, col='id_family', val='P1062')

In [None]:
# Swap the two ids on rows where they were entered the wrong way round, i.e.
# the family column holds an 'M...' id and the child column a 'P...' id.
# `na=False` makes a missing id count as "no match", so the mask is always a
# plain boolean Series even when an id is missing.
m = (
    rc_c.id_family.str.startswith('M', na=False)
    & rc_c.id_child.str.startswith('P', na=False)
)

# Assign the raw values: a DataFrame on the right-hand side would be aligned
# on column names, which would undo the swap.
rc_c.loc[m, ['id_family', 'id_child']] = (
    rc_c.loc[m, ['id_child', 'id_family']].to_numpy())

In [None]:
# Re-check after the manual fixes: any row still listed here has an invalid family or child id.
erroneous_ids()

In [None]:
# Age in whole days when the questionnaire was filled in. The time of day is
# stripped from the questionnaire timestamp before subtracting the birthday.
questionnaire_day = rc_c['date_q'].dt.normalize()
rc_c['age_q'] = (questionnaire_day - rc_c['birthday']).dt.days

# Diet classification of participants

Study participants have filled in REDCap survey forms that differ for children and their caregivers. Both forms contain multiple-choice questions that specify more precisely which diet the participant adheres to.

This code contains all interpretations of survey answers used to group participants into diet categories.


## Form for Children
Same row of check boxes was presented for both home and daycare diets:


- Ei erityisruokavaliota tai välttämisruokavaliota
- Laktoositon tai vähälaktoosinen ruokavalio
- Gluteeniton ruokavalio (vältetään vehnää,ruista ja ohraa)
- Ruokavalio, joka ei sisällä punaista lihaa
- Vegaaninen ruokavalio (ei sisällä mitään eläinperäisiä tuotteita)
- Kasvisruokavalio, joka sisältää yhtä tai useampaa seuraavista eläinkunnan tuotteista: kalaa, kananmunaa ja/tai maitotuotteita
- Ruokarajoituksia uskonnollisista syistä
- Muu ruokavalio

The code below encodes selections as `1`s and unchecked boxes as `0`s. For each form, we get an eight-character string of `0`s and `1`s. All combinations found are mapped to a diet category according to the research group's interpretation.

In [None]:
# Tick pattern -> diet category id. A key is the eight checkbox values of one
# form row concatenated in column order (___1 ... ___8).
# NOTE(review): the per-entry remarks assume the columns follow the order of
# the checkbox list in the text above - confirm against the REDCap codebook.
diet_id = {
    '00001000': 0,  # vegan
    '00000100': 1,  # vegetarian
    '00001100': 1,  # contradictory answer: vegan + vegetarian
    '10000100': 1,  # contradictory answer: no special diet + vegetarian
    '01000100': 2,  # lactose-free + vegetarian
    '00010100': 3,  # no red meat + vegetarian
    '01000000': 4,  # lactose-free
    '10000001': 5,  # no special diet + other diet
    '10000000': 6,  # no special diet
}

# Diet category id -> detailed label; the list position is the category id.
diet_name = dict(enumerate([
    'vegan',
    'vegetarian',
    'vegetarian lactose-free',
    'vegetarian no red meat',
    'mixed diet lactose-free',
    'mixed diet other diet',
    'mixed diet',
]))

# Diet category id -> main diet group.
diet_main = {
    **dict.fromkeys([0], 'vegan'),
    **dict.fromkeys([1, 2, 3], 'vegetarian'),
    **dict.fromkeys([4, 5, 6], 'mixed_diet'),
}

In [None]:
# Build one tick string per form row by concatenating the eight checkbox
# columns ___1 ... ___8, for the daycare ('dc') and the home diet.
for place in ('dc', 'home'):
    pattern = rc_c[f'diet_{place}___1'].astype(str)
    for box in range(2, 9):
        pattern = pattern + rc_c[f'diet_{place}___{box}'].astype(str)
    rc_c[f'diet_{place}_ticks'] = pattern

# Translate the tick strings into a category id and its label. Patterns that
# are not listed in `diet_id` stay unmapped (NaN).
for place in ('dc', 'home'):
    rc_c[f'diet_{place}_id'] = rc_c[f'diet_{place}_ticks'].map(diet_id)
    rc_c[f'diet_{place}'] = rc_c[f'diet_{place}_id'].map(diet_name)

# The main diet is taken from the larger of the two category ids. Ids run from
# vegan (0) to mixed diet (6), and a missing id is skipped by `max`.
larger_id = rc_c[['diet_dc_id', 'diet_home_id']].max(axis=1)
rc_c['diet_main'] = larger_id.map(diet_main)

## Caregivers

This data is not used yet.

# Participants master

Combining person-level data to a single table without PII.

For data hygiene, only explicitly selected columns are written out to the master data file in `data/main`.

In [None]:
# Step 1: attach the sample information to the consent sheet by participant
# id. The outer join keeps ids that occur in only one of the two sheets.
consent_and_samples = pd.merge(consent, reserch_steps, how='outer', on='id')

# Step 2: attach the child questionnaire. A row matches when both the child id
# and the family id agree; unmatched rows from either side are kept.
participants = pd.merge(
    consent_and_samples,
    rc_c,
    how='outer',
    left_on=['id', 'id_family'],
    right_on=['id_child', 'id_family'],
)

In [None]:
# Age in whole days on the date of informed consent.
since_birth_at_consent = participants['consent_date'] - participants['birthday']
participants['age_informed_consent'] = since_birth_at_consent.dt.days

In [None]:
# Age in whole days on the day of the blood sample. Entries in `date_blood`
# that cannot be parsed as a date become missing instead of raising an error.
blood_day = pd.to_datetime(participants['date_blood'], errors='coerce')
participants['age_blood'] = (blood_day - participants['birthday']).dt.days

In [None]:
# Explicit allow-list of the columns that go into the master file, grouped by
# the table they come from. Everything not listed here is dropped.
usecols = [
    # consent sheet and derived columns
    'id', 'id_family', 'age_informed_consent', 'diet_reported',
    'daycare_municipal', 'discontinued', 'dc_id', 'dc_group_id',
    # sampling
    'age_blood', 'urine_sample', 'fecal_sample',
    # child questionnaire
    'sex', 'age_q',
    'diet_dc_id', 'diet_dc', 'diet_home_id', 'diet_home', 'diet_main',
]

participants = participants.loc[:, usecols]

# Food intake records

In [None]:
# Full list of recorded food intakes. The columns below are removed right
# after loading.
# NOTE(review): presumably these are the identifying columns of the report
# (the first one is the date of birth) - confirm that no other column of the
# sheet carries PII.
dropped_columns = [
    'Syntymäaika',
    'Ryhmätagi',
    'Ruokapäiväkirjaryhmä',
    'Ruokapäiväkirjan nimi',
]

food_records_full = pd.read_excel(
    '../data/RAW/aromi_product_ingredient.xlsx',
    sheet_name='Tutkimusraportti',
)
food_records = food_records_full.drop(columns=dropped_columns)

In [None]:
# Parse the meal time into a datetime column and drop the raw column.
food_records['timestamp'] = pd.to_datetime(food_records.Ruokailuaika)
food_records = food_records.drop(columns=['Ruokailuaika'])

# One code per row: `Tuotetunnus` where present, otherwise
# `Reseptin/tuotteen tunnus`.
food_records['food_code'] = food_records['Tuotetunnus'].fillna(food_records['Reseptin/tuotteen tunnus'])

# Manually created list of the animal proportion of food items.
fap = pd.read_excel(
    '../data/public/food_animal_proportion.xlsx',
    sheet_name='manual_entry',
    usecols=('food_code','food_animal_proportion')
)

# Left join: every food record is kept, and codes without an entry in the
# list get a missing proportion.
# `validate` makes the merge fail if a food_code occurs more than once in the
# manual list. Such a duplicate would otherwise silently duplicate food
# records and inflate every per-person sum computed from them.
food_records = food_records.merge(
    fap,
    on='food_code',
    how='left',
    validate='many_to_one'
).copy() #getting rid of fragmentation warning

# Energy of the record weighted by its animal proportion; missing where the
# proportion is missing.
food_records['ase'] = food_records.ENERC * food_records.food_animal_proportion
len(food_records.index)

In [None]:
# Sum every numeric column per participant (`Tunnus`).
person_totals = food_records.groupby(['Tunnus']).sum(numeric_only=True)

# Sums of these columns are not used further on.
intakes_per_person = person_totals.drop(
    columns=['Kulutettu määrä', 'Käyttömäärä', 'food_animal_proportion']
)

# Share of the summed energy that is animal-weighted.
# NOTE(review): records without a known animal proportion add nothing to
# `ase` (missing values are skipped by the sum) but still count in `ENERC`.
intakes_per_person['asep'] = intakes_per_person['ase'].div(intakes_per_person['ENERC'])
len(intakes_per_person)

In [None]:
# Food record time series indicators per participant: first and last
# timestamp, and the number of distinct calendar days with a record.
df = food_records[['Tunnus', 'timestamp']].assign(
    date=lambda records: records.timestamp.dt.date
)

fr_dates = df.groupby('Tunnus').agg(
    fr_start=('timestamp', 'min'),
    fr_end=('timestamp', 'max'),
    fr_days=('date', 'nunique'),
)
#fr_dates['fr_duration'] = fr_dates.fr_end - fr_dates.fr_start

# Both tables are indexed by `Tunnus`; keep the participants present in both.
intakes_per_person = intakes_per_person.join(fr_dates, how='inner')
len(intakes_per_person)

In [None]:
# The essential micronutrients are sums over the whole food record and need
# to be normalised to daily intakes. The total is divided by the number of
# distinct days in the record.
micro_nutrients = [
    'VITC',
    'F20D5N3',
    'F22D6N3',
    'FE',
    'FOL',
    'CA',
    'F18D2CN6',
    'F18D3N3'
]

daily_means = intakes_per_person[micro_nutrients].div(
    intakes_per_person['fr_days'], axis=0
)
for nutrient in micro_nutrients:
    intakes_per_person[f'{nutrient}_daily_mean'] = daily_means[nutrient]

In [None]:
# Energy densities of the macronutrients.
# NOTE(review): presumably kJ per gram, matching the unit of ENERC - confirm.
e_densities = dict(FAT=37, FAPU=37, FASAT=37, CHOAVL=17, PROT=17, FIBC=8)

for nutrient, density in e_densities.items():
    energy_col, share_col = f'e-{nutrient}', f'ep-{nutrient}'
    # total energy contributed by this macronutrient
    intakes_per_person[energy_col] = intakes_per_person[nutrient] * density
    # the same energy as a proportion of the total energy intake
    intakes_per_person[share_col] = (
        intakes_per_person[energy_col] / intakes_per_person['ENERC']
    )


In [None]:
# CHOLE relative to energy intake, in mg/MJ. ENERC is divided by 1000 first.
energy_mj = intakes_per_person['ENERC'] / 1000
intakes_per_person['CHOLE_per_e'] = intakes_per_person['CHOLE'] / energy_mj  # mg/MJ

In [None]:
# FIBC relative to energy intake, in g/MJ. ENERC is divided by 1000 first.
energy_mj = intakes_per_person['ENERC'] / 1000
intakes_per_person['FIBC_per_e'] = intakes_per_person['FIBC'] / energy_mj  # g/MJ

# Master data file for analysis

Exporting merged tables as one file.

In [None]:
# Combine the participant table with the per-person intakes, matching the
# participant `id` against the index of the intake table. This is an inner
# join (the pandas default, spelled out here): only participants with food
# records end up in the master file.
master_data = participants.merge(
    intakes_per_person,
    how='inner',
    left_on='id',
    right_index=True,
)

# Write the same table as CSV and as Excel, without the pandas index.
master_data.to_csv('../data/main/master.csv', index=False)
master_data.to_excel('../data/main/master.xlsx', index=False)