In [1]:
import os

import pandas as pd

# local file
from src.util.data_handling.data_loader import save_as_pickle 

# IBD Data

In [2]:
def process_ibd(ibd_data_path, ibd_metadata_path):
    """Load and clean the IBD data table and its sample metadata.

    Rows whose metadata value in ``'Did the subject withdraw from the study?'``
    is not exactly ``'No'`` are removed from both tables. The metadata is then
    reduced to a fixed set of columns, missing values are replaced with ``0``,
    and the columns are renamed. Both tables are returned indexed by
    ``'Sample'``.

    Parameters
    ----------
    ibd_data_path : str
        Path to the IBD data CSV. It must contain the columns ``sample``,
        ``site``, ``patient`` and ``visit``; the last three are dropped.
    ibd_metadata_path : str
        Path to the IBD metadata CSV. It must contain the withdrawal column
        and every key of ``kept_metadata`` below.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(ibd_data, ibd_metadata)``, each indexed by ``'Sample'``.

    Raises
    ------
    KeyError
        If a required column is missing from either CSV.
    """
    # Metadata columns to keep (in this order), mapped to their output names.
    # 'Project', 'External ID' and 'ProjectSpecificID' are not kept.
    kept_metadata = {
        'sample': 'Sample',
        'Participant ID': 'Participant',
        'date_of_receipt': 'Sample Collection Date',
        'visit_num': 'Visit Number',
        'site_name': 'Hospital',
        'consent_age': 'Age',
        # UC = ulcerative colitis, CD = Crohn disease;
        # https://emedicine.medscape.com/article/179037-overview
        'diagnosis': 'Diagnosis',
        # Harvey-Bradshaw Index;
        # https://globalrph.com/medcalcs/harvey-bradshaw-index-measuring-crohns-disease/
        'hbi': 'HBI',
        'sex': 'Sex',
        'race': 'Race',
        # Fecal Calprotectin Test;
        # https://www.verywellhealth.com/how-the-fecal-calprotectin-test-is-used-in-ibd-4140079
        'fecalcal': 'Fecalcal',
        # Simple clinical colitis activity index;
        # https://en.wikipedia.org/wiki/Simple_clinical_colitis_activity_index
        'sccai': 'SCCAI',
    }

    # load data
    ibd_data = pd.read_csv(ibd_data_path)
    ibd_metadata = pd.read_csv(ibd_metadata_path)

    # keep only participants who did not withdraw from the study
    # NOTE(review): the mask is built from the metadata and applied to the data
    # table as well, so both files are assumed to list samples in the same row
    # order -- confirm against the raw files.
    completed = ibd_metadata['Did the subject withdraw from the study?'] == 'No'
    ibd_data = ibd_data[completed]
    ibd_metadata = ibd_metadata[completed]

    # metadata: select the kept columns first so NaN filling only touches
    # columns that survive, then replace NaNs with 0, rename and index
    ibd_metadata = (
        ibd_metadata[list(kept_metadata)]
        .fillna(0)
        .rename(columns=kept_metadata)
        .set_index('Sample')
    )

    # data: rename and drop unnecessary columns without mutating the filtered
    # frame in place, then index by sample id
    ibd_data = (
        ibd_data
        .rename(columns={'sample': 'Sample'})
        .drop(columns=['site', 'patient', 'visit'])
        .set_index('Sample')
    )

    return ibd_data, ibd_metadata

# T2D Data

In [3]:
def process_t2d(t2d_data_path, t2d_metadata_path):
    """Load the T2D data and metadata CSVs and index both by sample id.

    In the data table the sample id is built as ``"<patient>-<visit>"`` and the
    ``patient``, ``visit``, ``site`` and ``sample`` columns are dropped. In the
    metadata table ``VisitID`` becomes the sample id, a boolean ``'Healthy'``
    column (``Event == 'Healthy'``) is appended, and the event/study columns
    are dropped.

    Parameters
    ----------
    t2d_data_path : str
        Path to the T2D data CSV.
    t2d_metadata_path : str
        Path to the T2D metadata CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(t2d_data, t2d_metadata)``, each indexed by ``'Sample'``.
    """
    # metadata columns removed from the output
    dropped_columns = [
        'Event',
        'Event_Note1',
        'Event_Note2',
        'Event_Note3',
        'SubStudy',
        'Study',
    ]
    # metadata columns that get a new name in the output
    new_names = {
        'VisitID': 'Sample',
        'SubjectID': 'Participant',
        'CollectionDate': 'Date (since trial start)',
        'IR_IS_classification': 'Diabetes Class',
    }

    # read files
    raw_data = pd.read_csv(t2d_data_path)
    raw_metadata = pd.read_csv(t2d_metadata_path)

    # note: duplicated rows are not removed by this function

    # data: create the sample id from patient id and visit number
    sample_ids = raw_data['patient'].astype(str) + '-' + raw_data['visit'].astype(str)
    t2d_data = (
        raw_data
        .assign(Sample=sample_ids)
        .drop(columns=['patient', 'visit', 'site', 'sample'])
        .set_index('Sample')
    )

    # metadata: binarize the event column, then rename, drop and index
    is_healthy = raw_metadata['Event'] == 'Healthy'
    t2d_metadata = (
        raw_metadata
        .assign(Healthy=is_healthy)
        .rename(columns=new_names)
        .drop(columns=dropped_columns)
        .set_index('Sample')
    )

    return t2d_data, t2d_metadata

# MOMS Data

In [4]:
def process_moms(moms_data_path, moms_metadata_path):
    """Load the MOMS data and metadata CSVs and index both by sample id.

    The metadata is reduced to five columns and renamed, and the values of its
    ``'Sex'`` column are capitalized. The data table drops its ``site``,
    ``patient``, ``visit`` and ``sample`` columns and takes its sample ids from
    the metadata ``sample_id`` column.

    Parameters
    ----------
    moms_data_path : str
        Path to the MOMS data CSV.
    moms_metadata_path : str
        Path to the MOMS metadata CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(moms_data, moms_metadata)``, each indexed by ``'Sample'``.
    """
    # metadata columns to keep (in this order), mapped to their output names
    # NOTE(review): the subject column is named 'Patient' here, while
    # process_ibd and process_t2d name theirs 'Participant' -- confirm whether
    # downstream code relies on the difference.
    kept_metadata = {
        'sample_id': 'Sample',
        'sample_body_site': 'Sample Site',
        'subject_id': 'Patient',
        'visit_number': 'Visit Number',
        'subject_gender': 'Sex',
    }

    # load files
    raw_data = pd.read_csv(moms_data_path)
    raw_metadata = pd.read_csv(moms_metadata_path)

    # metadata: select the kept columns and rename them
    metadata = raw_metadata[list(kept_metadata)].rename(columns=kept_metadata)

    # data: drop unneeded columns and attach the metadata sample ids
    # (the assignment aligns the two tables on their row index)
    data = (
        raw_data
        .drop(columns=['site', 'patient', 'visit', 'sample'])
        .assign(Sample=metadata['Sample'])
    )

    # capitalize values in the metadata Sex column
    metadata = metadata.assign(Sex=metadata['Sex'].str.capitalize())

    # make sample id the index
    return data.set_index('Sample'), metadata.set_index('Sample')

# Save Data & Metadata

In [5]:
def main(ihmp_dir, outdir):
    """Process the IBD, T2D and MOMS tables and save them as CSV files.

    For each cohort, ``<cohort>_data.csv`` and ``<cohort>_metadata.csv`` are
    read from ``ihmp_dir`` and passed to the matching ``process_*`` function.
    The six resulting frames are written to ``outdir`` as ``<name>.csv``.

    Parameters
    ----------
    ihmp_dir : str or os.PathLike
        Directory containing the six raw input CSV files.
    outdir : str or os.PathLike
        Directory the processed CSV files are written to. It is created
        (including parents) if it does not exist yet.

    Returns
    -------
    dict
        Maps ``'ibd_data'``, ``'ibd_metadata'``, ``'t2d_data'``,
        ``'t2d_metadata'``, ``'moms_data'`` and ``'moms_metadata'`` (in that
        order) to the processed DataFrames.
    """
    # cohort prefix -> function that cleans that cohort's (data, metadata) pair
    processors = {
        'ibd': process_ibd,
        't2d': process_t2d,
        'moms': process_moms,
    }

    # process the data
    name_to_df = {}
    for cohort, process in processors.items():
        print('Processing {} data...'.format(cohort), end='\t\t')
        data, metadata = process(
            os.path.join(ihmp_dir, '{}_data.csv'.format(cohort)),
            os.path.join(ihmp_dir, '{}_metadata.csv'.format(cohort)),
        )
        print('Done.')
        name_to_df['{}_data'.format(cohort)] = data
        name_to_df['{}_metadata'.format(cohort)] = metadata

    # save the data; to_csv does not create missing directories, so make sure
    # the output directory exists first
    os.makedirs(outdir, exist_ok=True)
    for name, df in name_to_df.items():
        df.to_csv(os.path.join(outdir, '{}.csv'.format(name)))

    return name_to_df

In [6]:
# Directory main() reads the raw <cohort>_data.csv / <cohort>_metadata.csv files from.
IHMP_DIR = '../data/raw/ihmp'
# Directory main() writes the processed CSV files to.
OUTDIR = '../data/interim/ihmp'

# Run the whole pipeline and keep the returned name -> DataFrame mapping.
name_to_df = main(IHMP_DIR, OUTDIR)

Processing ibd data...		Done.
Processing t2d data...		Done.
Processing moms data...		Done.
