In [1]:
import numpy as np
import pandas as pd

In [2]:
# Master switch for this notebook: when True, the CT columns of both the EDP
# and the DRC data are z-score normalized by the cells below; when False,
# the CT values are passed through unchanged.
normalize_ct = False

In [3]:
# Normalizing EDP CT

# Only consulted when normalize_ct is True.
# True  -> CT is standardized per site / per group of sites, using the helper
#          functions defined in this cell.
# False -> CT is standardized once, with the mean and standard deviation taken
#          over all EDP rows.
separate_sites = False

def _normalize_ct_rows(cond, *label):
    """Z-score the CT values of the rows selected by ``cond``, in place.

    The mean and standard deviation (pandas defaults for ``Series.mean`` and
    ``Series.std``) are computed over the selected rows of the module-level
    ``edp_data`` frame only, and only those rows are overwritten; every other
    row keeps its current CT value.

    Parameters
    ----------
    cond : boolean Series
        Row mask aligned with ``edp_data``.
    *label
        Leading fields of the summary line printed after normalizing; the
        mean and standard deviation that were used are appended to them.
    """
    # Work on the selected rows only instead of transforming the whole column
    # and relying on index alignment to discard the rest.
    ct_values = edp_data.loc[cond, 'CT']
    ct_mean = ct_values.mean()
    ct_std = ct_values.std()
    edp_data.loc[cond, 'CT'] = (ct_values - ct_mean) / ct_std
    print(*label, ct_mean, ct_std)


def normalize_ct_site(site):
    """Standardize CT for the rows whose SiteID equals ``site``.

    Modifies the module-level ``edp_data`` frame in place and prints the site
    name followed by the mean and standard deviation that were used.
    """
    _normalize_ct_rows(edp_data.SiteID == site, "EDP CT", site)


def normalize_ct_site_lb():
    """Standardize CT jointly for the Liberia sites ('1-Bong', '3-Margibi').

    Both sites share one mean and one standard deviation. Modifies the
    module-level ``edp_data`` frame in place and prints the statistics used.
    """
    liberia_rows = edp_data.SiteID.isin(['1-Bong', '3-Margibi'])
    _normalize_ct_rows(liberia_rows, "EDP CT Liberia")


def normalize_ct_site_sl():
    """Standardize CT jointly for the Sierra Leone sites.

    The sites '2-Lunsar', '4-Kambia' and '5-Makeni' share one mean and one
    standard deviation. Modifies the module-level ``edp_data`` frame in place
    and prints the statistics used.
    """
    sierra_leone_rows = edp_data.SiteID.isin(['2-Lunsar', '4-Kambia', '5-Makeni'])
    _normalize_ct_rows(sierra_leone_rows, "EDP CT Sierra Leone")
    
# Load the original EDP export; the literal "\N" marks a missing value.
edp_data = pd.read_csv('edp-data-orig.csv', na_values="\\N")

# Pooled CT statistics over all sites, reported whether or not we normalize.
edp_ct_mean, edp_ct_std = edp_data['CT'].mean(), edp_data['CT'].std()
print("EDP overall CT", edp_ct_mean, edp_ct_std)

if normalize_ct and not separate_sites:
    # A single z-score over the pooled data set.
    edp_data['CT'] = edp_data['CT'].sub(edp_ct_mean).div(edp_ct_std)
elif normalize_ct:
    # Non-IMC ETUs: every site is standardized on its own.
    for site in ('CONAKRY', 'CTE-NZEREKORE', 'ELWA3', 'MSF_Bo'):
        normalize_ct_site(site)

    # IMC ETUs
    #
    # PCR assays used in Liberia: two Ebola virus disease gene targets (the
    # EBOV Zaire locus and the minor groove binding locus).
    # PCR assays used in Sierra Leone: Lunsar and Makeni tested a single EVD
    # gene target (EBOV Zaire nucleoprotein region), while Kambia used two
    # targets (the EBOV Zaire target and the minor groove binding target).

    # Liberia ETUs are standardized together.
    # Open question: standardize them separately instead, i.e.
    #   normalize_ct_site('1-Bong')
    #   normalize_ct_site('3-Margibi')
    normalize_ct_site_lb()

    # Sierra Leone ETUs are standardized together.
    # Open question: standardize them separately instead, i.e.
    #   normalize_ct_site('2-Lunsar')
    #   normalize_ct_site('4-Kambia')
    #   normalize_ct_site('5-Makeni')
    normalize_ct_site_sl()

EDP overall CT 25.59675410347542 5.792205411383621


In [4]:
# Standardizing the DRC CT columns

drc_data = pd.read_csv('drc-data-orig.csv')

# Statistics of the two CT columns, reported whether or not we normalize.
drc_ctgp_mean, drc_ctgp_std = drc_data['Lab.CtGP.D1'].mean(), drc_data['Lab.CtGP.D1'].std()
drc_ctnp_mean, drc_ctnp_std = drc_data['Lab.CtNP.D1'].mean(), drc_data['Lab.CtNP.D1'].std()

print("DRC CtGP", drc_ctgp_mean, drc_ctgp_std)
print("DRC CtNP", drc_ctnp_mean, drc_ctnp_std)

if normalize_ct:
    # Each column is z-scored with its own mean and standard deviation.
    drc_data['Lab.CtGP.D1'] = drc_data['Lab.CtGP.D1'].sub(drc_ctgp_mean).div(drc_ctgp_std)
    drc_data['Lab.CtNP.D1'] = drc_data['Lab.CtNP.D1'].sub(drc_ctnp_mean).div(drc_ctnp_std)

# Expose the NP column under 'CT', the column name the EDP frame uses.
drc_data['CT'] = drc_data['Lab.CtNP.D1']

DRC CtGP 25.11208791208792 6.278337273953904
DRC CtNP 20.73076923076923 6.031264697072565


In [5]:
# Flip the outcome coding of the DRC data: here 1 means survival and 0 means
# death, while the prognostic model needs the opposite.
# NOTE: the column is overwritten in place, so running this cell a second time
# without reloading drc_data flips the values back.
drc_data['S7.FinalStatus'] = drc_data['S7.FinalStatus'].rsub(1)

In [6]:
# Write both processed frames to disk; the row index is not written and
# missing values are encoded as the literal "\N".
for frame, out_path in ((edp_data, 'edp-data.csv'), (drc_data, 'drc-data.csv')):
    frame.to_csv(out_path, index=False, na_rep="\\N")

In [7]:
# Show the EDP frame as it was written to disk, for a visual check.
edp_data

Unnamed: 0,SiteID,LOS,Death,PatientAge,PatientSex,CT,AbdominalPain,Anorexia,AnyBleeding,JointPain,...,Vomit,Diarrhoea,Breathlessness,Headache,SwallowingProblems,Fever,Hiccups,Nausea,Conjunctivitis,Malaria
0,CONAKRY,12.0,0,4.00,1,,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,,
1,CONAKRY,6.0,1,3.00,1,,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,
2,CONAKRY,13.0,0,15.00,0,,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,,
3,CONAKRY,14.0,0,16.00,1,,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,
4,CONAKRY,6.0,1,2.00,0,,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574,5-Makeni,2.0,1,13.00,0,18.6,0.0,,1.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
575,5-Makeni,1.0,1,10.00,1,,0.0,,0.0,,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,,
576,5-Makeni,11.0,1,0.58,0,,0.0,,0.0,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,
577,5-Makeni,2.0,1,16.00,0,17.1,1.0,,1.0,,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,,0.0


In [8]:
# Show the DRC frame as it was written to disk, for a visual check.
drc_data

Unnamed: 0,AD.PatientID,AD.Age,AD.Sex,AD.ResidenceHealthZone,AD.Date,AD.SympDate,AD.Condition,AD.Pregnant,AD.Breastfeeding,AD.PregTest,...,Lab.MXD%_visit6,Lab.NEUT%_visit6,Lab.LYM#_visit6,Lab.MXD#_visit6,Lab.NEUT#_visit6,Lab.RDWSD_visit6,Lab.RDWCV_visit6,Lab.MPV_visit6,LOS_bin,CT
0,122,13.000,0,Mabalako,17-Dec-18,10-Dec-18,1.0,0,,,...,,,,,,,,,>48h,27.2
1,295,14.000,0,Mandima,28-Feb-19,22-Feb-19,0.0,0,,,...,,,,,,,,,>48h,18.5
2,299,5.000,0,Mandima,2-Mar-19,26-Feb-19,0.0,0,,,...,,,,,,,,,>48h,18.1
3,314,10.000,1,Mabalako,7-Dec-18,1-Dec-18,0.0,0,,,...,,,,,,,,,<24h,15.8
4,318,0.250,0,Mandima,19-Mar-19,18-Mar-19,0.0,0,,,...,,,,,,,,,>48h,18.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,2860,0.167,1,Mabalako,8-Dec-19,6-Dec-19,,0,,,...,,,,,,,,,>48h,22.1
94,2880,1.000,0,Mabalako,9-Dec-19,7-Dec-19,,0,,,...,,,,,,,,,>48h,18.5
95,3066,13.000,1,Mabalako,22-Dec-19,19-Dec-19,0.0,0,,,...,,,,,,,,,<24h,16.6
96,3091,0.580,1,Mabalako,24-Dec-19,17-Dec-19,1.0,0,,,...,,,,,,,,,24-48h,19.1
