In [1]:
import os, sys, glob
import numpy as np
import pandas as pd
from data import load_mmrf

In [2]:
dset = load_mmrf(fold_span = [1], suffix='_2mos_tr')

loading from: /afs/csail.mit.edu/u/z/zeshanmh/research/ief/data/ml_mmrf/ml_mmrf_v1/cleaned_mm_fold_2mos_tr.pkl


In [3]:
FDIR  = '/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles'
data_files = {}
for fullname in glob.glob(FDIR+'/*.csv'):
    print (fullname)
    fname = os.path.basename(fullname).split('.')[0]
    if 'MMRF_CoMMpass_IA13_' in fname:
        kname = fname.split('MMRF_CoMMpass_IA13_')[1]
    else:
        kname = fname
    data_files[kname] = pd.read_csv(fullname, delimiter=',', encoding='latin-1')
print (data_files.keys())

/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_PER_PATIENT_VISIT.csv


  interactivity=interactivity, compiler=compiler, result=result)


/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_MEDHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_SURVIVAL.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_EMERGENCY_DEPT.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_PER_PATIENT.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_TRTRESP.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_AE.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_FAMHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMM

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
N = dset[1]['train']['pids'].shape[0]
np.random.seed(0)
shuf = np.random.permutation(N)

p_idx= shuf[3:6]
pids = dset[1]['train']['pids'][p_idx]
print ('outcome time: ',dset[1]['train']['ys_seq'][p_idx,0])
print ('censorship: ',dset[1]['train']['ce'][p_idx,0])
print ('pat. id:',pids)

outcome time:  [1 0 1]
censorship:  [0. 0. 0.]
pat. id: ['MMRF_1257' 'MMRF_1078' 'MMRF_2144']


In [11]:
# Reference values for lab measurements
# min/max/scale (calculated by attempting to ensure that max labs lie b/w 5-8)
healthy_mins_max = {
    'cbc_abs_neut':(2., 7.5,1/3.), # abs neutrophil count (3.67, 1.), (2.83, 4.51)
    'chem_albumin':(34, 50,1/8.), # chemical albumin (43.62, 2.77), (41.30, 45.94)
    'chem_bun':(2.5, 7.1,1/5.), #BUN # reference range, (4.8, 1.15)
    'chem_calcium':(2.2, 2.7,2.), #Calcium, (2.45, 0.125)
    'chem_creatinine':(66, 112,1/36.), # creatinine, (83., 24.85), (62.22, 103.77)
    'chem_glucose':(3.9, 6.9,1/5.), # glucose, (4.91, 0.40), (4.58, 5.24)
    'cbc_hemoglobin':(13., 17.,1), # hemoglobin (12.90, 15.64), (8.86, 1.02)
    'chem_ldh':(2.33, 4.67,1/3.), #LDH, (3.5, 0.585)
    'serum_m_protein':(0.1, 1.1, 1), # M protein (<3 g/dL is MGUS, any presence of protein is pathological); am just using the data mean/std for this, (0.85, 1.89)
    'urine_24hr_m_protein':(0.0, 0.1, 1), # Urine M protein 
    'cbc_platelet':(150, 400,1/60.), # platelet count (206.42, 334.57), (270.5, 76.63)
    'chem_totprot':(6, 8,1/6.), # total protein, (7, 0.5)
    'urine_24hr_total_protein':(0, 0.23, 1), # 
    'cbc_wbc':(3, 10,1/4.), # WBC  (5.71, 8.44), (7.07, 1.63)
    'serum_iga':(0.85, 4.99, 1.), # IgA, (2.92, 1.035)
    'serum_igg':(6.10, 16.16,1/10.), # IgG, (11.13, 2.515)
    'serum_igm':(0.35, 2.42,1), #IgM, (1.385, 0.518)
    'serum_lambda':(0.57, 2.63, 1/2.), #serum lambda, (1.6, 0.515)
    'serum_kappa':(.33, 1.94,1/8.), #serum kappa , (1.135, 0.403)
    'serum_beta2_microglobulin':(0.7, 1.80, 1/3.), #serum_beta2_microglobulin,
    'serum_c_reactive_protein':(0.0, 1., 1.) #serum_c_reactive_protein,
}

## Step 1: Sanity check that longitudinal labs have been captured correctly for a few patients
* Plot the patient's cleaned longitudinal data against their raw data in the files

In [34]:
df = data_files['PER_PATIENT_VISIT']
lab_names = ['D_LAB_cbc_abs_neut', 'D_LAB_chem_albumin', 'D_LAB_chem_bun', 'D_LAB_chem_calcium', 'D_LAB_chem_creatinine',
        'D_LAB_chem_glucose', 'D_LAB_cbc_hemoglobin', 'D_LAB_serum_kappa', 'D_LAB_chem_ldh', 'D_LAB_serum_m_protein', 'D_LAB_cbc_platelet',
        'D_LAB_chem_totprot', 'D_LAB_cbc_wbc', 'D_LAB_serum_iga', 'D_LAB_serum_igg', 'D_LAB_serum_igm', 'D_LAB_serum_beta2_microglobulin',
        'D_LAB_serum_lambda', 'D_LAB_urine_24hr_m_protein', 'D_LAB_urine_24hr_total_protein',
        'D_LAB_serum_c_reactive_protein']
df = df[['PUBLIC_ID','VISITDY']+lab_names]
# df = df[df['PUBLIC_ID'].isin(pids.ravel())].reset_index(drop=True)
df.rename(columns = dict([(k,k.split('D_LAB_')[1]) for k in df.columns if 'D_LAB' in k]), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [42]:
IDX= 0
pname= pids[IDX]
pidx = p_idx[IDX]
print ('Patient ',pname,pidx)
print(df[df.PUBLIC_ID == 'MMRF_1078'][['PUBLIC_ID','serum_m_protein']])
p0 = df[df.PUBLIC_ID==pname]
print(p0[['PUBLIC_ID', 'VISITDY', 'serum_m_protein']])
for cval in p0.columns[1:]:
    if cval in healthy_mins_max:
        p0.loc[:,cval] = (p0[cval]-healthy_mins_max[cval][1])*healthy_mins_max[cval][2]
p0.loc[:,'VISITDY'] = p0.VISITDY//30
p0

# 859	MMRF_2373	Bor-Len-Dex	PD
# 1401	MMRF_1284	Bor-Len-Dex	PD
# 1567	MMRF_1339	Bor-Dex	PD
# 1705	MMRF_1518	Bor-Len-Dex	PD
# 1740	MMRF_1641	Bor-Cyc-Dex	PD
# 2090	MMRF_2204	Len-Dex	PD
# 2234	MMRF_1671	Bor-Cyc-Dex	PD
# 2452	MMRF_2015	Cyc-Dex	PD
# 2835	MMRF_2401	Bor-Thal-Dex	PD
# 2857	MMRF_2300	Bor-Cyc-Pred-Dex	PD

Patient  MMRF_1257 320
      PUBLIC_ID  serum_m_protein
7713  MMRF_1078              NaN
7714  MMRF_1078              NaN
7715  MMRF_1078             4.27
7716  MMRF_1078             0.17
7717  MMRF_1078             0.08
7718  MMRF_1078             0.00
7719  MMRF_1078             0.00
7720  MMRF_1078             0.00
7721  MMRF_1078             0.00
7722  MMRF_1078             0.00
7723  MMRF_1078              NaN
7724  MMRF_1078             0.00
7725  MMRF_1078              NaN
7726  MMRF_1078             0.00
7727  MMRF_1078             0.00
7728  MMRF_1078             0.00
7729  MMRF_1078              NaN
7730  MMRF_1078              NaN
7731  MMRF_1078              NaN
7732  MMRF_1078              NaN
7733  MMRF_1078              NaN
7734  MMRF_1078             0.00
7735  MMRF_1078              NaN
7736  MMRF_1078             0.00
7737  MMRF_1078             0.00
     PUBLIC_ID  VISITDY  serum_m_protein
899  MMRF_1257      NaN              NaN
900  MMRF_1257    -11.0              

Unnamed: 0,PUBLIC_ID,VISITDY,cbc_abs_neut,chem_albumin,chem_bun,chem_calcium,chem_creatinine,chem_glucose,cbc_hemoglobin,serum_kappa,...,chem_totprot,cbc_wbc,serum_iga,serum_igg,serum_igm,serum_beta2_microglobulin,serum_lambda,urine_24hr_m_protein,urine_24hr_total_protein,serum_c_reactive_protein
899,MMRF_1257,,,,,,,,,,...,,,,,,,,,,
900,MMRF_1257,-1.0,,,,,,,,,...,,,,,,,,,,
901,MMRF_1257,-1.0,-0.4,-2.0,-0.1348,-0.2,-0.311778,-0.544,-11.544,-0.150125,...,0.266667,-0.275,54.81,-1.209,-2.21,1.796667,0.205,,,
902,MMRF_1257,2.0,-0.233333,-1.25,0.2222,-1.0,-1.097556,-0.049,-9.684,-0.119625,...,-0.25,-0.025,-3.81,-1.09,-2.19,0.193333,2.5355,-0.1,-0.12,
903,MMRF_1257,5.0,-1.4,-1.0,-0.5632,-0.8,-0.876556,-0.005,-10.552,-0.14225,...,-0.316667,0.0,-4.62,-0.933,-2.29,0.473333,-1.0035,,-0.155,
904,MMRF_1257,9.0,-1.0,-0.125,-0.2776,-0.65,-0.802889,-0.302,-9.498,-0.114375,...,-0.25,-0.2,-4.87,-1.059,-2.24,0.096667,-0.986,,,
905,MMRF_1257,12.0,-1.0,,,,,,-9.932,-0.1455,...,,-0.525,-4.87,-1.047,-2.18,-0.053333,-0.8035,,,
906,MMRF_1257,15.0,-0.933333,-0.5,-0.1348,-0.75,-1.146667,-0.236,-9.684,-0.125375,...,-0.233333,-0.275,-4.72,-0.93,-1.15,,-0.871,,,
907,MMRF_1257,18.0,-0.966667,-0.5,-0.2776,-0.7,-0.901111,-0.478,-9.684,-0.128125,...,-0.216667,-0.2,-4.64,-0.877,-1.57,,-0.8735,,,
908,MMRF_1257,21.0,-0.8,-0.375,-0.2062,-0.75,-0.876556,-0.368,-9.684,-0.118,...,-0.266667,-0.2,-4.6,-0.926,-1.57,,-0.799,,,


In [14]:
print ('Patient ',dset[1]['train']['pids'][pidx])
for fname, val in zip(dset[1]['train']['feature_names'][3:10],dset[1]['train']['b'][pidx,3:10]):
    print (fname,val)

Patient  MMRF_1078
ecog -0.061402757
serum_beta2_microglobulin -0.16423586
PC1 0.33752716
PC2 -0.6550243
PC3 1.9568697
PC4 -1.3910011
PC5 1.1930251


In [15]:
print ('Patient ',dset[1]['train']['pids'][pidx])
for idx,fname in enumerate(dset[1]['train']['feature_names_x']):
    print (idx, fname,':',)
    X = dset[1]['train']['x'][pidx]
    M = dset[1]['train']['m'][pidx]
    strv = ''
    for t in range(X.shape[0]):
        if M[t,idx] == 1:
            strv +='('+str(t)+','+'%.3f'%X[t,idx]+'), '
    print (strv)

Patient  MMRF_1078
0 cbc_abs_neut :
(0,-1.783), (1,-1.030), (2,-1.837), (4,-1.777), (5,-1.493), (7,-1.627), (9,-1.407), (11,-1.710), (13,-1.310), (15,-1.580), (17,-1.653), (19,-1.590), (21,0.980), (23,0.050), (25,-1.370), (27,-1.617), (29,-1.430), 
1 chem_albumin :
(0,-1.500), (1,-1.250), (2,-1.000), (4,-1.250), (5,-1.000), (7,-0.750), (9,-1.125), (11,-0.875), (13,-0.800), (15,-1.125), (17,-1.125), (19,-1.125), (27,-1.000), (29,-1.250), 
2 chem_bun :
(0,-0.420), (1,0.222), (2,-0.420), (4,-0.420), (5,-0.492), (7,-0.135), (9,-0.135), (11,-0.349), (13,-0.420), (15,-0.349), (17,-0.635), (19,-0.135), (21,-0.777), (27,-0.492), (29,-0.492), 
3 chem_calcium :
(0,-1.100), (1,-0.750), (2,-0.800), (4,-0.900), (5,-0.650), (7,-0.750), (9,-1.000), (11,-0.750), (13,-0.600), (15,-0.800), (17,-0.900), (19,-0.950), (21,-1.500), (27,-0.950), (29,-0.900), 
4 chem_creatinine :
(0,-0.999), (1,-0.778), (2,-0.827), (4,-0.901), (5,-0.606), (7,-1.073), (9,-0.926), (11,-0.852), (13,-1.048), (15,-0.778), (17,-0.9

## Step 2: Sanity check that longitudinal lab values have been captured correctly for a few patients
* Plot the patient's longitudinal treatments against their raw data in the files

In [20]:
df = data_files['STAND_ALONE_TRTRESP']
df = df[['public_id','line','trtshnm','trtstdy','trtendy']]
df = df[df['public_id'].isin(pids.ravel())].reset_index(drop=True)
df.loc[:,'trtstdy'] = df.trtstdy//30
df.loc[:,'trtendy'] = df.trtendy//30
df

Unnamed: 0,public_id,line,trtshnm,trtstdy,trtendy
0,MMRF_1257,1,Bor-Cyc-Dex,0,1
1,MMRF_1257,2,Bor-Cyc-Dex,30,40
2,MMRF_1257,3,Ixa-Len-Dex,41,47
3,MMRF_1257,4,Car-Pom-Dex,47,48
4,MMRF_1257,5,Thal-Dar-Pred-Dex,48,51
5,MMRF_1257,5,Thal-Cyc-Dar-Pred-Dex,51,52
6,MMRF_1257,5,Pom-Thal-Cyc-Dar-Pred-Dex,52,52
7,MMRF_1257,9,Car-Pom-Pemb-Bend-Dex,52,55
8,MMRF_2144,1,Bor-Cyc-Dex,0,2
9,MMRF_2144,1,Len,8,30


In [25]:
IDX= 0
pname= pids[IDX]
pidx = p_idx[IDX]
print ('Patient ',pname,pidx)

Patient  MMRF_1257 320


In [26]:
print ('Patient ',dset[1]['train']['pids'][pidx])
for idx,fname in enumerate(dset[1]['train']['feature_names_a']):
    print (idx, fname,':',)
    X = dset[1]['train']['a'][pidx]
    M = dset[1]['train']['m_a'][pidx]
    strv = ''
    for t in range(X.shape[0]):
        if M[t,idx] == 1:
            strv +='('+str(t)+','+'%.3f'%X[t,idx]+'), '
    print (strv)

Patient  MMRF_1257
0 time :
(0,0.100), (15,0.100), (16,0.200), (17,0.300), (18,0.400), (19,0.500), (20,0.100), (21,0.200), (22,0.300), (23,0.400), (24,0.500), (25,0.600), (26,0.700), (27,0.000), 
1 Bor :
(0,1.000), (15,1.000), (16,1.000), (17,1.000), (18,1.000), (19,1.000), (20,0.000), (21,0.000), (22,0.000), (23,0.000), (24,0.000), (25,0.000), (26,0.000), (27,0.000), 
2 Car :
(0,0.000), (15,0.000), (16,0.000), (17,0.000), (18,0.000), (19,0.000), (20,0.000), (21,0.000), (22,0.000), (23,1.000), (24,0.000), (25,0.000), (26,1.000), (27,1.000), 
3 Cyc :
(0,1.000), (15,1.000), (16,1.000), (17,1.000), (18,1.000), (19,1.000), (20,0.000), (21,0.000), (22,0.000), (23,0.000), (24,0.000), (25,1.000), (26,0.000), (27,0.000), 
4 Dex :
(0,1.000), (15,1.000), (16,1.000), (17,1.000), (18,1.000), (19,1.000), (20,1.000), (21,1.000), (22,1.000), (23,1.000), (24,1.000), (25,1.000), (26,1.000), (27,1.000), 
5 Len :
(0,0.000), (15,0.000), (16,0.000), (17,0.000), (18,0.000), (19,0.000), (20,1.000), (21,1.000

IndexError: index 6 is out of bounds for axis 1 with size 6

## Step 3: Sanity check that outcomes have been correctly captured

In [13]:
df = data_files['PER_PATIENT']
df = df[['PUBLIC_ID','D_PT_lstalive', 'D_PT_lastdy', 'D_PT_deathdy']]
df = df[df['PUBLIC_ID'].isin(pids.ravel().tolist())].reset_index(drop=True)
df.loc[:,'D_PT_lastdy'] = df.D_PT_lastdy/30.
df.loc[:,'D_PT_deathdy'] = df.D_PT_deathdy/30.
df.loc[:,'D_PT_lstalive'] = df.D_PT_lstalive/30.
df

Unnamed: 0,PUBLIC_ID,D_PT_lstalive,D_PT_lastdy,D_PT_deathdy
0,MMRF_1016,21.566667,23.066667,23.066667
1,MMRF_1260,35.266667,36.666667,36.666667
2,MMRF_1090,17.533333,22.7,22.7


In [14]:
pids.ravel()

array(['MMRF_1260', 'MMRF_1016', 'MMRF_1090'], dtype=object)

In [15]:
for IDX in range(3):
    pname= pids[IDX]
    pidx = p_idx[IDX]
    print ('Patient ',dset[1]['train']['pids'][pidx])
    print (dset[1]['train']['ys_seq'][pidx])
    print (dset[1]['train']['ce'][pidx])

Patient  MMRF_1260
[36.666666666666664]
[0.0]
Patient  MMRF_1016
[23.066666666666666]
[0.0]
Patient  MMRF_1090
[22.7]
[0.0]


In [16]:
if not os.path.exists('mmrf_csv'):
    os.mkdir('mmrf_csv')
dset = load_mmrf(fold_span = range(5))

loading from: /afs/csail.mit.edu/group/clinicalml/users/rahulgk/ml_mmrf/ml_mmrf_v1/cleaned_mm_fold.pkl


In [5]:
trt_df = data_files['STAND_ALONE_TRTRESP']
temp  = trt_df[(trt_df['line'] == 1) & (trt_df['trtstdy'] == trt_df['therstdy']) & (trt_df['bestrespsh'].notna())]
bresp = temp[['public_id', 'trtshnm', 'bestrespsh']]
print(bresp)

      public_id        trtshnm bestrespsh
0     MMRF_1014        Bor-Dex         PR
14    MMRF_1017        Len-Dex       VGPR
15    MMRF_1024  Thal-Mel-Pred         SD
19    MMRF_1038        Len-Dex         PR
20    MMRF_1033        Len-Dex         PR
...         ...            ...        ...
3249  MMRF_2843    Car-Len-Dex         PR
3252  MMRF_2847    Car-Len-Dex         PR
3254  MMRF_2848    Car-Len-Dex       VGPR
3256  MMRF_2851    Car-Cyc-Dex       VGPR
3257  MMRF_2853    Car-Len-Dex       VGPR

[1074 rows x 3 columns]


In [39]:
bresp[(bresp['public_id'] == pids[0]) | (bresp['public_id'] == pids[1]) | (bresp['public_id'] == pids[2])]
# bresp[(bresp['bestrespsh'] == 'PD')]

Unnamed: 0,public_id,trtshnm,bestrespsh
172,MMRF_1257,Bor-Cyc-Dex,VGPR
823,MMRF_2144,Bor-Cyc-Dex,VGPR
1619,MMRF_1078,Bor-Len-Dex,CR


In [None]:
bresp['public_id'] == 'MMRF_2144'