In [1]:
import os, sys, glob
import numpy as np
import pandas as pd
from data import load_mmrf

In [None]:
dset = load_mmrf(fold_span = [1], suffix='_2mos')

In [3]:
FDIR  = '/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles'
data_files = {}
for fullname in glob.glob(FDIR+'/*.csv'):
    print (fullname)
    fname = os.path.basename(fullname).split('.')[0]
    if 'MMRF_CoMMpass_IA15_' in fname:
        kname = fname.split('MMRF_CoMMpass_IA15_')[1]
    else:
        kname = fname
    data_files[kname] = pd.read_csv(fullname, delimiter=',', encoding='latin-1')
print (data_files.keys())

/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_EMERGENCY_DEPT.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_PER_PATIENT_VISIT.csv


  interactivity=interactivity, compiler=compiler, result=result)


/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_TRTRESP.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_TREATMENT_REGIMEN.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_AE.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_FAMHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_MEDHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_ADMISSIONS.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_SURVIVAL.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_mye

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
N = dset[1]['train']['pids'].shape[0]
np.random.seed(0)
shuf = np.random.permutation(N)

p_idx= shuf[3:6]
pids = dset[1]['train']['pids'][p_idx]
print ('outcome time: ',dset[1]['train']['ys_seq'][p_idx,0])
print ('censorship: ',dset[1]['train']['ce'][p_idx,0])
print ('pat. id:',pids)

outcome time:  [0 0 0]
censorship:  [0. 0. 0.]
pat. id: ['MMRF_1921' 'MMRF_1099' 'MMRF_1143']


In [5]:
# Reference values for lab measurements
# min/max/scale (calculated by attempting to ensure that max labs lie b/w 5-8)
healthy_mins_max = {
    'cbc_abs_neut':(2., 7.5,1/3.), # abs neutrophil count (3.67, 1.), (2.83, 4.51)
    'chem_albumin':(34, 50,1/8.), # chemical albumin (43.62, 2.77), (41.30, 45.94)
    'chem_bun':(2.5, 7.1,1/5.), #BUN # reference range, (4.8, 1.15)
    'chem_calcium':(2.2, 2.7,2.), #Calcium, (2.45, 0.125)
    'chem_creatinine':(66, 112,1/36.), # creatinine, (83., 24.85), (62.22, 103.77)
    'chem_glucose':(3.9, 6.9,1/5.), # glucose, (4.91, 0.40), (4.58, 5.24)
    'cbc_hemoglobin':(13., 17.,1), # hemoglobin (12.90, 15.64), (8.86, 1.02)
    'chem_ldh':(2.33, 4.67,1/3.), #LDH, (3.5, 0.585)
    'serum_m_protein':(0.1, 1.1, 1), # M protein (<3 g/dL is MGUS, any presence of protein is pathological); am just using the data mean/std for this, (0.85, 1.89)
    'urine_24hr_m_protein':(0.0, 0.1, 1), # Urine M protein 
    'cbc_platelet':(150, 400,1/60.), # platelet count (206.42, 334.57), (270.5, 76.63)
    'chem_totprot':(6, 8,1/6.), # total protein, (7, 0.5)
    'urine_24hr_total_protein':(0, 0.23, 1), # 
    'cbc_wbc':(3, 10,1/4.), # WBC  (5.71, 8.44), (7.07, 1.63)
    'serum_iga':(0.85, 4.99, 1.), # IgA, (2.92, 1.035)
    'serum_igg':(6.10, 16.16,1/10.), # IgG, (11.13, 2.515)
    'serum_igm':(0.35, 2.42,1), #IgM, (1.385, 0.518)
    'serum_lambda':(0.57, 2.63, 1/2.), #serum lambda, (1.6, 0.515)
    'serum_kappa':(.33, 1.94,1/8.), #serum kappa , (1.135, 0.403)
    'serum_beta2_microglobulin':(0.7, 1.80, 1/3.), #serum_beta2_microglobulin,
    'serum_c_reactive_protein':(0.0, 1., 1.) #serum_c_reactive_protein,
}

## Step 1: Sanity check that longitudinal labs have been captured correctly for a few patients
* Plot the patient's cleaned longitudinal data against their raw data in the files

In [6]:
df = data_files['PER_PATIENT_VISIT']
lab_names = ['D_LAB_cbc_abs_neut', 'D_LAB_chem_albumin', 'D_LAB_chem_bun', 'D_LAB_chem_calcium', 'D_LAB_chem_creatinine',
        'D_LAB_chem_glucose', 'D_LAB_cbc_hemoglobin', 'D_LAB_serum_kappa', 'D_LAB_chem_ldh', 'D_LAB_serum_m_protein', 'D_LAB_cbc_platelet',
        'D_LAB_chem_totprot', 'D_LAB_cbc_wbc', 'D_LAB_serum_iga', 'D_LAB_serum_igg', 'D_LAB_serum_igm', 'D_LAB_serum_beta2_microglobulin',
        'D_LAB_serum_lambda', 'D_LAB_urine_24hr_m_protein', 'D_LAB_urine_24hr_total_protein',
        'D_LAB_serum_c_reactive_protein']
df = df[['PUBLIC_ID','VISITDY']+lab_names]
# df = df[df['PUBLIC_ID'].isin(pids.ravel())].reset_index(drop=True)
df.rename(columns = dict([(k,k.split('D_LAB_')[1]) for k in df.columns if 'D_LAB' in k]), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [7]:
IDX= 0
pname= pids[IDX]
pidx = p_idx[IDX]
print ('Patient ',pname,pidx)
print(df[df.PUBLIC_ID == 'MMRF_1078'][['PUBLIC_ID','serum_m_protein']])
p0 = df[df.PUBLIC_ID==pname]
print(p0[['PUBLIC_ID', 'VISITDY', 'serum_m_protein']])
for cval in p0.columns[1:]:
    if cval in healthy_mins_max:
        p0.loc[:,cval] = (p0[cval]-healthy_mins_max[cval][1])*healthy_mins_max[cval][2]
p0.loc[:,'VISITDY'] = p0.VISITDY//30
p0

Patient  MMRF_1921 116
      PUBLIC_ID  serum_m_protein
8757  MMRF_1078              NaN
8758  MMRF_1078              NaN
8759  MMRF_1078             4.27
8760  MMRF_1078             0.17
8761  MMRF_1078             0.08
8762  MMRF_1078             0.00
8763  MMRF_1078             0.00
8764  MMRF_1078             0.00
8765  MMRF_1078             0.00
8766  MMRF_1078             0.00
8767  MMRF_1078              NaN
8768  MMRF_1078             0.00
8769  MMRF_1078              NaN
8770  MMRF_1078             0.00
8771  MMRF_1078             0.00
8772  MMRF_1078             0.00
8773  MMRF_1078              NaN
8774  MMRF_1078              NaN
8775  MMRF_1078              NaN
8776  MMRF_1078              NaN
8777  MMRF_1078              NaN
8778  MMRF_1078             0.00
8779  MMRF_1078              NaN
8780  MMRF_1078             0.00
8781  MMRF_1078             0.00
8782  MMRF_1078             0.00
8783  MMRF_1078             0.00
8784  MMRF_1078             0.13
8785  MMRF_1078     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,PUBLIC_ID,VISITDY,cbc_abs_neut,chem_albumin,chem_bun,chem_calcium,chem_creatinine,chem_glucose,cbc_hemoglobin,serum_kappa,...,chem_totprot,cbc_wbc,serum_iga,serum_igg,serum_igm,serum_beta2_microglobulin,serum_lambda,urine_24hr_m_protein,urine_24hr_total_protein,serum_c_reactive_protein
4172,MMRF_1921,-1.0,,,,,,,,,...,,,,,,,,,,
4173,MMRF_1921,-1.0,-1.233333,-1.375,-0.2776,-0.65,-0.729222,-0.203,-9.126,-0.01925,...,0.35,-0.925,-3.79,2.538,-1.53,0.266667,-0.846,-0.1,-0.23,-0.93
4174,MMRF_1921,3.0,-1.1,-1.75,-0.1348,-0.85,-1.220333,-0.357,-9.498,-0.17975,...,-0.183333,-1.025,-4.48,0.068,-1.81,,-1.275,,,
4175,MMRF_1921,6.0,-1.466667,-1.625,-0.5632,-0.85,-1.244889,-0.291,-9.684,-0.165375,...,-0.183333,-1.45,-4.45,0.08,-1.67,,-1.068,,,
4176,MMRF_1921,8.0,-0.953333,-1.25,-0.706,-0.75,-1.122111,-0.423,-10.614,-0.15625,...,-0.166667,-0.5,-4.01,-0.382,-1.42,,-0.8605,,,
4177,MMRF_1921,11.0,-1.233333,-1.0,-0.5632,-0.7,-0.999333,-0.346,-8.382,-0.1395,...,-0.133333,-1.0,-4.0,-0.388,-1.97,-0.046667,-0.7825,-0.1,-0.1475,
4178,MMRF_1921,14.0,-1.333333,-0.875,-0.6346,-0.65,-0.778333,-0.258,-8.072,-0.110375,...,-0.183333,-0.875,-3.17,-0.546,-1.35,,-0.6975,,,
4179,MMRF_1921,18.0,-1.366667,-0.875,-0.1348,-0.6,-0.925667,-0.225,-8.692,-0.119125,...,-0.133333,-1.225,-3.83,-0.619,-2.07,,-1.025,,,
4180,MMRF_1921,20.0,-0.833333,,-0.1348,-0.8,-1.195778,-0.467,-8.63,-0.091625,...,,-0.7,-3.68,-0.536,-1.98,,-1.016,,,
4181,MMRF_1921,23.0,-1.533333,-1.0,-0.4204,-0.7,-1.097556,-0.236,-8.63,-0.11125,...,-0.15,-1.375,-3.55,-0.477,-1.93,,-0.851,,,


In [8]:
print ('Patient ',dset[1]['train']['pids'][pidx])
for fname, val in zip(dset[1]['train']['feature_names'][3:10],dset[1]['train']['b'][pidx,3:10]):
    print (fname,val)

Patient  MMRF_1921
ecog 0.005377392
serum_beta2_microglobulin -0.18524218
PC1 -0.58499736
PC2 -1.0959584
PC3 -1.7064408
PC4 -0.30424118
PC5 -0.61856306


In [9]:
print ('Patient ',dset[1]['train']['pids'][pidx])
for idx,fname in enumerate(dset[1]['train']['feature_names_x']):
    print (idx, fname,':',)
    X = dset[1]['train']['x'][pidx]
    M = dset[1]['train']['m'][pidx]
    strv = ''
    for t in range(X.shape[0]):
        if M[t,idx] == 1:
            strv +='('+str(t)+','+'%.3f'%X[t,idx]+'), '
    print (strv)

Patient  MMRF_1921
0 cbc_abs_neut :
(0,-1.233), (1,-1.100), (3,-1.467), (4,-0.953), (5,-1.233), (7,-1.333), (9,-1.367), (10,-0.833), (11,-1.533), (13,-1.233), (15,-1.300), (16,-1.333), (18,-1.267), (19,-1.100), (21,-1.133), (22,-1.533), (23,0.067), (24,-0.933), (26,-1.067), 
1 chem_albumin :
(0,-1.375), (1,-1.750), (3,-1.625), (4,-1.250), (5,-1.000), (7,-0.875), (9,-0.875), (11,-1.000), (13,-1.250), (15,-0.500), (16,-1.000), (18,-0.750), (19,-1.250), (21,-1.375), (22,-1.250), (23,-1.125), (24,-1.250), (26,-0.750), 
2 chem_bun :
(0,-0.278), (1,-0.135), (3,-0.563), (4,-0.706), (5,-0.563), (7,-0.635), (9,-0.135), (10,-0.135), (11,-0.420), (13,-0.420), (15,-0.135), (16,-0.349), (18,-0.563), (19,-0.635), (21,0.222), (22,-0.135), (23,-0.706), (24,-0.278), (26,-0.635), 
3 chem_calcium :
(0,-0.650), (1,-0.850), (3,-0.850), (4,-0.750), (5,-0.700), (7,-0.650), (9,-0.600), (10,-0.800), (11,-0.700), (13,-0.750), (15,-0.550), (16,-0.500), (18,-0.800), (19,-0.600), (21,-0.750), (22,-0.800), (23,-1.0

## Step 2: Sanity check that longitudinal lab values have been captured correctly for a few patients
* Plot the patient's longitudinal treatments against their raw data in the files

In [13]:
df = data_files['STAND_ALONE_TRTRESP']
df = df[['public_id','line','trtshnm','trtstdy','trtendy']]
df = df[df['public_id'].isin(pids.ravel())].reset_index(drop=True)
df.loc[:,'trtstdy'] = df.trtstdy//60
df.loc[:,'trtendy'] = df.trtendy//60
df

Unnamed: 0,public_id,line,trtshnm,trtstdy,trtendy
0,MMRF_1099,1,Bor-Len-Dex,0,4
1,MMRF_1099,2,Pom-Dex,5,9
2,MMRF_1921,1,Bor-Len-Dex,0,0
3,MMRF_1921,1,Bor-Len-Cyc-Dex,0,3
4,MMRF_1921,1,Bor-Len-Dex,3,3
5,MMRF_1921,1,Bor-Len-Mel-Dex,3,3
6,MMRF_1921,1,Bor-Len-Dex,3,12
7,MMRF_1921,1,Ixa,12,21
8,MMRF_1921,2,Car-ISATUXIMAB-Dex,22,26
9,MMRF_1143,1,Len-Dex,0,3


In [11]:
IDX= 0
pname= pids[IDX]
pidx = p_idx[IDX]
print ('Patient ',pname,pidx)

Patient  MMRF_1921 116


In [12]:
print ('Patient ',dset[1]['train']['pids'][pidx])
for idx,fname in enumerate(dset[1]['train']['feature_names_a']):
    print (idx, fname,':',)
    X = dset[1]['train']['a'][pidx]
    M = dset[1]['train']['m_a'][pidx]
    strv = ''
    for t in range(X.shape[0]):
        if M[t,idx] == 1:
            strv +='('+str(t)+','+'%.3f'%X[t,idx]+'), '
    print (strv)

Patient  MMRF_1921
0 local_clock :
(0,0.100), (1,0.200), (2,0.300), (3,0.400), (4,0.500), (5,0.600), (6,0.700), (7,0.800), (8,0.900), (9,1.000), (10,1.100), (11,1.200), (12,1.300), (13,1.400), (14,1.500), (15,1.600), (16,1.700), (17,1.800), (18,1.900), (19,2.000), (20,2.100), (21,2.200), (22,0.100), (23,0.200), (24,0.300), (25,0.400), (26,0.500), 
1 Bor :
(0,1.000), (1,1.000), (2,1.000), (3,1.000), (4,1.000), (5,1.000), (6,1.000), (7,1.000), (8,1.000), (9,1.000), (10,1.000), (11,1.000), (12,0.000), (13,0.000), (14,0.000), (15,0.000), (16,0.000), (17,0.000), (18,0.000), (19,0.000), (20,0.000), (21,0.000), (22,0.000), (23,0.000), (24,0.000), (25,0.000), (26,0.000), 
2 Car :
(0,0.000), (1,0.000), (2,0.000), (3,0.000), (4,0.000), (5,0.000), (6,0.000), (7,0.000), (8,0.000), (9,0.000), (10,0.000), (11,0.000), (12,0.000), (13,0.000), (14,0.000), (15,0.000), (16,0.000), (17,0.000), (18,0.000), (19,0.000), (20,0.000), (21,0.000), (22,1.000), (23,1.000), (24,1.000), (25,1.000), (26,1.000), 
3 Cy

IndexError: index 6 is out of bounds for axis 1 with size 6

## Step 3: Sanity check that outcomes have been correctly captured

In [13]:
df = data_files['PER_PATIENT']
df = df[['PUBLIC_ID','D_PT_lstalive', 'D_PT_lastdy', 'D_PT_deathdy']]
df = df[df['PUBLIC_ID'].isin(pids.ravel().tolist())].reset_index(drop=True)
df.loc[:,'D_PT_lastdy'] = df.D_PT_lastdy/30.
df.loc[:,'D_PT_deathdy'] = df.D_PT_deathdy/30.
df.loc[:,'D_PT_lstalive'] = df.D_PT_lstalive/30.
df

Unnamed: 0,PUBLIC_ID,D_PT_lstalive,D_PT_lastdy,D_PT_deathdy
0,MMRF_1016,21.566667,23.066667,23.066667
1,MMRF_1260,35.266667,36.666667,36.666667
2,MMRF_1090,17.533333,22.7,22.7


In [14]:
pids.ravel()

array(['MMRF_1260', 'MMRF_1016', 'MMRF_1090'], dtype=object)

In [15]:
for IDX in range(3):
    pname= pids[IDX]
    pidx = p_idx[IDX]
    print ('Patient ',dset[1]['train']['pids'][pidx])
    print (dset[1]['train']['ys_seq'][pidx])
    print (dset[1]['train']['ce'][pidx])

Patient  MMRF_1260
[36.666666666666664]
[0.0]
Patient  MMRF_1016
[23.066666666666666]
[0.0]
Patient  MMRF_1090
[22.7]
[0.0]


In [16]:
if not os.path.exists('mmrf_csv'):
    os.mkdir('mmrf_csv')
dset = load_mmrf(fold_span = range(5))

loading from: /afs/csail.mit.edu/group/clinicalml/users/rahulgk/ml_mmrf/ml_mmrf_v1/cleaned_mm_fold.pkl


In [5]:
trt_df = data_files['STAND_ALONE_TRTRESP']
temp  = trt_df[(trt_df['line'] == 1) & (trt_df['trtstdy'] == trt_df['therstdy']) & (trt_df['bestrespsh'].notna())]
bresp = temp[['public_id', 'trtshnm', 'bestrespsh']]
print(bresp)

      public_id        trtshnm bestrespsh
0     MMRF_1014        Bor-Dex         PR
14    MMRF_1017        Len-Dex       VGPR
15    MMRF_1024  Thal-Mel-Pred         SD
19    MMRF_1038        Len-Dex         PR
20    MMRF_1033        Len-Dex         PR
...         ...            ...        ...
3249  MMRF_2843    Car-Len-Dex         PR
3252  MMRF_2847    Car-Len-Dex         PR
3254  MMRF_2848    Car-Len-Dex       VGPR
3256  MMRF_2851    Car-Cyc-Dex       VGPR
3257  MMRF_2853    Car-Len-Dex       VGPR

[1074 rows x 3 columns]


In [6]:
bresp[(bresp['public_id'] == pids[0]) | (bresp['public_id'] == pids[1]) | (bresp['public_id'] == pids[2])]
# bresp[(bresp['bestrespsh'] == 'PD')]

Unnamed: 0,public_id,trtshnm,bestrespsh
172,MMRF_1257,Bor-Cyc-Dex,VGPR
823,MMRF_2144,Bor-Cyc-Dex,VGPR
1619,MMRF_1078,Bor-Len-Dex,CR


In [None]:
bresp['public_id'] == 'MMRF_2144'