In [1]:
import os, sys, glob
import pandas as pd
import numpy as np
import pickle
import warnings
from process_mmrf import get_sequential_tensor, merge_on_pids, parse_baseline, parse_outcomes, parse_treatments, parse_labs, parse_trt_outcomes
from fancyimpute import KNN as KNN_impute

Using TensorFlow backend.


In [2]:
ia_version = 'ia13'
if ia_version == 'ia13':
    FDIR  = '/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles'
else:
    raise ValueError('Bad ia version')

In [3]:
data_files = {}
for fullname in glob.glob(FDIR+'/*.csv'):
    print (fullname)
    fname = os.path.basename(fullname).split('.')[0]
    if 'MMRF_CoMMpass_IA13_' in fname:
        kname = fname.split('MMRF_CoMMpass_IA13_')[1]
    else:
        kname = fname
    data_files[kname] = pd.read_csv(fullname, delimiter=',', encoding='latin-1')
print (data_files.keys())

/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_PER_PATIENT_VISIT.csv


  interactivity=interactivity, compiler=compiler, result=result)


/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_MEDHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_SURVIVAL.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_EMERGENCY_DEPT.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_PER_PATIENT.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_TRTRESP.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_AE.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles/MMRF_CoMMpass_IA13_STAND_ALONE_FAMHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMM

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
dataset = {}
dataset['treatment'] = {}
dataset['labs']      = {}
dataset['baseline']  = {}
dataset['outcomes']  = {}
dataset['trt_outcomes'] = {}

dataset_2mos = {}
dataset_2mos['treatment'] = {}
dataset_2mos['labs']      = {}
dataset_2mos['baseline']  = {}
dataset_2mos['outcomes']  = {}
dataset_2mos['trt_outcomes'] = {}

In [5]:
df_ppv        = data_files['PER_PATIENT_VISIT']
baseline_labs = ['D_LAB_cbc_abs_neut', 'D_LAB_chem_albumin', 'D_LAB_chem_bun', 'D_LAB_chem_calcium', 'D_LAB_chem_creatinine',
        'D_LAB_chem_glucose', 'D_LAB_cbc_hemoglobin', 'D_LAB_serum_kappa', 'D_LAB_serum_m_protein', 'D_LAB_cbc_platelet',
        'D_LAB_chem_totprot', 'D_LAB_cbc_wbc', 'D_LAB_serum_iga', 'D_LAB_serum_igg', 'D_LAB_serum_igm', 'D_LAB_serum_beta2_microglobulin',
        'D_LAB_serum_lambda']
df_bl = df_ppv[['PUBLIC_ID','VISIT','VISITDY']+baseline_labs]
df_bl.rename(columns = dict([(k,k.split('D_LAB_')[1]) for k in baseline_labs]), inplace=True)
baseline_labs = np.array([k.split('D_LAB_')[1] for k in baseline_labs])

df_bl = df_bl[df_bl.VISIT==0].reset_index(drop=True)
df_bl = df_bl.groupby('PUBLIC_ID').mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [6]:
# add heavy chain/light chain feature, and specific myeloma type vector 
serum_labs = ['serum_m_protein', 'serum_iga', 'serum_igg', 'serum_igm', 'serum_lambda', 'serum_kappa']
df_test = df_bl[serum_labs]
df_test['serum_igg'] = df_test['serum_igg'] * 100.
df_test['serum_iga'] = df_test['serum_iga'] * 100.
df_test['serum_igm'] = df_test['serum_igm'] * 100. 
df_test['kl_ratio']  = df_test['serum_kappa'] / df_test['serum_lambda'] 

# 1 if heavy chain, 0 if light chain 
new_df = df_test['serum_m_protein']>0.5
new_df[df_test['serum_m_protein'].isnull()] = np.NaN
df_test['heavy_chain'] = new_df

print((df_test['heavy_chain'] == 1.).sum())
print((df_test['heavy_chain'] == 0.).sum())

914
188


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

In [7]:
medians = df_bl[baseline_labs].median(0)
maxval  = (5*(1+medians))
clipped = df_bl[baseline_labs].clip(upper = maxval, axis=1)
print ('parse_labs: clipped values to 5x median (before/after)\n',pd.concat([df_bl[baseline_labs].max(0), clipped[baseline_labs].max(0)],axis=1))
df_bl.loc[:,baseline_labs] = clipped

parse_labs: clipped values to 5x median (before/after)
                                     0           1
cbc_abs_neut                   16.512   16.512000
chem_albumin                   54.000   54.000000
chem_bun                       52.122   38.915000
chem_calcium                    4.250    4.250000
chem_creatinine              1986.348  451.419996
chem_glucose                   16.940   16.940000
cbc_hemoglobin                  9.982    9.982000
serum_kappa                455000.000   55.500000
serum_m_protein                12.270   12.270000
cbc_platelet                  668.000  668.000000
chem_totprot                   17.100   17.100000
cbc_wbc                        34.600   34.500000
serum_iga                     276.630    7.150000
serum_igg                     147.000  130.175000
serum_igm                      84.200    6.000000
serum_beta2_microglobulin      37.900   22.400000
serum_lambda                46200.000   10.830000


## Treatments

In [8]:
tpids, tdata, tobs, tnames = parse_treatments(data_files['STAND_ALONE_TRTRESP'])
dataset['treatment']['pids'] = tpids; dataset['treatment']['data'] = tdata
dataset['treatment']['obs']  = tobs;  dataset['treatment']['names'] = tnames

tpids, tdata, tobs, tnames = parse_treatments(data_files['STAND_ALONE_TRTRESP'], granularity = 60, maxT=33)
dataset_2mos['treatment']['pids'] = tpids; dataset_2mos['treatment']['data'] = tdata
dataset_2mos['treatment']['obs']  = tobs;  dataset_2mos['treatment']['names'] = tnames

parse_treatments: treatments:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len']
parse_treatments: adding line of therapy:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len' 'line']
parse_treatments:processing... 0 Bor
	tget_sequential_tensor: feature name/values: Bor [1 0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 1 Car
	tget_sequential_tensor: feature name/values: Car [0 1]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 2 Cyc
	tget_sequential_tensor: feature name/values: Cyc [0 1]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 3 Dex
	tget_sequential_tensor: feature name/values: Dex [1 0]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 4 Len
	tget_sequential_tensor: feature name/values: Len [0 1]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 5 line
	tget_sequential_tensor: feature name/values: line [1 2 3 4 5 9]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
merge_on_pids: intersection of patient ids is 1143
merge_on_pids: after merging, pat_ids, data, obs: 1143 (1143, 66, 6) (1143, 66, 6)
parse_treatments: 1143 (1143, 66, 6) (1143, 66, 6)
parse_treatments: treatments:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len']
parse_treatments: adding line of therapy:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len' 'line']
parse_treatments:processing... 0 Bor
	tget_sequential_tensor: feature name/values: Bor [1 0]
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)
parse_treatments:processing... 1 Car
	tget_sequential_tensor: feature name/values: Car [0 1]
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)
parse_treatments:processing... 2 Cyc
	tget_sequential_tensor: feature name/values: Cyc [0 1]
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)
parse_treatments:processing... 3 Dex
	tget_sequential_tensor: feature name/values: Dex [1 0]
	tget_sequential_t

## Lab Values

In [9]:
lpids, ldata, lobs, lnames = parse_labs(data_files['PER_PATIENT_VISIT'])
dataset['labs']['pids'] = lpids; dataset['labs']['data']  = ldata
dataset['labs']['obs']  = lobs;  dataset['labs']['names'] = lnames

lpids, ldata, lobs, lnames = parse_labs(data_files['PER_PATIENT_VISIT'], granularity = 60, maxT=33)
dataset_2mos['labs']['pids'] = lpids; dataset_2mos['labs']['data']  = ldata
dataset_2mos['labs']['obs']  = lobs;  dataset_2mos['labs']['names'] = lnames

parse_labs: clipped values to 5x median (before/after)
                           0        1
cbc_abs_neut         69.900   19.500
chem_albumin        140.000  140.000
chem_bun             52.122   33.560
chem_calcium          4.250    4.250
chem_creatinine    1986.348  429.320
chem_glucose         36.245   32.775
cbc_hemoglobin       11.656   11.656
serum_kappa      455000.000   14.660
serum_m_protein      40.300    5.650
cbc_platelet        732.000  732.000
chem_totprot         17.100   17.100
cbc_wbc              81.000   29.500
serum_iga           276.630    9.000
serum_igg           147.000   49.050
serum_igm            84.200    6.400
serum_lambda      46200.000   11.350

parse_labs:processing... 0 cbc_abs_neut
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 1 chem_albumin
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 2 chem_bun
	tget_sequential_tensor: output shapes: (1143,) (1143, 6

## Baseline Features

In [10]:
bpids, bdata, bnames = parse_baseline(data_files['PER_PATIENT'], data_files['PER_PATIENT_VISIT'], ia_version = ia_version)
dataset['baseline']['pids']  = bpids
dataset['baseline']['data']  = bdata
dataset['baseline']['obs']   = np.ones_like(bdata)
dataset['baseline']['names'] = bnames

bpids, bdata, bnames = parse_baseline(data_files['PER_PATIENT'], data_files['PER_PATIENT_VISIT'], ia_version = ia_version)
dataset_2mos['baseline']['pids']  = bpids
dataset_2mos['baseline']['data']  = bdata
dataset_2mos['baseline']['obs']   = np.ones_like(bdata)
dataset_2mos['baseline']['names'] = bnames

parse_baselines: clipped values to 5x median (before/after)
                               0     1
serum_beta2_microglobulin  37.9  22.4
parse_baselines: do mean imputation on missing data in baseline
parse_baselines: doing knn(k=5) imputation for missing genomic data
Imputing row 1/1143 with 5 missing, elapsed time: 0.332
Imputing row 101/1143 with 0 missing, elapsed time: 0.340
Imputing row 201/1143 with 0 missing, elapsed time: 0.346
Imputing row 301/1143 with 5 missing, elapsed time: 0.349
Imputing row 401/1143 with 0 missing, elapsed time: 0.352
Imputing row 501/1143 with 0 missing, elapsed time: 0.356
Imputing row 601/1143 with 0 missing, elapsed time: 0.359
Imputing row 701/1143 with 5 missing, elapsed time: 0.363
Imputing row 801/1143 with 0 missing, elapsed time: 0.367
Imputing row 901/1143 with 5 missing, elapsed time: 0.370
Imputing row 1001/1143 with 5 missing, elapsed time: 0.373
Imputing row 1101/1143 with 5 missing, elapsed time: 0.376
parse_baselines: result (1143, 17)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Imputing row 1/1143 with 5 missing, elapsed time: 0.290
Imputing row 101/1143 with 0 missing, elapsed time: 0.298
Imputing row 201/1143 with 0 missing, elapsed time: 0.306
Imputing row 301/1143 with 5 missing, elapsed time: 0.310
Imputing row 401/1143 with 0 missing, elapsed time: 0.316
Imputing row 501/1143 with 0 missing, elapsed time: 0.321
Imputing row 601/1143 with 0 missing, elapsed time: 0.326
Imputing row 701/1143 with 5 missing, elapsed time: 0.332
Imputing row 801/1143 with 0 missing, elapsed time: 0.338
Imputing row 901/1143 with 5 missing, elapsed time: 0.342
Imputing row 1001/1143 with 5 missing, elapsed time: 0.345
Imputing row 1101/1143 with 5 missing, elapsed time: 0.348
parse_baselines: result (1143, 17)


## Outcomes
* We will use time-to-death as outcome

In [11]:
ypid, Y, E = parse_outcomes(data_files['PER_PATIENT'])
dataset['outcomes']['pids'] = ypid
dataset['outcomes']['data'] = Y
dataset['outcomes']['obs']  = E
dataset['outcomes']['names']  = np.array(['mortality'])

ypid, Y, E = parse_outcomes(data_files['PER_PATIENT'], granularity = 60)
dataset_2mos['outcomes']['pids'] = ypid
dataset_2mos['outcomes']['data'] = Y
dataset_2mos['outcomes']['obs']  = E
dataset_2mos['outcomes']['names']  = np.array(['mortality'])

ypid_trt, Ytrt, tr_names = parse_trt_outcomes(data_files['STAND_ALONE_TRTRESP'])
dataset['trt_outcomes']['pids'] = ypid_trt
dataset['trt_outcomes']['data'] = Ytrt 
dataset['trt_outcomes']['obs']  = np.ones_like(Ytrt)
dataset['trt_outcomes']['names'] = tr_names

dataset_2mos['trt_outcomes']['pids'] = ypid_trt
dataset_2mos['trt_outcomes']['data'] = Ytrt 
dataset_2mos['trt_outcomes']['obs']  = np.ones_like(Ytrt)
dataset_2mos['trt_outcomes']['names'] = tr_names
print(Ytrt)

parse_outcomes:  (1001,) (1001,) (1001,)
parse_outcomes:  (1001,) (1001,) (1001,)
parse_outcomes:  (1074,) (1074,)
[0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0]


## Save raw tensor dataset

In [13]:
with open('1_mmrf_dataset_type.pkl','wb') as f:
    pickle.dump(dataset, f)
with open('1_mmrf_dataset_2mos_type.pkl','wb') as f:
    pickle.dump(dataset_2mos, f)