In [1]:
import os, sys, glob
import pandas as pd
import numpy as np
import pickle
import warnings
from process_mmrf import get_sequential_tensor, merge_on_pids, parse_baseline, parse_outcomes, parse_treatments, parse_labs, parse_trt_outcomes
from fancyimpute import KNN as KNN_impute

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Which CoMMpass release to process; must be one of the keys of _FLATFILE_DIRS.
ia_version = 'ia15'

# Flat-file directory for each supported release.
_FLATFILE_DIRS = {
    'ia13': '/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia13/CoMMpass_IA13_FlatFiles',
    'ia15': '/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles',
}

# Fail fast on an unsupported release instead of continuing without FDIR.
if ia_version not in _FLATFILE_DIRS:
    raise ValueError('Bad ia version')
FDIR = _FLATFILE_DIRS[ia_version]

In [3]:
# Load every CSV flat file under FDIR into a dict of DataFrames.
# Keys are the file stem with the release prefix removed, e.g.
# 'MMRF_CoMMpass_IA15_PER_PATIENT_VISIT.csv' -> 'PER_PATIENT_VISIT'.
#
# The prefix is derived from `ia_version` rather than hard-coded to IA15, so
# lookups such as data_files['PER_PATIENT_VISIT'] also resolve for 'ia13'.
# NOTE(review): assumes IA13 files are named 'MMRF_CoMMpass_IA13_*.csv' — confirm.
file_prefix = 'MMRF_CoMMpass_%s_' % ia_version.upper()

data_files = {}
for fullname in glob.glob(FDIR+'/*.csv'):
    print (fullname)
    fname = os.path.basename(fullname).split('.')[0]
    # Files without the release prefix keep their full stem as the key.
    if fname.startswith(file_prefix):
        kname = fname[len(file_prefix):]
    else:
        kname = fname
    data_files[kname] = pd.read_csv(fullname, delimiter=',', encoding='latin-1')
print (data_files.keys())

/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_EMERGENCY_DEPT.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_PER_PATIENT_VISIT.csv


  interactivity=interactivity, compiler=compiler, result=result)


/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_TRTRESP.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_TREATMENT_REGIMEN.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_AE.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_FAMHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_MEDHX.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_ADMISSIONS.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_myeloma/ia15/CoMMpass_IA15_FlatFiles/MMRF_CoMMpass_IA15_STAND_ALONE_SURVIVAL.csv
/afs/csail.mit.edu/group/clinicalml/datasets/multiple_mye

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Containers for the parsed tensors: one at the default time resolution and one
# for the `_2mos` variant. Each section starts as its own empty dict and is
# filled in by the parsing cells below.
_SECTIONS = ('treatment', 'labs', 'baseline', 'outcomes', 'trt_outcomes')

dataset      = {section: {} for section in _SECTIONS}
dataset_2mos = {section: {} for section in _SECTIONS}

In [5]:
# # df_ppv        = data_files['PER_PATIENT_VISIT']
# # baseline_labs = ['D_LAB_cbc_abs_neut', 'D_LAB_chem_albumin', 'D_LAB_chem_bun', 'D_LAB_chem_calcium', 'D_LAB_chem_creatinine',
# #         'D_LAB_chem_glucose', 'D_LAB_cbc_hemoglobin', 'D_LAB_serum_kappa', 'D_LAB_serum_m_protein', 'D_LAB_cbc_platelet',
# #         'D_LAB_chem_totprot', 'D_LAB_cbc_wbc', 'D_LAB_serum_iga', 'D_LAB_serum_igg', 'D_LAB_serum_igm', 'D_LAB_serum_beta2_microglobulin',
# #         'D_LAB_serum_lambda', '']

# trt_df = data_files['STAND_ALONE_TRTRESP']
# # temp  = trt_df[(trt_df['line'] == 2) & (trt_df['trtstdy'] == trt_df['therstdy']) & (trt_df['bestrespsh'].notna())]
# temp  = trt_df[(trt_df['line'] == 2) & (trt_df['trtstdy'] == trt_df['therstdy'])]
# # bresp = temp[['public_id', 'trtshnm', 'bestrespsh']]
# # print(bresp)
# print(temp)

# Exploratory look at the per-visit serum/urine labs and the `AT_*` checkbox
# columns of PER_PATIENT_VISIT.
df_pp = data_files['PER_PATIENT_VISIT']
lab_names = ['D_LAB_serum_kappa', 'D_LAB_serum_m_protein', 'D_LAB_serum_iga',
             'D_LAB_serum_igg', 'D_LAB_serum_igm','D_LAB_serum_lambda', 'D_LAB_urine_24hr_m_protein']
pd_names = ['AT_SERUMMCOMPONE', 'AT_URINEMCOMPONE', 'AT_ONLYINPATIENT', 'AT_ONLYINPATIENT2', 'AT_DEVELOPMENTOF']

# Short column names: drop the 'D_LAB_' / 'AT_' prefixes.
short_names = dict([(k, k.split('D_LAB_')[1]) for k in lab_names])
short_names.update(dict([(k, k.split('AT_')[1]) for k in pd_names]))

# Work on an explicit copy so the assignments below modify `df` itself and never
# a temporary (the original in-place edits on a slice raised SettingWithCopyWarning).
df = df_pp[['PUBLIC_ID','VISIT','VISITDY']+lab_names+pd_names].copy()
df = df.rename(columns = short_names)
lab_names = np.array([short_names[k] for k in lab_names])
pd_names  = np.array([short_names[k] for k in pd_names])

# truncate based on the median: per-lab ceiling of 5*(1+median)
medians = df[lab_names].median(0)
maxval  = (5*(1+medians))
clipped = df[lab_names].clip(upper = maxval, axis=1)
print ('parse_labs: clipped values to 5x median (before/after)\n',pd.concat([df[lab_names].max(0), clipped[lab_names].max(0)],axis=1))
df.loc[:,lab_names] = clipped

# Encode the checkbox columns: 'Checked' -> 1, missing -> 0; any other value is
# left untouched. A single .loc call replaces the chained df[col][mask] = v.
for pd_name in pd_names:
    print(pd_name)
    df.loc[df[pd_name] == 'Checked', pd_name] = 1
    df.loc[df[pd_name].isna(), pd_name] = 0
print(df)
# Rows where at least one checkbox column is set.
print(df[(df[pd_names] == 1).any(axis=1)])

parse_labs: clipped values to 5x median (before/after)
                               0      1
serum_kappa           455000.00  14.66
serum_m_protein           40.30   5.65
serum_iga                276.63   9.00
serum_igg                147.00  49.05
serum_igm                 84.20   6.40
serum_lambda           46200.00  11.35
urine_24hr_m_protein     136.00   5.00
SERUMMCOMPONE
URINEMCOMPONE
ONLYINPATIENT
ONLYINPATIENT2
DEVELOPMENTOF
       PUBLIC_ID  VISIT  VISITDY  serum_kappa  serum_m_protein  serum_iga  \
0      MMRF_1014   -1.0    -10.0          NaN              NaN        NaN   
1      MMRF_1014    0.0    -10.0        14.66              2.2        9.0   
2      MMRF_1014    1.0     86.0          NaN              NaN        9.0   
3      MMRF_1014    2.0    173.0          NaN              1.4        9.0   
4      MMRF_1014    3.0    302.0          NaN              1.4        9.0   
...          ...    ...      ...          ...              ...        ...   
16167  MMRF_2853    2.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, v)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)
A value is trying to be set on a copy of a 

In [5]:
# Build one row of baseline lab values per patient from PER_PATIENT_VISIT.
df_ppv        = data_files['PER_PATIENT_VISIT']
baseline_labs = ['D_LAB_cbc_abs_neut', 'D_LAB_chem_albumin', 'D_LAB_chem_bun', 'D_LAB_chem_calcium', 'D_LAB_chem_creatinine',
        'D_LAB_chem_glucose', 'D_LAB_cbc_hemoglobin', 'D_LAB_serum_kappa', 'D_LAB_serum_m_protein', 'D_LAB_cbc_platelet',
        'D_LAB_chem_totprot', 'D_LAB_cbc_wbc', 'D_LAB_serum_iga', 'D_LAB_serum_igg', 'D_LAB_serum_igm', 'D_LAB_serum_beta2_microglobulin',
        'D_LAB_serum_lambda']

# Drop the 'D_LAB_' prefix. rename() returns a new frame, which avoids the
# SettingWithCopyWarning caused by renaming a column slice in place.
bl_short = dict([(k, k.split('D_LAB_')[1]) for k in baseline_labs])
df_bl = df_ppv[['PUBLIC_ID','VISIT','VISITDY']+baseline_labs].rename(columns = bl_short)
baseline_labs = np.array([bl_short[k] for k in baseline_labs])

# Keep only rows with VISIT == 0 and average them per patient; the result is
# indexed by PUBLIC_ID.
df_bl = df_bl[df_bl.VISIT==0].reset_index(drop=True)
df_bl = df_bl.groupby('PUBLIC_ID').mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
# add heavy chain/light chain feature, and specific myeloma type vector
serum_labs = ['serum_m_protein', 'serum_iga', 'serum_igg', 'serum_igm', 'serum_lambda', 'serum_kappa']
# Copy first: df_test is a column subset of df_bl, and adding/modifying columns
# on the un-copied subset raises SettingWithCopyWarning.
df_test = df_bl[serum_labs].copy()

# NOTE(review): the factor of 100 is presumably a unit conversion for the
# immunoglobulin columns — confirm against the data dictionary.
for ig_col in ('serum_igg', 'serum_iga', 'serum_igm'):
    df_test[ig_col] = df_test[ig_col] * 100.
df_test['kl_ratio']  = df_test['serum_kappa'] / df_test['serum_lambda']

# 1 if heavy chain, 0 if light chain, NaN where serum_m_protein is missing.
# Built as a float column directly (instead of writing NaN into a bool Series,
# which forces an implicit dtype upcast; `np.NaN` is also gone in NumPy 2.0).
m_protein = df_test['serum_m_protein']
df_test['heavy_chain'] = (m_protein > 0.5).astype(float).where(m_protein.notna())

print((df_test['heavy_chain'] == 1.).sum())
print((df_test['heavy_chain'] == 0.).sum())

914
187


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

In [7]:
# Clip each baseline lab at a per-column ceiling of BASELINE_CLIP_FACTOR*(1+median).
# The printed message used to say "5x median" although the factor applied here is 15.
BASELINE_CLIP_FACTOR = 15

medians = df_bl[baseline_labs].median(0)
maxval  = (BASELINE_CLIP_FACTOR*(1+medians))
clipped = df_bl[baseline_labs].clip(upper = maxval, axis=1)
# Column 0 = max before clipping, column 1 = max after clipping.
print ('parse_labs: clipped values to {}*(1+median) (before/after)\n'.format(BASELINE_CLIP_FACTOR),
       pd.concat([df_bl[baseline_labs].max(0), clipped[baseline_labs].max(0)],axis=1))
df_bl.loc[:,baseline_labs] = clipped

parse_labs: clipped values to 5x median (before/after)
                                     0            1
cbc_abs_neut                   16.512    16.512000
chem_albumin                   54.000    54.000000
chem_bun                       52.122    52.122000
chem_calcium                    4.250     4.250000
chem_creatinine              1986.348  1354.259987
chem_glucose                   16.940    16.940000
cbc_hemoglobin                  9.982     9.982000
serum_kappa                455000.000   166.500000
serum_m_protein                12.270    12.270000
cbc_platelet                  668.000   668.000000
chem_totprot                   17.100    17.100000
cbc_wbc                        34.600    34.600000
serum_iga                     276.630    21.450000
serum_igg                     147.000   147.000000
serum_igm                      84.200    18.000000
serum_beta2_microglobulin      37.900    37.900000
serum_lambda                46200.000    32.490000


## Treatments

In [5]:
# Parse the treatment table twice: once with parse_treatments' defaults (stored
# in `dataset`) and once with granularity=60, maxT=33 (stored in `dataset_2mos`).
for trt_target, trt_kwargs in ((dataset, {}),
                               (dataset_2mos, {'granularity': 60, 'maxT': 33})):
    tpids, tdata, tobs, tnames = parse_treatments(data_files['STAND_ALONE_TRTRESP'], **trt_kwargs)
    trt_target['treatment'].update(pids=tpids, data=tdata, obs=tobs, names=tnames)

parse_treatments: treatments:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len']
parse_treatments: adding line of therapy:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len' 'line']
parse_treatments:processing... 0 Bor
	tget_sequential_tensor: feature name/values: Bor [1 0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 1 Car
	tget_sequential_tensor: feature name/values: Car [0 1]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 2 Cyc
	tget_sequential_tensor: feature name/values: Cyc [0 1]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 3 Dex
	tget_sequential_tensor: feature name/values: Dex [1 0]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 4 Len
	tget_sequential_tensor: feature name/values: Len [0 1]




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
parse_treatments:processing... 5 line
	tget_sequential_tensor: feature name/values: line [1 2 3 4 5 9]
	tget_sequential_tensor: did not hit second line for MMRF_1007




	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)
merge_on_pids: intersection of patient ids is 1143
merge_on_pids: after merging, pat_ids, data, obs: 1143 (1143, 66, 6) (1143, 66, 6)
parse_treatments: 1143 (1143, 66, 6) (1143, 66, 6)
parse_treatments: treatments:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len']
parse_treatments: adding line of therapy:  ['Bor' 'Car' 'Cyc' 'Dex' 'Len' 'line']
parse_treatments:processing... 0 Bor
	tget_sequential_tensor: feature name/values: Bor [1 0]
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)
parse_treatments:processing... 1 Car
	tget_sequential_tensor: feature name/values: Car [0 1]
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)
parse_treatments:processing... 2 Cyc
	tget_sequential_tensor: feature name/values: Cyc [0 1]
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)
parse_treatments:processing... 3 Dex
	tget_sequential_tensor: feature name/values: Dex [1 0]
	tget_sequential_t

## Lab Values

In [6]:
# Parse the per-visit labs twice: parse_labs defaults go into `dataset`; the
# `dataset_2mos` variant uses granularity=60, maxT=33 and switches off the
# kappa/lambda-ratio and `pd` features.
# NOTE(review): only the 2mos variant disables add_kl_ratio/add_pd_feats —
# confirm the asymmetry is intended.
for lab_target, lab_kwargs in ((dataset, {}),
                               (dataset_2mos, {'granularity': 60, 'maxT': 33,
                                               'add_kl_ratio': False, 'add_pd_feats': False})):
    lpids, ldata, lobs, lnames = parse_labs(data_files['PER_PATIENT_VISIT'], **lab_kwargs)
    lab_target['labs'].update(pids=lpids, data=ldata, obs=lobs, names=lnames)

parse_labs: clipped values to 5x median (before/after)
                           0         1
cbc_abs_neut         75.000   19.5000
chem_albumin         55.000   55.0000
chem_bun             52.122   33.5600
chem_calcium         10.300   10.3000
chem_creatinine    1986.348  433.7400
chem_glucose         36.245   32.7750
cbc_hemoglobin       11.656   11.6560
serum_kappa      455000.000   14.6500
serum_m_protein      40.300    5.5000
cbc_platelet        732.000  732.0000
chem_totprot         17.100   17.1000
cbc_wbc              81.000   29.5000
serum_iga           276.630    9.1500
serum_igg           147.000   48.9000
serum_igm            84.200    6.3875
serum_lambda      46200.000   11.4500

parse_labs:processing... 0 cbc_abs_neut


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, v)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pyd

	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 1 chem_albumin
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 2 chem_bun
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 3 chem_calcium
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 4 chem_creatinine
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 5 chem_glucose
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 6 cbc_hemoglobin
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 7 serum_kappa
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 8 serum_m_protein
	tget_sequential_tensor: output shapes: (1143,) (1143, 66) (1143, 66)

parse_labs:processing... 9 cbc_platele

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, v)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pyd

	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 1 chem_albumin
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 2 chem_bun
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 3 chem_calcium
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 4 chem_creatinine
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 5 chem_glucose
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 6 cbc_hemoglobin
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 7 serum_kappa
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 8 serum_m_protein
	tget_sequential_tensor: output shapes: (1143,) (1143, 33) (1143, 33)

parse_labs:processing... 9 cbc_platele

## Baseline Features

In [7]:
# Baseline features for both datasets. Every baseline entry is marked as
# observed (obs is an all-ones array shaped like the data).
# NOTE(review): both passes call parse_baseline with identical arguments; they
# are kept separate so each dataset owns its own result objects.
for bl_target in (dataset, dataset_2mos):
    bpids, bdata, bnames = parse_baseline(data_files['PER_PATIENT'], data_files['PER_PATIENT_VISIT'], ia_version = ia_version)
    bl_target['baseline'].update(pids=bpids, data=bdata, obs=np.ones_like(bdata), names=bnames)

parse_baselines: clipped values to 5x median (before/after)
                               0     1
serum_beta2_microglobulin  37.9  22.4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


parse_baselines: do mean imputation on missing data in baseline
parse_baselines: doing knn(k=5) imputation for missing genomic data
Imputing row 1/1143 with 5 missing, elapsed time: 0.282
Imputing row 101/1143 with 0 missing, elapsed time: 0.290
Imputing row 201/1143 with 0 missing, elapsed time: 0.296
Imputing row 301/1143 with 5 missing, elapsed time: 0.299
Imputing row 401/1143 with 0 missing, elapsed time: 0.303
Imputing row 501/1143 with 0 missing, elapsed time: 0.307
Imputing row 601/1143 with 0 missing, elapsed time: 0.310
Imputing row 701/1143 with 5 missing, elapsed time: 0.315
Imputing row 801/1143 with 0 missing, elapsed time: 0.319
Imputing row 901/1143 with 5 missing, elapsed time: 0.323
Imputing row 1001/1143 with 5 missing, elapsed time: 0.325
Imputing row 1101/1143 with 5 missing, elapsed time: 0.328
parse_baselines: result (1143, 17)
parse_baselines: clipped values to 5x median (before/after)
                               0     1
serum_beta2_microglobulin  37.9  22.4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


parse_baselines: do mean imputation on missing data in baseline
parse_baselines: doing knn(k=5) imputation for missing genomic data
Imputing row 1/1143 with 5 missing, elapsed time: 0.191
Imputing row 101/1143 with 0 missing, elapsed time: 0.197
Imputing row 201/1143 with 0 missing, elapsed time: 0.201
Imputing row 301/1143 with 5 missing, elapsed time: 0.203
Imputing row 401/1143 with 0 missing, elapsed time: 0.206
Imputing row 501/1143 with 0 missing, elapsed time: 0.209
Imputing row 601/1143 with 0 missing, elapsed time: 0.212
Imputing row 701/1143 with 5 missing, elapsed time: 0.215
Imputing row 801/1143 with 0 missing, elapsed time: 0.218
Imputing row 901/1143 with 5 missing, elapsed time: 0.221
Imputing row 1001/1143 with 5 missing, elapsed time: 0.223
Imputing row 1101/1143 with 5 missing, elapsed time: 0.226
parse_baselines: result (1143, 17)


## Outcomes
* We will use time-to-death as the outcome.

In [8]:
# Mortality outcome: parse_outcomes defaults for `dataset`, granularity=60 for
# `dataset_2mos`.
for out_target, out_kwargs in ((dataset, {}), (dataset_2mos, {'granularity': 60})):
    ypid, Y, E = parse_outcomes(data_files['PER_PATIENT'], **out_kwargs)
    out_target['outcomes'].update(pids=ypid, data=Y, obs=E, names=np.array(['mortality']))

# Treatment-response outcome for line=2. It is parsed once and the same
# objects are stored in both datasets. `E` is rebound here; the mortality
# values were already stored above.
ypid_trt, Ytrt, E, tr_names = parse_trt_outcomes(data_files['STAND_ALONE_TRTRESP'], data_files['PER_PATIENT_VISIT'], line=2)
trt_outcome_fields = {'pids': ypid_trt, 'data': Ytrt, 'obs': E, 'names': tr_names}
for out_target in (dataset, dataset_2mos):
    out_target['trt_outcomes'].update(trt_outcome_fields)
print(len(Ytrt))


parse_outcomes:  (1001,) (1001,) (1001,)
parse_outcomes:  (1001,) (1001,) (1001,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


MMRF_1890 not in lists even though they have line 2 info.
379


## Save raw tensor dataset

In [9]:
# Pickle both raw tensor datasets into the current working directory.
for pkl_name, payload in (('1_mmrf_dataset_type.pkl', dataset),
                          ('1_mmrf_dataset_2mos_type.pkl', dataset_2mos)):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)

In [10]:
# print(len(full.tolist()))
# Sanity check on the treatment-response outcome: total length, then how many
# entries are equal to 0.
n_trt_outcomes = len(Ytrt)
n_trt_zero = np.count_nonzero(Ytrt == 0)
print(n_trt_outcomes)
print(n_trt_zero)


379
200
