# met_site_data_creation_impact_scaled.py

This script will create cBioPortal timeline files for the `Diagnosis` tracks (Primary, Lymph Node, and Metastasis)

The class `cBioPortalDiagnosisTimeline` will leverage the organ site mapping that is used in the MSK-MET paper (20 sites of disease)




## Load Libraries

In [1]:
import sys  
sys.path.insert(0, '../mappings')
sys.path.insert(0, '../analysis')
sys.path.insert(0, '../')
sys.path.insert(0, '/mind_data/fongc2/cdm-utilities/minio_api/')
sys.path.insert(0, '/mind_data/fongc2/cdm-utilities/')
sys.path.insert(0, '/mind_data/fongc2/diagnosis_event_abstraction_icd/cbioportal/')


import os
import pandas as pd
from minio_api import MinioAPI
import constants_o_sites as const
from organ_mapping_analysis import OrganMappingAnalysisRND
from organ_mapping_rdn_processing import MetastaticSpreadMappingRND
from dx_cbioportal_timeline import cBioPortalDiagnosisTimeline


from dotenv import load_dotenv, find_dotenv, dotenv_values
from utils import set_debug_console, print_df_without_index, drop_cols, mrn_zero_pad, convert_to_int, read_minio_api_config


In [2]:
set_debug_console()

## Test object
This section exercises the `cBioPortalDiagnosisTimeline` class, which packages every step shown from the `Minio Setup` section onward.

In [11]:
# Filenames (Minio objects) required to create "Diagnosis" timeline files for cBioPortal
# Local file with the Minio connection settings (passed below as fname_minio_config).
fname_minio_env = '/mind_data/fongc2/cdm-utilities/minio_env.txt'
# cBioPortal sample/patient ID table.
fname_cbio = 'cbioportal/mskimpact_ids.tsv'
# Cleaned diagnosis table (billing + tumor registry rows).
fname_dx = 'diagnosis/table_diagnosis_clean.csv'
# Demographics table.
fname_demo = 'demographics/ddp_demographics.tsv'
# IMPACT sample summary table.
fname_impact_map = 'summary/IMPACT_Darwin_Sample_Summary.tsv'
# Radiology report predictions.
fname_rad_pred = 'radiology/aacr_predictions.csv'
# Placeholder: no column configuration is supplied yet.
config_cols = None

TODO: decide what `config_cols` should contain. The class currently hard-codes many column names and values that would probably be better kept in a shared configuration file.

In [4]:
# Build the timeline helper from the Minio config file, the organ-site constants module
# and the Minio object names defined above.
obj_dx_timeline = cBioPortalDiagnosisTimeline(fname_minio_config=fname_minio_env, 
                                              config_cols=config_cols, 
                                              const_organ_map=const, 
                                              fname_impact_map=fname_impact_map, 
                                              fname_demo=fname_demo, 
                                              fname_dx=fname_dx, 
                                              fname_cbio=fname_cbio, 
                                              fname_rad_pred=fname_rad_pred)

Loading mapping tables


In [5]:
df_prim = obj_dx_timeline.primary_dx_clean_and_merge(fname_save=None)

In [6]:
df_prim.head(50)

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,DX_DESCRIPTION,AJCC,CLINICAL_GROUP,PATH_GROUP
25499,P-0000004,13524,,Diagnosis,Primary,Tumor Registry,"INFILTRATING DUCT CARCINOMA | BREAST, NOS (M85...",N/A ...,,
71569,P-0000004,14484,,Diagnosis,Primary,ICD Billing,BREAST CA (174.9),,,
1606,P-0000012,11539,,Diagnosis,Primary,Tumor Registry,"INFILTRATING DUCT CARCINOMA | BREAST, NOS (M85...",N/A ...,99,99
4556,P-0000012,15726,,Diagnosis,Primary,ICD Billing,BREAST CA-CENTRAL (174.1),,,
4557,P-0000012,15726,,Diagnosis,Primary,ICD Billing,"BREAST CA, IN SITU (233.0)",,,
1607,P-0000012,21138,,Diagnosis,Primary,Tumor Registry,"ADENOCARCINOMA, NOS | LUNG, NOS (M8140/3 | C349)",IIIB ...,3B,99
4559,P-0000012,21213,,Diagnosis,Primary,ICD Billing,BRONCHUS/LUNG CA (162.8),,,
4560,P-0000012,21514,,Diagnosis,Primary,ICD Billing,BRONCHUS/LUNG CA (162.9),,,
4558,P-0000012,21591,,Diagnosis,Primary,ICD Billing,UTERINE LEIOMYOMA (218.9),,,
4561,P-0000012,22886,,Diagnosis,Primary,ICD Billing,"MALIGNANT NEOPLASM OF LOWER LOBE, LEFT BRONCHU...",,,


In [7]:
df_f = obj_dx_timeline.metastatic_clean_and_merge(fname_save='/mind_data/msk-mind-datahub/msk-mind/datahub/mskimpact_test_june/data_timeline_indication_of_mets1.txt')
df_f.head(10)

Saving /mind_data/msk-mind-datahub/msk-mind/datahub/mskimpact_test_june/data_timeline_indication_of_mets1.txt
Saved.


Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
61164,P-0000004,14484,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.5 - METS - BONE METS,BONE
61165,P-0000004,14484,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.7 - METS - LIVER METS,LIVER
0,P-0000012,21180,,Diagnosis,Metastasis,MSK-IMPACT,Pathology,Neck,HEAD_AND_NECK
2957,P-0000012,21500,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.6 - METS - MET CA-PERITONEUM,INTRA_ABDOMINAL
2959,P-0000012,21874,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.6 - Miscellaneous,INTRA_ABDOMINAL
660054,P-0000015,16143,,Diagnosis,Metastasis,Radiology Report,CT,NLP Derived,BONE
22091,P-0000015,16162,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.5 - METS - BONE METS,BONE
22094,P-0000015,16162,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.0 - METS - LUNG METS,LUNG
22098,P-0000015,16177,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.2 - METS - MET CA-PLEURA,PLEURA
22093,P-0000015,16190,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.7 - METS - LIVER METS,LIVER


In [8]:
df_f = obj_dx_timeline.lymph_node_clean_and_merge(fname_save='/mind_data/msk-mind-datahub/msk-mind/datahub/mskimpact_test_june/data_timeline_ln1.txt')
df_f.head(10)

Saving /mind_data/msk-mind-datahub/msk-mind/datahub/mskimpact_test_june/data_timeline_ln1.txt
Saved.


Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
2956,P-0000012,21138,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.0 - METS - LYMPH NODE METS-NECK,LYMPH
2955,P-0000012,21213,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.1 - METS - LYMPH NODE METS-INTRATHOR,LYMPH
2958,P-0000012,21501,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,198.89 - METS - MET CANCER,OTHER
22099,P-0000015,16177,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,198.89 - METS - MET CANCER,OTHER
2310191,P-0000015,16221,,Diagnosis,Lymph Nodes,Radiology Report,PET,NLP Derived,LYMPH
22095,P-0000015,16226,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.3 - METS - LYMPH NODE METS-AXILLA,LYMPH
22096,P-0000015,16263,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.1 - METS - LYMPH NODE METS-INTRATHOR,LYMPH
2310196,P-0000015,16303,,Diagnosis,Lymph Nodes,Radiology Report,PET,NLP Derived,LYMPH
2310197,P-0000015,16303,,Diagnosis,Lymph Nodes,Radiology Report,CT,NLP Derived,LYMPH
48535,P-0000023,22227,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.2 - METS - LYMPH NODE METS-ABD,LYMPH


## Minio Setup

In [3]:
# Earlier approach, kept for reference: load the file directly as a dotenv file.
# env = load_dotenv('/mind_data/fongc2/cdm-utilities/minio_env.txt')
# Read the Minio settings file and copy the result into a plain dict for key lookup.
config = read_minio_api_config('/mind_data/fongc2/cdm-utilities/minio_env.txt')
dict_config = dict(config)

In [4]:
dict_config['MINIO_ENV']

'/mind_data/fongc2/.env'

In [5]:
# Load environment variables from whichever .env file find_dotenv() locates.
# NOTE(review): dict_config['MINIO_ENV'] also names a .env file but is not used here —
# confirm find_dotenv() resolves to the intended file.
load_dotenv(find_dotenv())
 
# Minio credentials are read from the environment; os.getenv returns None when a variable is unset.
ACCESS_KEY = os.getenv("ACCESS_KEY")
SECRET_KEY = os.getenv('SECRET_KEY')
# Bucket name passed to the obj_minio calls in this notebook.
BUCKET = 'cdm-data'

# Certificate file and host:port handed to MinioAPI.
ca_certs = '/mind_data/fongc2/certificate.crt'
url_port = "tllihpcmind6:9000"

In [6]:
obj_minio = MinioAPI(ACCESS_KEY=ACCESS_KEY, SECRET_KEY=SECRET_KEY, ca_certs=ca_certs, url_port=url_port)

In [7]:
obj_minio.print_list_objects(bucket_name=BUCKET, recursive=True, prefix="radio")

radiology/.ipynb_checkpoints/rad_emb_feat_imp-checkpoint.csv
radiology/.ipynb_checkpoints/radiology_features_embedding_avg_findings_all-checkpoint.csv
radiology/.ipynb_checkpoints/radiology_features_embedding_avg_findings_parsed-checkpoint.csv
radiology/.ipynb_checkpoints/radiology_features_embedding_avg_impressions-checkpoint.csv
radiology/aacr_predictions.csv
radiology/ddp_radiology_reports.tsv
radiology/ddp_radiology_reports.tsv.bkup
radiology/ddp_radiology_reports_char_limit.tsv
radiology/ddp_radiology_reports_full_parsed.tsv
radiology/feature_matrices/.ipynb_checkpoints/
radiology/feature_matrices/genie_lung_crc_train/.ipynb_checkpoints/genie_crc_lung_training_0604-checkpoint.csv
radiology/feature_matrices/genie_lung_crc_train/genie_crc_lung_train_adrenal_feature_matrix_0614.csv
radiology/feature_matrices/genie_lung_crc_train/genie_crc_lung_train_bone_feature_matrix_0614.csv
radiology/feature_matrices/genie_lung_crc_train/genie_crc_lung_train_cnsbrain_feature_matrix_0614.csv
radio

In [None]:
df_test = obj_minio.load_obj(bucket_name=BUCKET, 
                               path_object='sandbox/fongc2/dx_met_timeline.txt', 
                               sep='\t')
df_test.head()

## Outputs

This script creates a cBioPortal-formatted timeline file for metastatic site events.
Events are derived from four sources:
- Billing codes
- IMPACT sample info
- ICD-O registry
- Radiology reports (NLP predictions)

cBioPortal timeline file columns required:
- PATIENT_ID
- START_DATE
- STOP_DATE (For this file, can be left blank)
- EVENT_TYPE (Always "Diagnosis")
- SUBTYPE (For metastatic events, always "Metastasis")
- SOURCE (ICD Billing, MSK-IMPACT, Radiology Report)
- SOURCE_SPECIFIC (Imaging modality: CT, PET, MRI for radiology reports. Pathology for MSK-IMPACT. ICD Billing for billing codes.)
- DX_DESCRIPTION (ICD billing description, "Radiology Report", Pathology report specimen description (?))
- ANATOMIC_LOCATION (MSK-MET derived info)


In [75]:
# Leading columns shared by both timeline layouts, in output order.
_cols_timeline_shared = [
    'PATIENT_ID',
    'START_DATE',
    'STOP_DATE',
    'EVENT_TYPE',
    'SUBTYPE',
    'SOURCE',
]

# Column order for the metastasis and lymph-node timeline frames.
col_order = [
    *_cols_timeline_shared,
    'SOURCE_SPECIFIC',
    'DX_DESCRIPTION',
    'ANATOMIC_LOCATION',
]

# Column order for the primary-diagnosis timeline frames (staging columns instead of site).
col_order_prim = [
    *_cols_timeline_shared,
    'DX_DESCRIPTION',
    'AJCC',
    'CLINICAL_GROUP',
    'PATH_GROUP',
]




### Filename for output

In [20]:
# Local output location for the annotation CSV (only referenced by the commented-out
# to_csv call in the final cell).
path = '../demo_data'
fname_save_anno = 'impact2017_met_site_annotations_impact_scaled.csv'
pathfilename_out = os.path.join(path, fname_save_anno)

## Load Data
- Demographics (for gender)
- ID mapping between dmp-id and mrn
- IMPACT sample data
- IMPACT sample summary (for DOP)
- Dx Timeline from Darwin
- Radiology report predictions

### Load cbioportal clinical data file

In [8]:
# Pull the cBioPortal sample/patient ID table from Minio (tab-separated).
df_samples1 = obj_minio.load_obj(
    bucket_name=BUCKET,
    path_object='cbioportal/mskimpact_ids.tsv',
    sep='\t',
)

# Column names used in this cell and by later cells.
col_id, col_id2 = 'SAMPLE_ID', 'DMP_ID'
col_sex = 'GENDER'
col_sample_type = 'SAMPLE_TYPE'
col_prim_site, col_met_site = 'PRIMARY_SITE', 'METASTATIC_SITE'
col_icd_billing = 'ICD-9/10 Dx Code'

# cBioPortal ID headers -> the ID column names used in the rest of the notebook.
col_rep = dict(patientId=col_id2, sampleId=col_id)

cols_keep = [
    'SAMPLE_ID',
    'DMP_ID',
    'CANCER_TYPE',
    'SAMPLE_TYPE',
    'PRIMARY_SITE',
    'METASTATIC_SITE',
]

# Keep only samples labelled as metastases, restricted to the columns of interest.
df_samples = df_samples1.rename(columns=col_rep)
logic1 = df_samples[col_sample_type] == 'Metastasis'
df_samples = df_samples.loc[logic1, cols_keep]


In [9]:
df_samples1.head(1)


Unnamed: 0,sampleId,patientId,AGE_AT_SEQ_REPORTED_YEARS,ARCHER,CANCER_TYPE,CANCER_TYPE_DETAILED,CVR_TMB_COHORT_PERCENTILE,CVR_TMB_SCORE,CVR_TMB_TT_COHORT_PERCENTILE,DATE_ADDED,FRACTION_GENOME_ALTERED,GENE_PANEL,INSTITUTE,METASTATIC_SITE,MGMT_STATUS,MONTH_ADDED,MSI_COMMENT,MSI_SCORE,MSI_TYPE,MSK_SLIDE_ID,MUTATION_COUNT,ONCOTREE_CODE,PATH_SLIDE_EXISTS,PRIMARY_SITE,SAMPLE_CLASS,SAMPLE_COVERAGE,SAMPLE_TYPE,SOMATIC_STATUS,SO_COMMENTS,TUMOR_PURITY,WEEK_ADDED,WHO_GRADE
0,P-0000004-T01-IM3,P-0000004,40,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,58.6,4.5,67.8,2015/04/07,0.2782,IMPACT341,MSKCC,,,2015/04,,2.5,Stable,,4,IDC,NO,Breast,Tumor,428.0,Primary,Matched,,50,"2015, Wk. 15",


### Load demographics

In [10]:
# Load the demographics table from Minio (tab-separated).
df_demo = obj_minio.load_obj(bucket_name=BUCKET,
                             path_object='demographics/ddp_demographics.tsv',
                             sep='\t')

df_demo = mrn_zero_pad(df=df_demo, col_mrn='MRN')

# Keep only the columns needed downstream and parse the birth date.
# assign() builds a new frame, so the date conversion is never written into a
# column-subset slice (which can raise pandas' SettingWithCopyWarning).
df_demo = df_demo[['MRN', 'PT_BIRTH_DTE', 'GENDER']]
df_demo = df_demo.assign(PT_BIRTH_DTE=pd.to_datetime(df_demo['PT_BIRTH_DTE']))



In [11]:
df_demo.head(1);


### Load sample summary data

In [12]:
# Load the IMPACT sample summary from Minio (tab-separated).
df_sample_summary = obj_minio.load_obj(bucket_name=BUCKET,
                                       path_object='summary/IMPACT_Darwin_Sample_Summary.tsv',
                                       sep='\t')

# Normalise MRN: integer conversion first, then zero padding.
df_sample_summary = convert_to_int(df=df_sample_summary, list_cols=['MRN'])
df_sample_summary = mrn_zero_pad(df=df_sample_summary, col_mrn='MRN')

cols_keep = ['MRN', 'SAMPLE_ID', 'DMP_ID', 'DATE_OF_PROCEDURE_SURGICAL_EST']
df_sample_summary = df_sample_summary[cols_keep]

# Parse the procedure date with assign() rather than writing into the column-subset
# slice, which can raise pandas' SettingWithCopyWarning.
df_sample_summary = df_sample_summary.assign(
    DATE_OF_PROCEDURE_SURGICAL_EST=pd.to_datetime(df_sample_summary['DATE_OF_PROCEDURE_SURGICAL_EST'])
)

In [13]:
df_sample_summary;

### Load ID mapping

In [14]:
# Load the pathology ID mapping and reduce it to unique MRN <-> DMP_ID pairs.
df_id_map = mrn_zero_pad(
    df=obj_minio.load_obj(
        bucket_name=BUCKET,
        path_object='id_mapping/ddp_id_mapping_pathology.tsv',
        sep='\t',
    ),
    col_mrn='MRN',
)
df_id_map = df_id_map[['MRN', 'DMP_ID']].drop_duplicates()

# Drop pairs that have no DMP_ID.
has_dmp_id = df_id_map['DMP_ID'].notnull()
df_id_map = df_id_map[has_dmp_id]



In [15]:
df_id_map.head(1);


### Load Dx timeline data

In [112]:
# Load the cleaned diagnosis table (comma-separated) and zero-pad the MRN column.
df_dx_timeline = mrn_zero_pad(
    df=obj_minio.load_obj(
        bucket_name=BUCKET,
        path_object='diagnosis/table_diagnosis_clean.csv',
        sep=',',
    ),
    col_mrn='MRN',
)

df_dx_timeline['Diagnosis Date'] = pd.to_datetime(df_dx_timeline['Diagnosis Date'])

# Keep rows whose IS_NON_CANCER flag equals False. The explicit comparison is kept
# on purpose: a missing flag does not compare equal to False, so such rows are dropped too.
is_cancer_row = df_dx_timeline['IS_NON_CANCER'] == False
df_dx_timeline = df_dx_timeline[is_cancer_row]


In [113]:
df_dx_timeline.head(1)

Unnamed: 0,MRN,ICD-9/10 Dx Code,Diagnosis,ICD-O Site Code,ICD-O Site Desc,ICD-O Histology Code,ICD-O Histology Desc,Clinical Group,Path Group,AJCC,Summary,MSK Stage,Diagnosis Type,CATEGORY1,CATEGORY2,CATEGORY3,CATEGORY4,Diagnosis Description,Source,Diagnosis Date,REASON_NO_DATA,DATA_AVAILABLE_DDP,IS_MET_ICD_BILLING,IS_MET_ICD_BILLING_OTHER,IS_MET_ICD_BILLING_LN,IS_MET_ICD_BILLING_NON_LN,IS_NON_CANCER,IS_INVALID_LABEL,ICDO_INDEXED_10YR,IS_ICDO_DX,IS_STAGE_IV_ICDO,IS_SOLID_TUMOR,IS_STAGE_IV_ICDO_SOLID,IS_STAGE_IV_ICDO_LIQUID,ICDO_IS_LOCAL,ICDO_IS_REGIONAL_GENERAL,ICDO_IS_DISTANT,ICDO_IS_IN_SITU,ICDO_IS_REGIONAL_LYMPH,ICDO_IS_REGIONAL_DIRECT_EXT,ICDO_IS_UNSTAGED,ICDO_IS_STAGE_NA,ICDO_IS_DISTANT_SOLID,ICDO_IS_DISTANT_LIQUID,IS_MALIGNANT_PRIMARY
0,94,C43.9,"MALIGNANT MELANOMA OF SKIN, UNSPECIFIED",,,,,,,,,,CA,Skin excluding Basal and Squamous,Melanoma of the Skin,Melanoma of the Skin,Melanoma of the Skin,Melanoma of the Skin,Billing,2016-09-19,,True,False,False,False,False,False,False,,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True


#### Tumor Registry (Add this to Primary Dx timeline)

In [114]:
cols_dx_treg = ['MRN', 'Diagnosis Date',
                'IS_STAGE_IV_ICDO_SOLID', 'ICDO_IS_DISTANT_SOLID']

# Tumor-registry rows only.
is_treg = df_dx_timeline['Source'] == 'Tumor Registry'
df_dx_treg = df_dx_timeline.loc[is_treg]

# "<histology> | <site>" strings, built once from the descriptions and once from the codes.
dx_desc = df_dx_treg['ICD-O Histology Desc'] + ' | ' + df_dx_treg['ICD-O Site Desc']
source = df_dx_treg['ICD-O Histology Code'] + ' | ' + df_dx_treg['ICD-O Site Code']

cols_dict = {'Clinical Group': 'CLINICAL_GROUP',
             'Path Group': 'PATH_GROUP'}
cols_keep = ['MRN', 'Diagnosis Date', 'DX_DESCRIPTION', 'SOURCE', 'CLINICAL_GROUP', 'PATH_GROUP', 'AJCC']

# Attach the derived strings, rename the staging-group columns, and trim to the kept columns.
df_dx_treg = (
    df_dx_treg
    .assign(DX_DESCRIPTION=dx_desc, SOURCE=source)
    .rename(columns=cols_dict)
    [cols_keep]
)

In [115]:
df_dx_treg.head(1)


Unnamed: 0,MRN,Diagnosis Date,DX_DESCRIPTION,SOURCE,CLINICAL_GROUP,PATH_GROUP,AJCC
2,94,2016-08-26,"MALIGNANT MELANOMA, NOS | SKIN, NOS",M8720/3 | C449,3,99,N/A ...


#### Billing codes
##### Metastatic disease

In [116]:
cols_dx_billing = ['MRN', 'ICD-9/10 Dx Code', 'Diagnosis Description', 'Diagnosis Date']

# Rows that are both typed 'METS' and sourced from billing.
logic1 = df_dx_timeline['Diagnosis Type'] == 'METS'
logic2 = df_dx_timeline['Source'] == 'Billing'
logics = logic1 & logic2

df_dx_billing = df_dx_timeline.loc[logics, cols_dx_billing]


In [117]:
df_dx_billing.head(1);


##### Primary dx

In [118]:
cols_keeps = ['MRN', 'Diagnosis Date']

# Rows that are both typed 'CA' and sourced from billing.
logic1 = df_dx_timeline['Diagnosis Type'] == 'CA'
logic2 = df_dx_timeline['Source'] == 'Billing'
logics = logic1 & logic2
df_dx_timeline_prim = df_dx_timeline.loc[logics].copy()

# Diagnosis text becomes DX_DESCRIPTION and the ICD code becomes SOURCE.
dx_desc = df_dx_timeline_prim['Diagnosis']
source = df_dx_timeline_prim['ICD-9/10 Dx Code']

df_dx_timeline_prim = df_dx_timeline_prim[cols_keeps].assign(
    DX_DESCRIPTION=dx_desc,
    SOURCE=source,
)





In [119]:
df_dx_timeline_prim.head(1)

Unnamed: 0,MRN,Diagnosis Date,DX_DESCRIPTION,SOURCE
0,94,2016-09-19,"MALIGNANT MELANOMA OF SKIN, UNSPECIFIED",C43.9


### Load Radiology report met predictions

In [23]:
# Load the radiology-report predictions (comma-separated).
df_rad_pred = obj_minio.load_obj(
    bucket_name=BUCKET,
    path_object=fname_rad_pred,
    sep=',',
)

# Parse the exam date, then discard the accession number column.
df_rad_pred['RADIOLOGY_PERFORMED_DATE'] = pd.to_datetime(df_rad_pred['RADIOLOGY_PERFORMED_DATE'])
df_rad_pred = df_rad_pred.drop(columns=['ACCESSION_NUMBER'])


In [24]:
df_rad_pred.head()


Unnamed: 0,DMP_ID,RADIOLOGY_PERFORMED_DATE,PROCEDURE_TYPE,Adrenal_Glands,Biliary_Tract,Bone,Bowel,CNS_Brain,Liver,Lung,Lymph_Nodes,Mediastinum,Intra.Abdominal,Pleura,Reproductive_Organs
0,P-0000015,2013-10-09,CT,0,0,1,0,0,0,0,0,0,0,0,0
1,P-0000015,2013-12-16,CT,0,0,1,0,0,1,0,0,1,0,1,0
2,P-0000015,2013-12-26,PET,0,0,1,0,0,1,0,1,0,0,1,0
3,P-0000015,2014-01-02,MR,0,0,1,0,0,0,0,0,0,0,0,0
4,P-0000015,2014-01-03,MR,0,0,1,0,0,0,0,0,0,0,0,0


## Merge and clean input data
- Add date of birth and sex/gender to dataframes
- Add impact IDs
- Select important columns

### Sample data

In [37]:
# Attach MRN / procedure date to every metastatic sample (right join keeps all rows of
# df_samples), then attach birth date and gender by MRN.
df_samples_f = df_sample_summary.merge(right=df_samples, how='right', on=['SAMPLE_ID', 'DMP_ID'])
df_samples_f = df_samples_f.merge(right=df_demo, how='left', on='MRN')

# Age in days at the estimated procedure date.
age_dop = (df_samples_f['DATE_OF_PROCEDURE_SURGICAL_EST'] - df_samples_f['PT_BIRTH_DTE']).dt.days
df_samples_f = df_samples_f.assign(AGE_DX=age_dop)

# Drop samples whose age could not be computed. The explicit copy() makes the filtered
# frame independent, so the integer cast below is not a chained assignment on a slice
# (which raises pandas' SettingWithCopyWarning).
df_samples_f = df_samples_f[df_samples_f['AGE_DX'].notnull()].copy()
df_samples_f['AGE_DX'] = df_samples_f['AGE_DX'].astype(int)

# Remove the identifying columns once the age has been derived.
df_samples_f = df_samples_f.drop(columns=['MRN', 'DATE_OF_PROCEDURE_SURGICAL_EST', 'PT_BIRTH_DTE'])


In [38]:
print(df_samples_f.shape)
df_samples_f.head(1)


(25105, 8)


Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,SAMPLE_TYPE,PRIMARY_SITE,METASTATIC_SITE,GENDER,AGE_DX
0,P-0000012-T03-IM3,P-0000012,Non-Small Cell Lung Cancer,Metastasis,Lung,Neck,FEMALE,21180


### Billing code data

#### Mets and LN

In [120]:
# MRN -> DMP_ID lookup taken from the sample summary (replaces any earlier df_id_map).
df_id_map = df_sample_summary[['DMP_ID', 'MRN']].drop_duplicates()

# Billing rows + demographics, then DMP_ID attached by MRN (right join keeps every billing row).
df_dx_billing_f = df_id_map.merge(
    right=df_dx_billing.merge(right=df_demo, how='left', on='MRN'),
    how='right',
    on='MRN',
)

# Age in days at diagnosis, then drop the date and MRN columns.
age_dx = (df_dx_billing_f['Diagnosis Date'] - df_dx_billing_f['PT_BIRTH_DTE']).dt.days
df_dx_billing_f = (
    df_dx_billing_f
    .assign(AGE_DX=age_dx)
    .drop(columns=['Diagnosis Date', 'PT_BIRTH_DTE', 'MRN'])
)

# Keep only rows that resolved to a DMP_ID.
has_dmp_id = df_dx_billing_f['DMP_ID'].notnull()
df_dx_billing_f = df_dx_billing_f[has_dmp_id]

In [121]:
df_dx_billing_f.head(1)

Unnamed: 0,DMP_ID,ICD-9/10 Dx Code,Diagnosis Description,GENDER,AGE_DX
0,P-0015526,C77.0,Miscellaneous,MALE,17267


#### Primary dx

In [122]:
# MRN -> DMP_ID lookup taken from the sample summary.
df_id_map = df_sample_summary[['DMP_ID', 'MRN']].drop_duplicates()

# Billing primary diagnoses + demographics, then DMP_ID attached by MRN.
df_dx_billing_prim_f = df_id_map.merge(
    right=df_dx_timeline_prim.merge(right=df_demo, how='left', on='MRN'),
    how='right',
    on='MRN',
)

# START_DATE is the age in days at the diagnosis date.
age_dx = (df_dx_billing_prim_f['Diagnosis Date'] - df_dx_billing_prim_f['PT_BIRTH_DTE']).dt.days
df_dx_billing_prim_f = (
    df_dx_billing_prim_f
    .assign(START_DATE=age_dx)
    .drop(columns=['Diagnosis Date', 'PT_BIRTH_DTE', 'MRN'])
)

# Keep rows with a DMP_ID, add the constant timeline columns, and order the columns.
has_dmp_id = df_dx_billing_prim_f['DMP_ID'].notnull()
df_dx_billing_prim_f = (
    df_dx_billing_prim_f[has_dmp_id]
    .rename(columns={'DMP_ID': 'PATIENT_ID'})
    .assign(
        STOP_DATE='',
        EVENT_TYPE='Diagnosis',
        SUBTYPE='ICD Billing',
        AJCC='',
        CLINICAL_GROUP='',
        PATH_GROUP='',
    )
    [col_order_prim]
)


In [123]:
df_dx_billing_prim_f.head(1)

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,DX_DESCRIPTION,AJCC,CLINICAL_GROUP,PATH_GROUP
0,P-0015526,17267,,Diagnosis,ICD Billing,C43.9,"MALIGNANT MELANOMA OF SKIN, UNSPECIFIED",,,


### Tumor registry data

In [124]:
# MRN -> DMP_ID lookup taken from the sample summary.
df_id_map = df_sample_summary[['DMP_ID', 'MRN']].drop_duplicates()

# Tumor-registry rows + demographics, then DMP_ID attached by MRN.
df_dx_treg_f = df_id_map.merge(
    right=df_dx_treg.merge(right=df_demo, how='left', on='MRN'),
    how='right',
    on='MRN',
)

# Age in days at diagnosis, computed before the date columns are dropped; it is
# re-attached below by index, so the row filter in between does not misalign it.
age_dx = (df_dx_treg_f['Diagnosis Date'] - df_dx_treg_f['PT_BIRTH_DTE']).dt.days
df_dx_treg_f = df_dx_treg_f.drop(columns=['MRN', 'GENDER', 'Diagnosis Date', 'PT_BIRTH_DTE'])

has_dmp_id = df_dx_treg_f['DMP_ID'].notnull()
df_dx_treg_f = (
    df_dx_treg_f[has_dmp_id]
    .rename(columns={'DMP_ID': 'PATIENT_ID'})
    .assign(
        START_DATE=age_dx,
        STOP_DATE='',
        EVENT_TYPE='Diagnosis',
        SUBTYPE='Tumor Registry',
        SOURCE='Tumor Registry',  # replaces the ICD-O code string held in SOURCE
    )
    [col_order_prim]
)


In [125]:
df_dx_treg_f.head(1)

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,DX_DESCRIPTION,AJCC,CLINICAL_GROUP,PATH_GROUP
0,P-0015526,17243,,Diagnosis,Tumor Registry,Tumor Registry,"MALIGNANT MELANOMA, NOS | SKIN, NOS",N/A ...,3,99


In [127]:
pd.concat([df_dx_treg_f, df_dx_billing_prim_f], axis=0, sort=False).sort_values(by=['PATIENT_ID', 'START_DATE'])

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,DX_DESCRIPTION,AJCC,CLINICAL_GROUP,PATH_GROUP
25499,P-0000004,13524,,Diagnosis,Tumor Registry,Tumor Registry,"INFILTRATING DUCT CARCINOMA | BREAST, NOS",N/A ...,,
47612,P-0000004,14484,,Diagnosis,ICD Billing,174.9,BREAST CA,,,
1606,P-0000012,11539,,Diagnosis,Tumor Registry,Tumor Registry,"INFILTRATING DUCT CARCINOMA | BREAST, NOS",N/A ...,99,99
2866,P-0000012,15726,,Diagnosis,ICD Billing,174.1,BREAST CA-CENTRAL,,,
1607,P-0000012,21138,,Diagnosis,Tumor Registry,Tumor Registry,"ADENOCARCINOMA, NOS | LUNG, NOS",IIIB ...,3B,99
...,...,...,...,...,...,...,...,...,...,...
32459,P-0079240,23963,,Diagnosis,ICD Billing,193,THYROID CA,,,
17803,P-0079240,24865,,Diagnosis,Tumor Registry,Tumor Registry,ESSENTIAL THROMBOCYTHEMIA | BONE MARROW,NA ...,88,88
78710,P-0079241,23700,,Diagnosis,Tumor Registry,Tumor Registry,MULTIPLE MYELOMA | BONE MARROW,N/A ...,88,88
130440,P-0079241,23861,,Diagnosis,ICD Billing,C90.00,MULTIPLE MYELOMA NOT HAVING ACHIEVED REMISSION,,,


### Radiology report predictions

In [77]:
# Radiology prediction column -> organ-site label used in ANATOMIC_LOCATION.
dict_cols = {'Adrenal_Glands': 'ADRENAL_GLAND',
             'Biliary_Tract': 'BILIARY_TRACT',
             'Bone': 'BONE',
             'Bowel': 'BOWEL',
             'CNS_Brain': 'CNS_BRAIN',
             'Liver': 'LIVER',
             'Lung': 'LUNG',
             'Lymph_Nodes': 'LYMPH',
             'Mediastinum': 'MEDIASTINUM',
             'Intra.Abdominal': 'INTRA_ABDOMINAL',
             'Pleura': 'PLEURA'
            }
# Sites with no radiology prediction column: 'BLADDER_OR_URINARY_TRACT', 'BREAST',
# 'HEAD_AND_NECK', 'KIDNEY', 'OTHER', 'OVARY', 'PERIPHERAL_NERVOUS_SYSTEM', 'SKIN'.
# GENITAL_MALE / GENITAL_FEMALE are derived below from 'Reproductive_Organs' and GENDER.
cols_organs = list(dict_cols.values()) + ['GENITAL_FEMALE', 'GENITAL_MALE']

# Attach MRN (via the sample summary) and then birth date / gender to every prediction row.
df_id_map = df_sample_summary[['DMP_ID', 'MRN']].drop_duplicates()
df_rad_pred_f = df_id_map.merge(right=df_rad_pred, how='right', on='DMP_ID')
df_rad_pred_f = df_rad_pred_f.merge(right=df_demo, how='left', on='MRN')

# Age in days at the exam date.
age_dx = (df_rad_pred_f['RADIOLOGY_PERFORMED_DATE'] - df_rad_pred_f['PT_BIRTH_DTE']).dt.days
df_rad_pred_f = df_rad_pred_f.assign(AGE_DX=age_dx)

# Split the single reproductive-organ prediction by gender. Rows whose GENDER is neither
# 'MALE' nor 'FEMALE' (including missing) get False for both flags.
df_rad_pred_f = df_rad_pred_f.rename(columns=dict_cols)
GENITAL_MALE = (df_rad_pred_f['Reproductive_Organs'] == 1) & (df_rad_pred_f['GENDER'] == 'MALE')
GENITAL_FEMALE = (df_rad_pred_f['Reproductive_Organs'] == 1) & (df_rad_pred_f['GENDER'] == 'FEMALE')
df_rad_pred_f = df_rad_pred_f.assign(GENITAL_MALE=GENITAL_MALE)
df_rad_pred_f = df_rad_pred_f.assign(GENITAL_FEMALE=GENITAL_FEMALE)

cols_id = ['DMP_ID', 'AGE_DX', 'PROCEDURE_TYPE']

# Wide -> long: one row per (report, organ site); 'value' mixes 0/1 ints and booleans,
# so cast to int before filtering.
df_rad_pred_f_melt = pd.melt(frame=df_rad_pred_f, id_vars=cols_id, value_vars=cols_organs,
                             var_name='ANATOMIC_LOCATION', value_name='value')
df_rad_pred_f_melt['value'] = df_rad_pred_f_melt['value'].astype(int)

df_rad_pred_f_melt = df_rad_pred_f_melt.rename(columns={'PROCEDURE_TYPE': 'SOURCE_SPECIFIC',
                                                        'AGE_DX': 'START_DATE',
                                                        'DMP_ID': 'PATIENT_ID',
                                                        })

# Keep positive predictions only.
df_rad_pred_f_melt = df_rad_pred_f_melt[df_rad_pred_f_melt['value'] == 1]
df_rad_pred_f_melt = df_rad_pred_f_melt.drop(columns=['value'])

# Constant timeline columns. SOURCE is set once to 'Radiology Report'; the earlier
# 'MSK-IMPACT' value was always overwritten and has been removed.
df_rad_pred_f_melt = df_rad_pred_f_melt.assign(SOURCE='Radiology Report',
                                               STOP_DATE='',
                                               EVENT_TYPE='Diagnosis',
                                               DX_DESCRIPTION='NLP Derived')





#### Non-LN Organs

In [81]:
# Radiology events at every site except lymph nodes, labelled as metastases.
is_lymph = df_rad_pred_f_melt['ANATOMIC_LOCATION'].isin(['LYMPH'])
df_rad_pred_f_melt_vis = (
    df_rad_pred_f_melt[~is_lymph]
    .assign(SUBTYPE='Metastasis')
    [col_order]
)

In [82]:
df_rad_pred_f_melt_vis.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
908,P-0000088,23360,,Diagnosis,Metastasis,Radiology Report,CT,NLP Derived,ADRENAL_GLAND
1017,P-0000103,17647,,Diagnosis,Metastasis,Radiology Report,CT,NLP Derived,ADRENAL_GLAND
1535,P-0000133,30226,,Diagnosis,Metastasis,Radiology Report,CT,NLP Derived,ADRENAL_GLAND
1536,P-0000133,30305,,Diagnosis,Metastasis,Radiology Report,CT,NLP Derived,ADRENAL_GLAND
1650,P-0000149,18527,,Diagnosis,Metastasis,Radiology Report,PET,NLP Derived,ADRENAL_GLAND


#### Lymph node annotations

In [84]:
# Radiology events located in lymph nodes, labelled as the lymph-node subtype.
is_lymph = df_rad_pred_f_melt['ANATOMIC_LOCATION'].isin(['LYMPH'])
df_rad_pred_f_melt_ln = (
    df_rad_pred_f_melt[is_lymph]
    .assign(SUBTYPE='Lymph Nodes')
    [col_order]
)

In [85]:
df_rad_pred_f_melt_ln.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
2310191,P-0000015,16221,,Diagnosis,Lymph Nodes,Radiology Report,PET,NLP Derived,LYMPH
2310196,P-0000015,16303,,Diagnosis,Lymph Nodes,Radiology Report,PET,NLP Derived,LYMPH
2310197,P-0000015,16303,,Diagnosis,Lymph Nodes,Radiology Report,CT,NLP Derived,LYMPH
2310287,P-0000026,25470,,Diagnosis,Lymph Nodes,Radiology Report,CT,NLP Derived,LYMPH
2310295,P-0000026,26192,,Diagnosis,Lymph Nodes,Radiology Report,CT,NLP Derived,LYMPH


## Load Mapping Objects

### Load mapping tables

In [12]:
# Load RDN mapping
# Every path and filename comes from the organ-site constants module (`const`).
obj_met_map_rdn = MetastaticSpreadMappingRND(path=const.pathname,
                                             fname_all_sites=const.fname_mapping_rdn_all_sites,
                                             fname_hematogenous=const.fname_mapping_rdn_hematogenous,
                                             fname_localext=const.fname_mapping_rdn_localext,
                                             fname_lymphatic=const.fname_mapping_rdn_lymphatic,
                                             fname_site_map=const.fname_mapping_rdn_site_map,
                                             fname_billing_map=const.fname_mapping_rdn_billing_map,
                                             fname_billing_code_dict=const.fname_mapping_rdn_to_billing_codes)


Loading mapping tables


In [13]:
const.pathname


'/mind_data/fongc2/organ-site-mapping/mappings'

### Load mapping object

In [14]:
# Load annotations object (wraps the RDN mapping tables loaded above)
obj_mapping = OrganMappingAnalysisRND(obj_met_map=obj_met_map_rdn)


In [15]:
obj_met_map_rdn.df_map_icd_mapping['tissue_icd_billing'].unique()


array(['ADRENAL_GLAND', 'BILIARY_TRACT', 'BLADDER_OR_URINARY_TRACT',
       'BONE', 'BOWEL', 'BREAST', 'CNS_BRAIN', 'GENITAL_MALE',
       'GENITAL_FEMALE', 'HEAD_AND_NECK', 'KIDNEY', 'LIVER', 'LUNG',
       'LYMPH', 'MEDIASTINUM', 'OTHER', 'OVARY',
       'PERIPHERAL_NERVOUS_SYSTEM', 'INTRA_ABDOMINAL', 'PLEURA', 'SKIN'],
      dtype=object)

## Apply mappings and format frames into cBioPortal timeline format
### Clinical Samples Table from cBioPortal

In [49]:
# Annotate IMPACT sample site data with the organ-site mapping.
df_met_sites_impact = obj_mapping.annotate_mapping_impact_met_samples(df_samples=df_samples_f,
                                                                      col_primary_site=col_prim_site,
                                                                      col_met_site=col_met_site,
                                                                      label_dist_ln=True)

cols_keep = ['DMP_ID', 'AGE_DX', 'METASTATIC_SITE_BILLING_RDN', 'METASTATIC_SITE']
df_met_sites_impact = df_met_sites_impact[cols_keep]

# Constant timeline columns (SOURCE was previously assigned twice with the same value).
df_met_sites_impact = df_met_sites_impact.assign(SOURCE='MSK-IMPACT',
                                                 STOP_DATE='',
                                                 EVENT_TYPE='Diagnosis',
                                                 SOURCE_SPECIFIC='Pathology')

# Rename to the cBioPortal timeline column names.
df_met_sites_impact = df_met_sites_impact.rename(columns={'DMP_ID': 'PATIENT_ID',
                                                          'METASTATIC_SITE_BILLING_RDN': 'ANATOMIC_LOCATION',
                                                          'AGE_DX': 'START_DATE',
                                                          'METASTATIC_SITE': 'DX_DESCRIPTION'
                                                         })


In [11]:
df_met_sites_impact.head()

NameError: name 'df_met_sites_impact' is not defined

#### Non-LN organs

In [51]:
# IMPACT sample events at every site except lymph nodes, labelled as metastases.
is_lymph = df_met_sites_impact['ANATOMIC_LOCATION'].isin(['LYMPH'])
df_met_sites_impact_vis = (
    df_met_sites_impact[~is_lymph]
    .assign(SUBTYPE='Metastasis')
    [col_order]
)

#### Lymph node annotations

In [53]:
# IMPACT sample events located in lymph nodes, labelled as the lymph-node subtype.
is_lymph = df_met_sites_impact['ANATOMIC_LOCATION'].isin(['LYMPH'])
df_met_sites_impact_ln = (
    df_met_sites_impact[is_lymph]
    .assign(SUBTYPE='Lymph Nodes')
    [col_order]
)

In [10]:
df_met_sites_impact_ln.head()

NameError: name 'df_met_sites_impact_ln' is not defined

### ICD Billing table 

In [None]:
# Annotate the ICD billing diagnosis table with Renzo's mapping of metastatic sites.
df_met_sites_dx = obj_mapping.annotate_icd_billing_met_dx(
    df_dx_mets=df_dx_billing_f,
    col_icd_billing=col_icd_billing,
    col_sex=col_sex,
    add_msk_met_anno=True,
)

# Prefix each description with its ICD code: "<code> - <description>".
icd_code = df_met_sites_dx['ICD-9/10 Dx Code']
df_met_sites_dx['Diagnosis Description'] = icd_code + ' - ' + df_met_sites_dx['Diagnosis Description']

# Drop the columns no longer needed, add the constant timeline columns, and rename
# to the cBioPortal timeline column names.
df_met_sites_dx = (
    df_met_sites_dx
    .drop(columns=['ICD-9/10 Dx Code', 'GENDER', 'METASTATIC_SITE_RDN_MAP'])
    .assign(
        SOURCE='ICD Billing',
        STOP_DATE='',
        EVENT_TYPE='Diagnosis',
        SOURCE_SPECIFIC='ICD Billing',
    )
    .rename(columns={
        'DMP_ID': 'PATIENT_ID',
        'tissue_icd_billing': 'ANATOMIC_LOCATION',
        'AGE_DX': 'START_DATE',
        'Diagnosis Description': 'DX_DESCRIPTION',
    })
)
df_met_sites_dx

In [None]:
print(df_dx_billing_f.shape)
print(df_met_sites_dx.shape)


#### Non-LN organs

In [None]:
# Billing events whose site is neither 'LYMPH' nor 'OTHER', labelled as metastases.
is_ln_or_other = df_met_sites_dx['ANATOMIC_LOCATION'].isin(['LYMPH', 'OTHER'])
df_met_sites_dx_vis = (
    df_met_sites_dx[~is_ln_or_other]
    .assign(SUBTYPE='Metastasis')
    [col_order]
)

In [None]:
df_met_sites_dx_vis.head()

#### Lymph node annotations

In [None]:
# Billing events whose site is 'LYMPH' or 'OTHER', labelled as the lymph-node subtype.
is_ln_or_other = df_met_sites_dx['ANATOMIC_LOCATION'].isin(['LYMPH', 'OTHER'])
df_met_sites_dx_ln = (
    df_met_sites_dx[is_ln_or_other]
    .assign(SUBTYPE='Lymph Nodes')
    [col_order]
)

In [None]:
# df_met_rdn_anno.head(50)
# t = df_met_rdn_anno.groupby('DMP_ID')['CANCER_TYPE'].nunique()
# df_met_rdn_anno[df_met_rdn_anno['DMP_ID'].isin(t[t > 1].index)].head(50)

df_met_sites_dx_ln.head()

## Merge dataframes

In [None]:
# Stack the non-lymph-node metastasis events from every source. Each `*_vis` frame already
# carries SUBTYPE='Metastasis' and the `col_order` columns. The unfiltered
# `df_met_sites_impact` used previously had no SUBTYPE column and still contained LYMPH
# rows, and the radiology-report events were left out of the merged timeline.
df_mets_f = pd.concat([df_met_sites_impact_vis,
                       df_met_sites_dx_vis,
                       df_rad_pred_f_melt_vis],
                      axis=0, sort=False)

# Order events per patient by age at event and then by site; rebuild a clean 0..n-1 index.
df_mets_f = df_mets_f.sort_values(by=['PATIENT_ID', 'START_DATE', 'ANATOMIC_LOCATION']).reset_index(drop=True)

In [None]:
df_mets_f.head(50)

## Save annotations

In [None]:
# Save RDN annotations
# Local CSV export is disabled; the merged timeline is written to Minio as a tab-separated object.
# df_mets_f.to_csv(pathfilename_out, index=False)
obj_minio.save_obj(df=df_mets_f, bucket_name=BUCKET, path_object='sandbox/fongc2/dx_met_timeline.txt', sep='\t')
