# met_site_data_creation_clinical.py

### By Chris Fong - MSKCC 2020

This script will create a standardized table of metastatic sites for a given
- Diagnosis table containing ICD billing codes and 
- Clinical sample table downloaded from the cohort tab in cBioPortal 

The script can compute whether a distant LN metastasis occurred for that patient, given their cancer type according to IMPACT data. Similarly, local extension is computed for visceral organs.

If a patient has multiple diagnoses (dx) from IMPACT, more than one row will exist for that patient, and the extra rows should be removed.



## Load Libraries

In [1]:
import sys  
sys.path.insert(0, 'mappings')
sys.path.insert(0, 'analysis')
import os
import pandas as pd
import constants_o_sites as const
from organ_mapping_analysis import OrganMappingAnalysisRND
from organ_mapping_rdn_processing import MetastaticSpreadMappingRND

In [2]:
# Console settings
def set_debug_console():
    """Widen pandas' console display so large frames print without truncation.

    Sets process-wide pandas display options: up to 250 rows and 500 columns
    are shown, the console width is 1000 characters, and wide frames are not
    wrapped across several blocks.
    """
    # display.width used to be set twice (320, then 1000); only the last
    # value ever took effect, so it is set once here.
    display_options = {
        'display.max_rows': 250,
        'display.max_columns': 500,
        'display.width': 1000,
        'display.expand_frame_repr': False,
    }
    for option, value in display_options.items():
        pd.set_option(option, value)


set_debug_console()

## Load Data

### Define columns being used

In [18]:
# File the annotated metastatic-site table is saved to
fname_save_anno = 'metatrop_met_site_annotations_clinical.csv'

# Identifier columns: sample-level and patient-level
col_id, col_id2 = 'SAMPLE_ID', 'DMP_ID'

# Diagnosis-table column holding the ICD billing code
col_icd_billing = 'ICD-9/10 Dx Code'

# Clinical sample-table columns (names used after renaming the cBioPortal headers)
col_sex = 'SEX'
col_ct, col_sample_type = 'CANCER_TYPE', 'SAMPLE_TYPE'
col_prim_site, col_met_site = 'PRIMARY_SITE', 'METASTATIC_SITE'

### Load clinical sample file from cbioportal

In [4]:
# --- cBioPortal clinical sample table ---------------------------------
# Directory holding the input tables; reused by later cells via `path`.
path = '/Users/fongc2/Documents/github/MSK/DARWIN_ETL/data'
fname = 'mskimpact_clinical_data.tsv'
pathfilename1 = os.path.join(path, fname)

# Tab-separated export with a single header row
df_samples1 = pd.read_csv(pathfilename1, sep='\t', header=0, low_memory=False)

# Map the cBioPortal display headers onto the column names used in this script
col_rep = {
    'Patient ID': col_id2,
    'Sample ID': col_id,
    'Sex': col_sex,
    'Cancer Type': col_ct,
    'Primary Tumor Site': col_prim_site,
    'Metastatic Site': col_met_site,
    'Sample Type': col_sample_type,
}
df_samples1 = df_samples1.rename(columns=col_rep)

# Keep only the renamed columns, in the order they are listed in col_rep
cols_keep = list(col_rep.values())
df_samples = df_samples1[cols_keep]

In [5]:
df_samples.head()

Unnamed: 0,DMP_ID,SAMPLE_ID,SEX,CANCER_TYPE,PRIMARY_SITE,METASTATIC_SITE,SAMPLE_TYPE
0,P-0000004,P-0000004-T01-IM3,Female,Breast Cancer,Breast,,Primary
1,P-0000012,P-0000012-T02-IM3,Female,Breast Cancer,Breast,,Primary
2,P-0000012,P-0000012-T03-IM3,Female,Non-Small Cell Lung Cancer,Lung,Neck,Metastasis
3,P-0000012,P-0000012-T04-IM6,Female,Non-Small Cell Lung Cancer,Lung,,Primary
4,P-0000015,P-0000015-T01-IM3,Female,Breast Cancer,Breast,Liver,Metastasis


### Load ICD Billing Patient Diagnosis Data 

In [6]:
# Load met sites directly from diagnosis
# Reads the cleaned diagnosis table from the same directory (`path`) as the
# clinical sample file and keeps only rows flagged as ICD-billing metastases.
fname = 'table_diagnosis_clean.csv'
pathfilename1 = os.path.join(path, fname)
df_dx = pd.read_csv(pathfilename1, header=0, low_memory=False, sep=',')
df_dx_mets1 = df_dx[df_dx['IS_MET_ICD_BILLING'] == True]
list_col_index = ['DMP_ID', 'AGE_AT_EVENT', col_icd_billing, 'DX_DESCRIPTION']
# NOTE(review): the column subset is NOT applied here. A stray markdown heading
# was pasted between `df_dx_mets1` and `[list_col_index]`, which turned the
# subscript into a comment. The line below is therefore a no-op assignment,
# `list_col_index` is unused, and every column of the diagnosis table is
# carried into the downstream tables. Confirm whether
# `df_dx_mets1[list_col_index]` was intended before restoring it, since doing
# so changes the columns of everything derived from this frame.
df_dx_mets1 = df_dx_mets1  # [list_col_index]  <- currently commented out

#### Add column for sex

In [7]:
# Add column for sex.
# A patient can have several rows in the sample table, so it is reduced to
# unique (patient, sex) pairs first. The left join keeps every diagnosis row.
# The join key is the patient-id constant rather than a repeated literal, so
# it stays in sync with the columns selected on the right-hand side.
df_dx_mets = df_dx_mets1.merge(
    right=df_samples[[col_id2, col_sex]].drop_duplicates(),
    how='left',
    on=col_id2,
)

In [8]:
df_dx_mets.head()

Unnamed: 0,DMP_ID,AGE_AT_EVENT,AGE_AT_FIRST_RECUR,DX_YEAR,DX_DESCRIPTION,IS_MALIGNANT_PRIMARY,IS_ICDO_DX,IS_RECURRENT,IS_STAGE_IV_ICDO,IS_MET_ICD_BILLING,IS_MET_ICD_BILLING_NON_LN,IS_MET_ICD_BILLING_LN,IS_MET_ICD_BILLING_OTHER,IS_NON_CANCER,IS_INVALID_LABEL,CASE_STATUS_DESC,TM_DX_CONFRM_DESC,ICDO_IS_LOCAL,ICDO_IS_REGIONAL_GENERAL,ICDO_IS_DISTANT,ICDO_IS_IN_SITU,ICDO_IS_REGIONAL_LYMPH,ICDO_IS_REGIONAL_DIRECT_EXT,CATEGORY1,CATEGORY2,CATEGORY3,CATEGORY4,TM_CLIN_TNM_M,TM_PATH_TNM_M,TM_MSTAGE_DESC,TM_SITE_DESC,TM_CLIN_TNM_T,TM_TSTAGE_DESC,MSK Stage,PT_BIRTH_DTE,ICDO_IS_STAGE_NA,TM_LATERALITY_DESC,TUMOR_MSK_CLASS_OF_CASE,TM_TUMOR_SEQ_DESC,TM_CSMETDX,TM_PATH_TEXT,REASON_NO_DATA,IS_M1_PATH_ICDO,AGE_AT_TM_NON_CA_SURG_DATE_IN_DAYS,TM_PATH_TNM_T,TM_NSTAGE_DESC,Path Group,ICD-O Site Code,TM_CLIN_STG_GRP,AGE_AT_TM_CA_SURG_DATE_IN_DAYS,AJCC,Source,TM_RESID_TUMOR_DESC,ICDO_IS_UNSTAGED,ICD-O Site Desc,IS_M1_CLINICAL_ICDO,TM_MSK_STG,Clinical Group,TM_PATH_STG_GRP,TM_NON_CA_SURG_SUM_DESC,TM_AJCC,TM_CSMETEV,TM_SURG_TEXT,TM_AJCC_DESC,ICD-9/10 Dx Code,TM_CLIN_TNM_N,DATA_AVAILABLE_DDP,TM_TUMOR_SEQ,ICD-O Histology Code,TM_PATH_TNM_N,AGE_AT_FIRST_ICD9_DAYS,SEX
0,P-0000004,14484.0,,,[u'197.7 - LIVER METS'],False,False,False,False,True,True,False,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1974-04-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,197.7,,True,,,,14484.0,Female
1,P-0000004,14484.0,,,[u'198.5 - BONE METS'],False,False,False,False,True,True,False,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1974-04-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,198.5,,True,,,,14484.0,Female
2,P-0000012,21138.0,,,[u'196.0 - LYMPH NODE METS-NECK'],False,False,False,False,True,False,True,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1955-11-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,196.0,,True,,,,21138.0,Female
3,P-0000012,21213.0,,,[u'196.1 - LYMPH NODE METS-INTRATHOR'],False,False,False,False,True,False,True,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1955-11-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,196.1,,True,,,,21213.0,Female
4,P-0000012,21500.0,,,[u'197.6 - MET CA-PERITONEUM'],False,False,False,False,True,True,False,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1955-11-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,197.6,,True,,,,21500.0,Female


## Load Mapping Objects

### Load mapping tables

In [9]:
# Load RDN mapping: every mapping-table location comes from the constants module
obj_met_map_rdn = MetastaticSpreadMappingRND(
    path=const.pathname,
    fname_all_sites=const.fname_mapping_rdn_all_sites,
    fname_hematogenous=const.fname_mapping_rdn_hematogenous,
    fname_localext=const.fname_mapping_rdn_localext,
    fname_lymphatic=const.fname_mapping_rdn_lymphatic,
    fname_site_map=const.fname_mapping_rdn_site_map,
    fname_billing_map=const.fname_mapping_rdn_billing_map,
    fname_billing_code_dict=const.fname_mapping_rdn_to_billing_codes,
)

Loading mapping tables


### Load mapping object

In [10]:
# Load annotations object, built on top of the RDN mapping object created earlier
obj_mapping = OrganMappingAnalysisRND(obj_met_map=obj_met_map_rdn)

## Create Mapping based on Clinical Sample Table (Primary Site) and ICD Billing Codes (Metastatic Site)

In [13]:
# Annotate the diagnosis table of ICD billing codes with Renzo's mapping of metastatic sites
df_met_sites_dx = obj_mapping.annotate_icd_billing_met_dx(
    df_dx_mets=df_dx_mets,
    col_icd_billing=col_icd_billing,
    col_sex=col_sex,
)

# Build the annotation table from the ICD billing diagnoses and the clinical
# sample table (primary / metastatic site columns).
# NOTE(review): label_dist_ln=True presumably turns on distant lymph-node
# labelling -- confirm against OrganMappingAnalysisRND.
df_met_rdn_anno = obj_mapping.create_sample_to_icd_billing_met_mapping(
    df_dx=df_dx_mets,
    col_icd_billing=col_icd_billing,
    col_sex=col_sex,
    df_samples=df_samples,
    col_primary_site=col_prim_site,
    col_met_site=col_met_site,
    label_dist_ln=True,
)

In [14]:
df_met_rdn_anno.head()

Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,PRIMARY_SITE,METASTATIC_SITE,SEX,PRIMARY_SITE_RDN_MAP,AGE_AT_EVENT,AGE_AT_FIRST_RECUR,DX_YEAR,DX_DESCRIPTION,IS_MALIGNANT_PRIMARY,IS_ICDO_DX,IS_RECURRENT,IS_STAGE_IV_ICDO,IS_MET_ICD_BILLING,IS_MET_ICD_BILLING_NON_LN,IS_MET_ICD_BILLING_LN,IS_MET_ICD_BILLING_OTHER,IS_NON_CANCER,IS_INVALID_LABEL,CASE_STATUS_DESC,TM_DX_CONFRM_DESC,ICDO_IS_LOCAL,ICDO_IS_REGIONAL_GENERAL,ICDO_IS_DISTANT,ICDO_IS_IN_SITU,ICDO_IS_REGIONAL_LYMPH,ICDO_IS_REGIONAL_DIRECT_EXT,CATEGORY1,CATEGORY2,CATEGORY3,CATEGORY4,TM_CLIN_TNM_M,TM_PATH_TNM_M,TM_MSTAGE_DESC,TM_SITE_DESC,TM_CLIN_TNM_T,TM_TSTAGE_DESC,MSK Stage,PT_BIRTH_DTE,ICDO_IS_STAGE_NA,TM_LATERALITY_DESC,TUMOR_MSK_CLASS_OF_CASE,TM_TUMOR_SEQ_DESC,TM_CSMETDX,TM_PATH_TEXT,REASON_NO_DATA,IS_M1_PATH_ICDO,AGE_AT_TM_NON_CA_SURG_DATE_IN_DAYS,TM_PATH_TNM_T,TM_NSTAGE_DESC,Path Group,ICD-O Site Code,TM_CLIN_STG_GRP,AGE_AT_TM_CA_SURG_DATE_IN_DAYS,AJCC,Source,TM_RESID_TUMOR_DESC,ICDO_IS_UNSTAGED,ICD-O Site Desc,IS_M1_CLINICAL_ICDO,TM_MSK_STG,Clinical Group,TM_PATH_STG_GRP,TM_NON_CA_SURG_SUM_DESC,TM_AJCC,TM_CSMETEV,TM_SURG_TEXT,TM_AJCC_DESC,ICD-9/10 Dx Code,TM_CLIN_TNM_N,DATA_AVAILABLE_DDP,TM_TUMOR_SEQ,ICD-O Histology Code,TM_PATH_TNM_N,AGE_AT_FIRST_ICD9_DAYS,METASTATIC_SITE_RDN_MAP,LYMPH_SPREAD,LOCAL_EXTENSION,origin,hematogenous_grouping,METASTATIC_SITE_ONCOTREE_RDN,METASTATIC_SITE_BILLING_RDN
0,P-0000004-T01-IM3,P-0000004,Breast Cancer,Breast,,Female,chest_breast,14484.0,,,[u'197.7 - LIVER METS'],False,False,False,False,True,True,False,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1974-04-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,197.7,,True,,,,14484.0,abdomen_liver,,,primary&mets,LIVER,Liver,LIVER
1,P-0000004-T01-IM3,P-0000004,Breast Cancer,Breast,,Female,chest_breast,14484.0,,,[u'198.5 - BONE METS'],False,False,False,False,True,True,False,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1974-04-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,198.5,,True,,,,14484.0,undeterminate_bone,,,primary&mets,NON_PORTAL,Bone,BONE
2,P-0000012-T02-IM3,P-0000012,Breast Cancer,Breast,,Female,chest_breast,21138.0,,,[u'196.0 - LYMPH NODE METS-NECK'],False,False,False,False,True,False,True,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1955-11-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,196.0,,True,,,,21138.0,neck_node,DISTANT,,mets,NON_PORTAL,Distant Lymphatic,DIST_LYMPH
3,P-0000012-T02-IM3,P-0000012,Breast Cancer,Breast,,Female,chest_breast,21213.0,,,[u'196.1 - LYMPH NODE METS-INTRATHOR'],False,False,False,False,True,False,True,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1955-11-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,196.1,,True,,,,21213.0,mediastinum_node,DISTANT,,primary&mets,NON_PORTAL,Distant Lymphatic,DIST_LYMPH
4,P-0000012-T02-IM3,P-0000012,Breast Cancer,Breast,,Female,chest_breast,21500.0,,,[u'197.6 - MET CA-PERITONEUM'],False,False,False,False,True,True,False,False,False,False,,,False,False,False,False,False,False,Miscellaneous,Miscellaneous,Miscellaneous,Miscellaneous,,,,,,,,1955-11-11,False,,,,,,,False,,,,,,,,,Billing,,False,,False,,,,,,,,,197.6,,True,,,,21500.0,abdomen_peritoneum.serosa,,,primary&mets,PORTAL,Peritoneum,PERITONEUM


## Save annotations

In [19]:
## Save annotations
# Writes the annotation table as CSV without the DataFrame index.
# NOTE(review): `fname_save_anno` has no directory component, so the file is
# written relative to the current working directory, not to `path`.
df_met_rdn_anno.to_csv(fname_save_anno, index=False)

## Create Binary Matrix from Mapping

In [16]:
# Pivot the annotations into a per-sample table of HAS_MET_<site> indicator
# columns. The sample and patient id columns use the constants defined for
# them instead of repeated string literals.
df_binary = obj_mapping.create_binary_met_sites(
    df=df_met_rdn_anno,
    col_index=col_id,
    col_count=col_id2,
    col_met_site='METASTATIC_SITE_BILLING_RDN',
)


In [17]:
df_binary.head(100)

METASTATIC_SITE_BILLING_RDN,SAMPLE_ID,HAS_MET_ADRENAL_GLAND,HAS_MET_BILIARY_TRACT,HAS_MET_BLADDER_OR_URINARY_TRACT,HAS_MET_BONE,HAS_MET_BOWEL,HAS_MET_BREAST,HAS_MET_CNS_BRAIN,HAS_MET_DIST_LYMPH,HAS_MET_GENITAL_FEMALE,HAS_MET_GENITAL_MALE,HAS_MET_HEAD_AND_NECK,HAS_MET_KIDNEY,HAS_MET_LIVER,HAS_MET_LUNG,HAS_MET_LYMPH,HAS_MET_MEDIASTINUM,HAS_MET_OTHER,HAS_MET_OVARY,HAS_MET_PERIPHERAL_NERVOUS_SYSTEM,HAS_MET_PERITONEUM,HAS_MET_PLEURA,HAS_MET_SKIN
0,P-0000004-T01-IM3,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,P-0000012-T02-IM3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
2,P-0000012-T03-IM3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0
3,P-0000012-T04-IM6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0
4,P-0000015-T01-IM3,0,0,0,1,0,0,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1,0
5,P-0000023-T01-IM3,0,1,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,0,0,1,0,0
6,P-0000024-T01-IM3,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,1,0,0
7,P-0000025-T01-IM3,1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0
8,P-0000025-T02-IM5,1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0
9,P-0000025-T03-IM6,1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0
