# metatrop_metastatic_sites_combined.py

### By Chris Fong - MSKCC 2020

This script will create a standardized table of metastatic sites for for a given 
- Diagnosis table containing ICD billing codes and 
- Clinical sample table downloaded from the GENIE V9 dataset 

The script can compute if a distant LN metastasis occurred for that patient, given their cancer type according to IMPACT data. Similarly, local extension is computed for viseral organs.

If a patient has multiple dx from IMPACT, more than 1 row will exist for that patient, and should be removed.



## Load Libraries

In [1]:
import sys  
sys.path.insert(0, '../mappings')
sys.path.insert(0, '../analysis')
sys.path.insert(0, '../')
import os
import pandas as pd
import constants_o_sites as const
from organ_mapping_analysis import OrganMappingAnalysisRND
from organ_mapping_rdn_processing import MetastaticSpreadMappingRND

In [2]:
# Console settings
def set_debug_console():
    desired_width = 320
    pd.set_option('display.width', desired_width)
    pd.set_option('display.max_rows', 250)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.set_option('display.expand_frame_repr', False)

set_debug_console()

## Load Data

##### Define columns being used

In [3]:
## Load Data
fname_save_anno = 'metatrop_met_site_annotations_clinical_and_impact_20201104.csv'
path_save = '/Users/fongc2/Documents/github/MSK/clinical_data_mining/organ-site-mapping/metastatic_tropism_work'
pathfilename_save = os.path.join(path_save, fname_save_anno)

# File for ICD billing
col_icd_billing = 'ICD-9/10 Dx Code'
col_icd_billing_desc = 'Diagnosis'
col_id = 'SAMPLE_ID'
col_id2 = 'DMP_ID'
# File for cBioPortal data
col_ct = 'CANCER_TYPE'
col_prim_site = 'PRIMARY_SITE'
col_met_site = 'METASTATIC_SITE'
col_sample_type = 'SAMPLE_TYPE'
col_sex = 'SEX'
col_date = 'Diagnosis Date'

##### Load GENIE data from cbioportal 

In [4]:
fname = 'genie_private_clinical_data.tsv'
path = '/Users/fongc2/Documents/github/MSK/DARWIN_ETL/data/'
pathfilename1 = os.path.join(path, fname)
df_samples1 = pd.read_csv(pathfilename1, header=0, low_memory=False, sep='\t')

##### Load IMPACT data from cbioportal to obtain primary and metastatic site info

In [5]:
fname = 'mskimpact_clinical_data.tsv'
path = '/Users/fongc2/Documents/github/MSK/DARWIN_ETL/data/'
pathfilename1 = os.path.join(path, fname)
df_impact = pd.read_csv(pathfilename1, header=0, low_memory=False, sep='\t')

##### Load ICD Billing Patient Diagnosis Data 

In [6]:
# Load met sites directly from diagnosis
fname = 'table_diagnosis_clean.csv'
path = '/Users/fongc2/Documents/github/MSK/DARWIN_ETL/data/'
pathfilename1 = os.path.join(path, fname)
df_dx = pd.read_csv(pathfilename1, header=0, low_memory=False, sep=',')

In [7]:
list(df_dx.columns)

['DMP_ID',
 'P_ID',
 'ICD-9/10 Dx Code',
 'ICD-O Site Code',
 'ICD-O Site Desc',
 'ICD-O Histology Code',
 'ICD-O Histology Desc',
 'Clinical Group',
 'Path Group',
 'AJCC',
 'Summary',
 'MSK Stage',
 'Diagnosis Type',
 'CATEGORY1',
 'CATEGORY2',
 'CATEGORY3',
 'CATEGORY4',
 'Source',
 'Diagnosis',
 'Diagnosis Date',
 'REASON_NO_DATA',
 'DATA_AVAILABLE_DDP',
 'IS_MET_ICD_BILLING',
 'IS_MET_ICD_BILLING_OTHER',
 'IS_MET_ICD_BILLING_LN',
 'IS_MET_ICD_BILLING_NON_LN',
 'IS_NON_CANCER',
 'IS_INVALID_LABEL',
 'CASE_STATUS_DESC',
 'IS_ICDO_DX',
 'IS_STAGE_IV_ICDO',
 'ICDO_IS_LOCAL',
 'ICDO_IS_REGIONAL_GENERAL',
 'ICDO_IS_DISTANT',
 'ICDO_IS_IN_SITU',
 'ICDO_IS_REGIONAL_LYMPH',
 'ICDO_IS_REGIONAL_DIRECT_EXT',
 'ICDO_IS_UNSTAGED',
 'ICDO_IS_STAGE_NA',
 'IS_MALIGNANT_PRIMARY']

## Clean Data

### Clean clinical sample file from cbioportal

##### Fix header lables

In [8]:
# Headers to replace
col_rep = {'Patient ID': col_id2,
           'Sample ID': col_id,
           'Sex': col_sex,
           'Cancer Type': col_ct,
           'Primary Tumor Site': col_prim_site,
           'Metastatic Site': col_met_site,
           'Sample Type': col_sample_type}

In [9]:
# Genie data
df_samples1 = df_samples1.rename(columns=col_rep)
cols_genie = set.intersection(set(list(col_rep.values())), set(df_samples1.columns))
cols_genie = [x for x in df_samples1.columns if x in cols_genie]
df_samples = df_samples1[cols_genie].copy()
df_samples[col_id] = df_samples[col_id].str.replace('GENIE-MSK-', '')
df_samples[col_id2] = df_samples[col_id2].str.replace('GENIE-MSK-', '')

In [10]:
df_samples.head(2)

Unnamed: 0,DMP_ID,SAMPLE_ID,CANCER_TYPE,SAMPLE_TYPE,SEX
0,P-0000004,P-0000004-T01-IM3,Breast Cancer,Primary,Female
1,P-0000012,P-0000012-T02-IM3,Breast Cancer,Primary,Female


In [11]:
# IMPACT data
df_impact1 = df_impact.rename(columns=col_rep)
df_samples_impact = df_impact1[[col_id, col_prim_site, col_met_site]]

In [12]:
df_samples_impact.head()

Unnamed: 0,SAMPLE_ID,PRIMARY_SITE,METASTATIC_SITE
0,P-0000004-T01-IM3,Breast,
1,P-0000012-T02-IM3,Breast,
2,P-0000012-T03-IM3,Lung,Neck
3,P-0000012-T04-IM6,Lung,
4,P-0000015-T01-IM3,Breast,Liver


##### Merge GENIE and IMPACT data sets

In [13]:
df_samples_f = df_samples.merge(right=df_samples_impact, how='left', on='SAMPLE_ID')

##### Create dataframe for metastatic sites only from clinical sample file

In [14]:
df_metatrop_met = df_samples_f[df_samples_f[col_sample_type] == 'Metastasis']


##### Add columns from ICD billing table (dx date, dx description, ICD code)

In [15]:
df_metatrop_met[col_date] = pd.np.NaN
df_metatrop_met[col_icd_billing] = pd.np.NaN
df_metatrop_met[col_icd_billing_desc] = 'pathology'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
df_metatrop_met.head()

Unnamed: 0,DMP_ID,SAMPLE_ID,CANCER_TYPE,SAMPLE_TYPE,SEX,PRIMARY_SITE,METASTATIC_SITE,Diagnosis Date,ICD-9/10 Dx Code,Diagnosis
2,P-0000012,P-0000012-T03-IM3,Non-Small Cell Lung Cancer,Metastasis,Female,Lung,Neck,,,pathology
4,P-0000015,P-0000015-T01-IM3,Breast Cancer,Metastasis,Female,Breast,Liver,,,pathology
6,P-0000024,P-0000024-T01-IM3,Endometrial Cancer,Metastasis,Female,Uterus,Lung,,,pathology
8,P-0000025,P-0000025-T02-IM5,Endometrial Cancer,Metastasis,Female,Uterus,Peritoneum,,,pathology
9,P-0000025,P-0000025-T03-IM6,Endometrial Cancer,Metastasis,Female,Uterus,Liver,,,pathology


The table above is now in the required format for mapping metastatic sites from IMPACT free text notations to a standard naming convention.

### Clean clinical diagnosis data from Darwin

##### Filter by metastatic ICD billing codes and select relevant columns

In [17]:
cols_select = [col_id2, col_date, col_icd_billing, col_icd_billing_desc, 'IS_MET_ICD_BILLING']
df_dx_mets1 = df_dx.loc[df_dx['IS_MET_ICD_BILLING'] == True, cols_select]


In [18]:
df_dx_mets1.head()

Unnamed: 0,DMP_ID,Diagnosis Date,ICD-9/10 Dx Code,Diagnosis,IS_MET_ICD_BILLING
1,P-0000004,2013-12-06,197.7,['197.7 - LIVER METS'],True
2,P-0000004,2013-12-06,198.5,['198.5 - BONE METS'],True
6,P-0000012,2013-09-25,196.0,['196.0 - LYMPH NODE METS-NECK'],True
8,P-0000012,2013-12-09,196.1,['196.1 - LYMPH NODE METS-INTRATHOR'],True
9,P-0000012,2014-09-22,197.6,['197.6 - MET CA-PERITONEUM'],True


##### Merge with info on gender/sex

In [19]:
df_dx_mets = df_dx_mets1.merge(right=df_samples_f[[col_id2, col_sex, col_prim_site, col_met_site]], how='left', on=col_id2)
df_dx_mets.head()

Unnamed: 0,DMP_ID,Diagnosis Date,ICD-9/10 Dx Code,Diagnosis,IS_MET_ICD_BILLING,SEX,PRIMARY_SITE,METASTATIC_SITE
0,P-0000004,2013-12-06,197.7,['197.7 - LIVER METS'],True,Female,Breast,
1,P-0000004,2013-12-06,198.5,['198.5 - BONE METS'],True,Female,Breast,
2,P-0000012,2013-09-25,196.0,['196.0 - LYMPH NODE METS-NECK'],True,Female,Breast,
3,P-0000012,2013-09-25,196.0,['196.0 - LYMPH NODE METS-NECK'],True,Female,Lung,Neck
4,P-0000012,2013-09-25,196.0,['196.0 - LYMPH NODE METS-NECK'],True,Female,Lung,


## Map IMPACT and Darwin Clinical Diagnoses Data to Standard Organ Sites

### Load Mapping Objects

##### Load mapping tables

In [20]:
## Load Mapping Objects### Load mapping tables# Load RDN mapping
obj_met_map_rdn = MetastaticSpreadMappingRND(path='../' + const.pathname,
                                             fname_all_sites=const.fname_mapping_rdn_all_sites,
                                             fname_hematogenous=const.fname_mapping_rdn_hematogenous,
                                             fname_localext=const.fname_mapping_rdn_localext,
                                             fname_lymphatic=const.fname_mapping_rdn_lymphatic,
                                             fname_site_map=const.fname_mapping_rdn_site_map,
                                             fname_billing_map=const.fname_mapping_rdn_billing_map,
                                             fname_billing_code_dict=const.fname_mapping_rdn_to_billing_codes)

Loading mapping tables


##### Load mapping object

In [21]:
### Load mapping object# Load annoations object
obj_mapping = OrganMappingAnalysisRND(obj_met_map=obj_met_map_rdn)

### Run mapping objects

##### Create Mapping based on Clinical Sample Table (Primary Site) and ICD Billing Codes (Metastatic Site)

In [22]:
# Annotate diagnosis table of ICD billings with renzo's mapping of metastatic sites
df_met_sites_dx = obj_mapping.annotate_icd_billing_met_dx(df_dx_mets=df_dx_mets, col_icd_billing=col_icd_billing, col_sex=col_sex)


df_met_rdn_anno = obj_mapping.create_sample_to_icd_billing_met_mapping(df_dx=df_dx_mets,
                                                                       col_icd_billing=col_icd_billing,
                                                                       col_sex=col_sex,
                                                                       df_samples=df_samples_f,
                                                                       col_primary_site=col_prim_site,
                                                                       col_met_site=col_met_site,
                                                                       label_dist_ln=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [23]:
df_met_rdn_anno.head(3)

Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,PRIMARY_SITE_x,METASTATIC_SITE_x,SEX,PRIMARY_SITE_RDN_MAP,Diagnosis Date,ICD-9/10 Dx Code,Diagnosis,IS_MET_ICD_BILLING,PRIMARY_SITE_y,METASTATIC_SITE_y,METASTATIC_SITE_RDN_MAP,LYMPH_SPREAD,LOCAL_EXTENSION,hematogenous_grouping,METASTATIC_SITE_ONCOTREE_RDN,METASTATIC_SITE_BILLING_RDN
0,P-0000004-T01-IM3,P-0000004,Breast Cancer,Breast,,Female,chest_breast,2013-12-06,197.7,['197.7 - LIVER METS'],True,Breast,,abdomen_liver,,,LIVER,Liver,LIVER
1,P-0000004-T01-IM3,P-0000004,Breast Cancer,Breast,,Female,chest_breast,2013-12-06,198.5,['198.5 - BONE METS'],True,Breast,,undeterminate_bone,,,NON_PORTAL,Bone,BONE
2,P-0000012-T02-IM3,P-0000012,Breast Cancer,Breast,,Female,chest_breast,2013-09-25,196.0,['196.0 - LYMPH NODE METS-NECK'],True,Breast,,neck_node,DISTANT,,NON_PORTAL,Distant Lymphatic,DIST_LYMPH


##### Create Mapping on GENIE's Clinical Samples Table from cBioPortal

In [24]:
# Annotate IMPACT sample site data
df_met_sites_impact = obj_mapping.annotate_mapping_impact_met_samples(df_samples=df_metatrop_met, 
                                                                      col_primary_site=col_prim_site, 
                                                                      col_met_site=col_met_site, 
                                                                      label_dist_ln=True)

df_met_sites_impact.head(3)

Unnamed: 0,DMP_ID,SAMPLE_ID,CANCER_TYPE,SAMPLE_TYPE,SEX,PRIMARY_SITE,METASTATIC_SITE,Diagnosis Date,ICD-9/10 Dx Code,Diagnosis,PRIMARY_SITE_RDN_MAP,PRIMARY_SITE_RDN_MAP_MAIN,PRIMARY_SITE_RDN_MAP_SECONDARY,METASTATIC_SITE_RDN_MAP,METASTATIC_SITE_RDN_MAP_MAIN,METASTATIC_SITE_RDN_MAP_SECONDARY,LYMPH_SPREAD,LOCAL_EXTENSION,hematogenous_grouping,METASTATIC_SITE_ONCOTREE_RDN,METASTATIC_SITE_BILLING_RDN
0,P-0000012,P-0000012-T03-IM3,Non-Small Cell Lung Cancer,Metastasis,Female,Lung,Neck,,,pathology,chest_lung,chest,lung,neck_unknown,neck,unknown,,,NON_PORTAL,Head and Neck,HEAD_AND_NECK
1,P-0000015,P-0000015-T01-IM3,Breast Cancer,Metastasis,Female,Breast,Liver,,,pathology,chest_breast,chest,breast,abdomen_liver,abdomen,liver,,,LIVER,Liver,LIVER
2,P-0000024,P-0000024-T01-IM3,Endometrial Cancer,Metastasis,Female,Uterus,Lung,,,pathology,pelvis_uterus,pelvis,uterus,chest_lung,chest,lung,,,LUNG,Lung,LUNG


### Combine mapping from ICD billing and IMPACT clinical samples

In [25]:
cols_keep1 = list(set.intersection(set(df_met_rdn_anno.columns), set(df_met_sites_impact.columns)))

cols_keep = [x for x in list(df_met_rdn_anno.columns) if x in cols_keep1]
cols_keep

['SAMPLE_ID',
 'DMP_ID',
 'CANCER_TYPE',
 'SEX',
 'PRIMARY_SITE_RDN_MAP',
 'Diagnosis Date',
 'ICD-9/10 Dx Code',
 'Diagnosis',
 'METASTATIC_SITE_RDN_MAP',
 'LYMPH_SPREAD',
 'LOCAL_EXTENSION',
 'hematogenous_grouping',
 'METASTATIC_SITE_ONCOTREE_RDN',
 'METASTATIC_SITE_BILLING_RDN']

In [26]:
# Subset dataframes
df_met_sites_impact2 = df_met_sites_impact[cols_keep]
df_met_rdn_anno1 = df_met_rdn_anno[cols_keep]

# Add source annotation
df_met_sites_impact2 = df_met_sites_impact2.assign(SOURCE='Pathology')
df_met_rdn_anno1 = df_met_rdn_anno1.assign(SOURCE='ICD Billing')

df_f = pd.concat([df_met_sites_impact2, df_met_rdn_anno1], axis=0, sort=False)

df_f

Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,SEX,PRIMARY_SITE_RDN_MAP,Diagnosis Date,ICD-9/10 Dx Code,Diagnosis,METASTATIC_SITE_RDN_MAP,LYMPH_SPREAD,LOCAL_EXTENSION,hematogenous_grouping,METASTATIC_SITE_ONCOTREE_RDN,METASTATIC_SITE_BILLING_RDN,SOURCE
0,P-0000012-T03-IM3,P-0000012,Non-Small Cell Lung Cancer,Female,chest_lung,,,pathology,neck_unknown,,,NON_PORTAL,Head and Neck,HEAD_AND_NECK,Pathology
1,P-0000015-T01-IM3,P-0000015,Breast Cancer,Female,chest_breast,,,pathology,abdomen_liver,,,LIVER,Liver,LIVER,Pathology
2,P-0000024-T01-IM3,P-0000024,Endometrial Cancer,Female,pelvis_uterus,,,pathology,chest_lung,,,LUNG,Lung,LUNG,Pathology
3,P-0000025-T02-IM5,P-0000025,Endometrial Cancer,Female,pelvis_uterus,,,pathology,abdomen_peritoneum.serosa,,LOCAL,PORTAL,Peritoneum,PERITONEUM,Pathology
4,P-0000025-T03-IM6,P-0000025,Endometrial Cancer,Female,pelvis_uterus,,,pathology,abdomen_liver,,,LIVER,Liver,LIVER,Pathology
5,P-0000026-T01-IM3,P-0000026,Endometrial Cancer,Female,pelvis_uterus,,,pathology,pelvis_unknown,,,NON_PORTAL,,OTHER,Pathology
6,P-0000030-T01-IM3,P-0000030,Non-Small Cell Lung Cancer,Female,chest_lung,,,pathology,undeterminate_node,,,NON_PORTAL,Lymphatic,LYMPH,Pathology
7,P-0000037-T01-IM3,P-0000037,Hepatobiliary Cancer,Male,abdomen_liver,,,pathology,abdomen_liver,,LOCAL,LIVER,Liver,LIVER,Pathology
8,P-0000043-T02-IM3,P-0000043,Bladder Cancer,Unknown,pelvis_bladder,,,pathology,abdomen_liver,,,LIVER,Liver,LIVER,Pathology
9,P-0000058-T01-IM3,P-0000058,Breast Cancer,Female,chest_breast,,,pathology,abdomen_liver,,,LIVER,Liver,LIVER,Pathology


In [27]:
df_f[df_f['METASTATIC_SITE_BILLING_RDN'].isnull() & (df_f['SOURCE'] == 'Billing')]

Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,SEX,PRIMARY_SITE_RDN_MAP,Diagnosis Date,ICD-9/10 Dx Code,Diagnosis,METASTATIC_SITE_RDN_MAP,LYMPH_SPREAD,LOCAL_EXTENSION,hematogenous_grouping,METASTATIC_SITE_ONCOTREE_RDN,METASTATIC_SITE_BILLING_RDN,SOURCE


## Save annotations

In [28]:
## Save annotations
df_f.to_csv(fname_save_anno, index=False)

## Create Binary Matrix from Mapping

In [29]:
df_binary = obj_mapping.create_binary_met_sites(df=df_f, col_index='SAMPLE_ID', col_count='DMP_ID', col_met_site='METASTATIC_SITE_BILLING_RDN')


In [30]:
df_binary.head(100)

METASTATIC_SITE_BILLING_RDN,SAMPLE_ID,HAS_MET_ADRENAL_GLAND,HAS_MET_BILIARY_TRACT,HAS_MET_BLADDER_OR_URINARY_TRACT,HAS_MET_BONE,HAS_MET_BOWEL,HAS_MET_BREAST,HAS_MET_CNS_BRAIN,HAS_MET_DIST_LYMPH,HAS_MET_GENITAL_FEMALE,HAS_MET_GENITAL_MALE,HAS_MET_HEAD_AND_NECK,HAS_MET_KIDNEY,HAS_MET_LIVER,HAS_MET_LUNG,HAS_MET_LYMPH,HAS_MET_MEDIASTINUM,HAS_MET_OTHER,HAS_MET_OVARY,HAS_MET_PERIPHERAL_NERVOUS_SYSTEM,HAS_MET_PERITONEUM,HAS_MET_PLEURA,HAS_MET_REGIONAL_LYMPH,HAS_MET_SKIN
0,P-0000004-T01-IM3,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,P-0000012-T02-IM3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
2,P-0000012-T03-IM3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0
3,P-0000012-T04-IM6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0
4,P-0000015-T01-IM3,0,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,1,1,0
5,P-0000023-T01-IM3,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0
6,P-0000024-T01-IM3,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0
7,P-0000025-T01-IM3,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0
8,P-0000025-T02-IM5,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0
9,P-0000025-T03-IM6,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0
