# met_site_data_creation_clinical.py

### By Chris Fong - MSKCC 2020

This script will create a standardized table of metastatic sites for a given:
- Diagnosis table containing ICD billing codes and 
- Clinical sample table downloaded from the cohort tab in cBioPortal 

The script can compute whether a distant lymph node (LN) metastasis occurred for a patient, given their cancer type according to IMPACT data. Similarly, local extension is computed for visceral organs.

If a patient has multiple diagnoses (dx) from IMPACT, more than one row will exist for that patient; the extra rows should be removed.



## Load Libraries

In [1]:
import sys  
sys.path.insert(0, '../mappings')
sys.path.insert(0, '../analysis')
sys.path.insert(0, '../')
import os
import pandas as pd
import constants_o_sites as const
from organ_mapping_analysis import OrganMappingAnalysisRND
from organ_mapping_rdn_processing import MetastaticSpreadMappingRND

In [2]:
# Console settings
def set_debug_console():
    """Configure pandas console display options for wide tables.

    Sets process-wide pandas display options and returns nothing.
    """
    # 'display.width' used to be set twice (320, then 1000); only the last
    # value ever took effect, so it is now set exactly once.
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_rows', 250)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.expand_frame_repr', False)


set_debug_console()

## Load Data

### Define columns being used

In [3]:
# Input/output locations: demo inputs are read from `path`, and the final
# annotation table is written to `pathfilename_save`.
path = '../demo_data'
fname_save_anno = 'met_site_annotations_clinical.csv'
pathfilename_save = os.path.join(path, fname_save_anno)

# Standardized column names used throughout the notebook.
col_id, col_id2 = 'SAMPLE_ID', 'DMP_ID'    # sample-level / patient-level IDs
col_icd_billing = 'ICD-9/10 Dx Code'       # billing-code column of the diagnosis table
col_ct, col_sex = 'CANCER_TYPE', 'SEX'
col_prim_site, col_met_site = 'PRIMARY_SITE', 'METASTATIC_SITE'
col_sample_type = 'SAMPLE_TYPE'

### Load clinical sample file from cbioportal

In [4]:
# Read the tab-separated clinical data file downloaded from cBioPortal
fname = 'msk_impact_2017_clinical_data.tsv'
pathfilename1 = os.path.join(path, fname)
df_samples1 = pd.read_csv(
    pathfilename1,
    sep='\t',
    header=0,
    low_memory=False,
)

In [5]:
# Map the cBioPortal display headers onto the standardized column names
# (original note: "For genie, fix ids").
col_rep = {'Patient ID': col_id2,
           'Sample ID': col_id,
           'Sex': col_sex,
           'Cancer Type': col_ct,
           'Primary Tumor Site': col_prim_site,
           'Metastatic Site': col_met_site,
           'Sample Type': col_sample_type}
df_samples1 = df_samples1.rename(columns=col_rep)
# Take an explicit copy of the column subset so that later assignments on
# df_samples do not trigger pandas' SettingWithCopyWarning.
df_samples = df_samples1[list(col_rep.values())].copy()

In [6]:
# Preview the first rows of the renamed, column-subset sample table
df_samples.head()

Unnamed: 0,SAMPLE_ID,SEX,METASTATIC_SITE,CANCER_TYPE,PRIMARY_SITE,DMP_ID,SAMPLE_TYPE
0,P-0000004-T01-IM3,Female,,Breast Cancer,Breast,P-0000004,Primary
1,P-0000015-T01-IM3,Female,Liver,Breast Cancer,Breast,P-0000015,Metastasis
2,P-0000023-T01-IM3,Male,,Mesothelioma,Peritoneum,P-0000023,Primary
3,P-0000024-T01-IM3,Female,Lung,Endometrial Cancer,Uterus,P-0000024,Metastasis
4,P-0000025-T01-IM3,Female,,Endometrial Cancer,Uterus,P-0000025,Primary


### Load ICD Billing Patient Diagnosis Data 

In [7]:
# Read the comma-separated ICD billing diagnosis table (metastatic-site codes)
fname = 'table_diagnosis_clean_demo.csv'
pathfilename1 = os.path.join(path, fname)
df_dx_mets = pd.read_csv(
    pathfilename1,
    sep=',',
    header=0,
    low_memory=False,
)

In [8]:
# Preview the first rows of the ICD billing diagnosis table
df_dx_mets.head()

Unnamed: 0,DMP_ID,ICD-9/10 Dx Code,DX_DESCRIPTION,SEX
0,P-0000004,197.7,[u'197.7 - LIVER METS'],Female
1,P-0000004,198.5,[u'198.5 - BONE METS'],Female
2,P-0000012,196.0,[u'196.0 - LYMPH NODE METS-NECK'],
3,P-0000012,196.1,[u'196.1 - LYMPH NODE METS-INTRATHOR'],
4,P-0000012,197.6,[u'197.6 - MET CA-PERITONEUM'],


## Load Mapping Objects

### Load mapping tables

In [9]:
# Load the RDN mapping tables; file names come from constants_o_sites and the
# directory is const.pathname resolved relative to the parent folder.
obj_met_map_rdn = MetastaticSpreadMappingRND(path='../' + const.pathname,
                                             fname_all_sites=const.fname_mapping_rdn_all_sites,
                                             fname_hematogenous=const.fname_mapping_rdn_hematogenous,
                                             fname_localext=const.fname_mapping_rdn_localext,
                                             fname_lymphatic=const.fname_mapping_rdn_lymphatic,
                                             fname_site_map=const.fname_mapping_rdn_site_map,
                                             fname_billing_map=const.fname_mapping_rdn_billing_map,
                                             fname_billing_code_dict=const.fname_mapping_rdn_to_billing_codes)

Loading mapping tables


### Load mapping object

In [10]:
# Build the annotation/analysis object on top of the loaded RDN mapping tables
obj_mapping = OrganMappingAnalysisRND(obj_met_map=obj_met_map_rdn)

## Create Mapping based on Clinical Sample Table (Primary Site) and ICD Billing Codes (Metastatic Site)

In [11]:
# Annotate diagnosis table of ICD billings with renzo's mapping of metastatic sites
# NOTE(review): df_met_sites_dx is not referenced again in the visible part of
# this notebook — confirm whether it is still needed.
df_met_sites_dx = obj_mapping.annotate_icd_billing_met_dx(df_dx_mets=df_dx_mets, col_icd_billing=col_icd_billing, col_sex=col_sex)


# Create mapping on the clinical samples table from cBioPortal
df_met_rdn_anno = obj_mapping.create_sample_to_icd_billing_met_mapping(df_dx=df_dx_mets,
                                                                       col_icd_billing=col_icd_billing,
                                                                       col_sex=col_sex,
                                                                       df_samples=df_samples,
                                                                       col_primary_site=col_prim_site,
                                                                       col_met_site=col_met_site,
                                                                       label_dist_ln=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_dx_mets2.loc[log_sex_null, 'METASTATIC_SITE_RDN_MAP'] = pd.np.NaN


In [12]:
# Preview the first rows of the sample-to-metastatic-site annotation table
df_met_rdn_anno.head()

Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,PRIMARY_SITE,METASTATIC_SITE,SEX,PRIMARY_SITE_RDN_MAP,ICD-9/10 Dx Code,DX_DESCRIPTION,METASTATIC_SITE_RDN_MAP,LYMPH_SPREAD,LOCAL_EXTENSION,hematogenous_grouping,METASTATIC_SITE_ONCOTREE_RDN,METASTATIC_SITE_BILLING_RDN
0,P-0000004-T01-IM3,P-0000004,Breast Cancer,Breast,,Female,chest_breast,197.7,[u'197.7 - LIVER METS'],abdomen_liver,,,LIVER,Liver,LIVER
1,P-0000004-T01-IM3,P-0000004,Breast Cancer,Breast,,Female,chest_breast,198.5,[u'198.5 - BONE METS'],undeterminate_bone,,,NON_PORTAL,Bone,BONE
2,P-0000015-T01-IM3,P-0000015,Breast Cancer,Breast,Liver,Female,chest_breast,197.0,[u'197.0 - LUNG METS'],chest_lung,,,LUNG,Lung,LUNG
3,P-0000015-T01-IM3,P-0000015,Breast Cancer,Breast,Liver,Female,chest_breast,198.5,[u'198.5 - BONE METS'],undeterminate_bone,,,NON_PORTAL,Bone,BONE
4,P-0000015-T01-IM3,P-0000015,Breast Cancer,Breast,Liver,Female,chest_breast,197.2,[u'197.2 - MET CA-PLEURA'],chest_pleura.serosa,,,NON_PORTAL,Pleura,PLEURA


## Save annotations

In [13]:
# Write the annotation table to pathfilename_save as CSV, omitting the row index
df_met_rdn_anno.to_csv(pathfilename_save, index=False)

## Create Binary Matrix from Mapping

In [14]:
# Convert the annotation table into a per-sample binary matrix of metastatic sites
df_binary = obj_mapping.create_binary_met_sites(
    df=df_met_rdn_anno,
    col_index='SAMPLE_ID',
    col_count='DMP_ID',
    col_met_site='METASTATIC_SITE_BILLING_RDN',
)


In [15]:
# Show up to the first 100 rows of the binary metastatic-site matrix
df_binary.head(100)

METASTATIC_SITE_BILLING_RDN,SAMPLE_ID,HAS_MET_ADRENAL_GLAND,HAS_MET_BILIARY_TRACT,HAS_MET_BLADDER_OR_URINARY_TRACT,HAS_MET_BONE,HAS_MET_BOWEL,HAS_MET_BREAST,HAS_MET_CNS_BRAIN,HAS_MET_DIST_LYMPH,HAS_MET_GENITAL_FEMALE,HAS_MET_GENITAL_MALE,HAS_MET_HEAD_AND_NECK,HAS_MET_KIDNEY,HAS_MET_LIVER,HAS_MET_LUNG,HAS_MET_LYMPH,HAS_MET_MEDIASTINUM,HAS_MET_OTHER,HAS_MET_OVARY,HAS_MET_PERIPHERAL_NERVOUS_SYSTEM,HAS_MET_PERITONEUM,HAS_MET_PLEURA,HAS_MET_REGIONAL_LYMPH,HAS_MET_SKIN
0,P-0000004-T01-IM3,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,P-0000015-T01-IM3,0,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,1,1,0
2,P-0000023-T01-IM3,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0
3,P-0000024-T01-IM3,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,1,0
4,P-0000025-T01-IM3,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0
5,P-0000025-T02-IM5,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0
6,P-0000026-T01-IM3,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0
7,P-0000027-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0
8,P-0000030-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
9,P-0000034-T01-IM3,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0
