# metatrop_metastatic_sites_impact.py

### By Chris Fong - MSKCC 2020

This script will create a standardized table of metastatic sites for a given:
- Clinical sample table downloaded from the GENIE V9 dataset 

The script can compute whether a distant LN metastasis occurred for that patient, given their cancer type according to IMPACT data. Similarly, local extension is computed for visceral organs.

If a patient has multiple diagnoses (dx) in IMPACT, more than one row will exist for that patient, and the extra rows should be removed.



## Load Libraries

In [4]:
import sys  
sys.path.insert(0, '../mappings')
sys.path.insert(0, '../analysis')
sys.path.insert(0, '../')
import os
import pandas as pd
import constants_o_sites as const
from organ_mapping_analysis import OrganMappingAnalysisRND
from organ_mapping_rdn_processing import MetastaticSpreadMappingRND

In [5]:
# Console settings
def set_debug_console():
    """Raise pandas' display limits for interactive inspection.

    Applies the following options through ``pd.set_option`` (they stay in
    effect for the rest of the session):

    * ``display.width`` = 1000
    * ``display.max_rows`` = 250
    * ``display.max_columns`` = 500
    * ``display.expand_frame_repr`` = False

    ``display.width`` used to be assigned twice (320, then 1000); only the
    value that actually took effect (1000) is kept.
    """
    display_options = {
        'display.width': 1000,
        'display.max_rows': 250,
        'display.max_columns': 500,
        'display.expand_frame_repr': False,
    }
    for option, value in display_options.items():
        pd.set_option(option, value)

set_debug_console()

## Load Data

In [7]:
# Output locations: the first CSV receives the site annotations, the second
# receives the binary site matrix (see the to_csv calls further down).
path_save = '/Users/fongc2/Documents/github/MSK/clinical_data_mining/organ-site-mapping/metastatic_tropism_work'
pathfilename_save1, pathfilename_save2 = (
    os.path.join(path_save, csv_name)
    for csv_name in ('metatrop_met_site_annotations_impact_20201029.csv',
                     'metatrop_met_site_annotations_impact_binary_20201029.csv')
)
# fname_save_anno ends up holding the second (binary) file name
fname_save_anno = os.path.basename(pathfilename_save2)
print(pathfilename_save1, pathfilename_save2, sep='\n')

/Users/fongc2/Documents/github/MSK/clinical_data_mining/organ-site-mapping/metastatic_tropism_work/metatrop_met_site_annotations_impact_20201029.csv
/Users/fongc2/Documents/github/MSK/clinical_data_mining/organ-site-mapping/metastatic_tropism_work/metatrop_met_site_annotations_impact_binary_20201029.csv


##### Load IMPACT data from cbioportal to obtain primary and metastatic site info

In [10]:
# Read the MSK-IMPACT clinical sample table (comma-separated, header on the
# first row) into a DataFrame.
path = '/Users/fongc2/Documents/github/MSK/DARWIN_ETL/data/'
fname = 'mskimpact_clinical_data_sample.csv'
pathfilename1 = os.path.join(path, fname)
# low_memory=False: parse the file in one pass instead of in chunks
df_samples1 = pd.read_csv(
    pathfilename1,
    sep=',',
    header=0,
    low_memory=False,
)

In [11]:
df_samples1.head()

Unnamed: 0,STUDY_ID,DMP_ID,SAMPLE_ID,ARCHER_PANEL,CANCER_TYPE,CANCER_TYPE_DETAILED,MUTATION_RATE,DATE_ADDED,ETHNICITY,GENE_PANEL,METASTATIC_SITE,MSI_SCORE,MSI_TYPE,MUTATION_COUNT,ONCOTREE_CODE,12_245_Part_A_Consented,12_245_Part_C_Consented,PRIMARY_SITE,RACE,SAMPLE_CLASS,NUM_IMPACT_SAMPLES,SAMPLE_COVERAGE,SAMPLE_TYPE,GENDER,SOMATIC_STATUS,SO_COMMENTS,WEEK_ADDED
0,mskimpact,P-0000004,P-0000004-T01-IM3,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,4.5,2015/04/07,Non-Spanish; Non-Hispanic,IMPACT341,,2.5,Stable,4,IDC,YES,NO,Breast,WHITE,Tumor,1,428.0,Primary,Female,Matched,,"2015, Wk. 15"
1,mskimpact,P-0000012,P-0000012-T02-IM3,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,1.1,2015/04/07,Non-Spanish; Non-Hispanic,IMPACT341,,4.1,Indeterminate,1,IDC,YES,NO,Breast,WHITE,Tumor,3,344.0,Primary,Female,Matched,,"2015, Wk. 15"
2,mskimpact,P-0000012,P-0000012-T03-IM3,NO,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,33.5,2015/04/07,Non-Spanish; Non-Hispanic,IMPACT341,Neck,0.47,Stable,30,LUAD,YES,NO,Lung,WHITE,Tumor,3,428.0,Metastasis,Female,Matched,,"2015, Wk. 15"
3,mskimpact,P-0000012,P-0000012-T04-IM6,YES,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,7.9,2018/08/01,Non-Spanish; Non-Hispanic,IMPACT468,,0.2,Stable,9,LUAD,YES,NO,Lung,WHITE,Tumor,3,713.0,Primary,Female,Matched,Note: The mutations and copy number profile su...,"2018, Wk. 31"
4,mskimpact,P-0000015,P-0000015-T01-IM3,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,7.8,2015/04/07,Non-Spanish; Non-Hispanic,IMPACT341,Liver,2.55,Stable,7,IDC,YES,NO,Breast,WHITE,Tumor,1,281.0,Metastasis,Female,Matched,,"2015, Wk. 15"


##### Create dataframe for metastatic sites only from clinical sample file

In [13]:
# Standardize column names, keep only the columns needed for site mapping,
# and restrict the table to metastatic samples.
col_id = 'SAMPLE_ID'
col_id2 = 'DMP_ID'
col_ct = 'CANCER_TYPE'
col_sex = 'SEX'
col_prim_site = 'PRIMARY_SITE'
col_met_site = 'METASTATIC_SITE'
col_sample_type = 'SAMPLE_TYPE'
# Rename map from alternative source headers to the standardized names.
# DataFrame.rename ignores keys that are not present, so for the table shown
# above only 'GENDER' -> 'SEX' appears to take effect.
# NOTE(review): the title-case keys look like headers from a different
# export format -- confirm they are still needed.
col_rep = {'Patient ID': col_id2,
           'Sample ID': col_id,
           'GENDER': col_sex,
           'Cancer Type': col_ct,
           'Primary Tumor Site': col_prim_site,
           'Metastatic Site': col_met_site,
           'Sample Type': col_sample_type}
df_samples1 = df_samples1.rename(columns=col_rep)
df_samples = df_samples1[list(col_rep.values())]
# Explicit copy: the filtered frame is handed to downstream annotation code,
# which can then add or modify columns without acting on a slice of
# df_samples (avoids pandas' SettingWithCopyWarning).
df_metatrop_met = df_samples[df_samples[col_sample_type] == 'Metastasis'].copy()

In [14]:
df_metatrop_met.head()
# df_metatrop_met.shape

Unnamed: 0,DMP_ID,SAMPLE_ID,SEX,CANCER_TYPE,PRIMARY_SITE,METASTATIC_SITE,SAMPLE_TYPE
2,P-0000012,P-0000012-T03-IM3,Female,Non-Small Cell Lung Cancer,Lung,Neck,Metastasis
4,P-0000015,P-0000015-T01-IM3,Female,Breast Cancer,Breast,Liver,Metastasis
6,P-0000024,P-0000024-T01-IM3,Female,Endometrial Cancer,Uterus,Lung,Metastasis
8,P-0000025,P-0000025-T02-IM5,Female,Endometrial Cancer,Uterus,Peritoneum,Metastasis
9,P-0000025,P-0000025-T03-IM6,Female,Endometrial Cancer,Uterus,Liver,Metastasis


## Load Mapping Objects

### Load mapping tables

In [15]:
# Build the RDN metastatic-spread mapping object. The mapping folder is
# resolved one directory above the working directory; every table file name
# comes from constants_o_sites.
path_mappings = '../' + const.pathname
obj_met_map_rdn = MetastaticSpreadMappingRND(
    path=path_mappings,
    fname_all_sites=const.fname_mapping_rdn_all_sites,
    fname_hematogenous=const.fname_mapping_rdn_hematogenous,
    fname_localext=const.fname_mapping_rdn_localext,
    fname_lymphatic=const.fname_mapping_rdn_lymphatic,
    fname_site_map=const.fname_mapping_rdn_site_map,
    fname_billing_map=const.fname_mapping_rdn_billing_map,
    fname_billing_code_dict=const.fname_mapping_rdn_to_billing_codes,
)

Loading mapping tables


In [16]:
const.pathname

'mappings'

### Load mapping object

In [17]:
# Load annotations object: built on the RDN mapping object created above; its
# methods are used below to annotate samples and build the binary site matrix.
obj_mapping = OrganMappingAnalysisRND(obj_met_map=obj_met_map_rdn)

## Create Mapping on Clinical Samples Table from cBioPortal

In [18]:
# Annotate the metastatic IMPACT samples with RDN site mappings, using the
# primary-site and metastatic-site columns selected earlier.
# NOTE(review): label_dist_ln=True presumably turns on distant lymph-node
# labelling -- confirm against OrganMappingAnalysisRND.
df_met_sites_impact = obj_mapping.annotate_mapping_impact_met_samples(
    df_samples=df_metatrop_met,
    col_primary_site=col_prim_site,
    col_met_site=col_met_site,
    label_dist_ln=True,
)

df_met_sites_impact.head()

Unnamed: 0,DMP_ID,SAMPLE_ID,SEX,CANCER_TYPE,PRIMARY_SITE,METASTATIC_SITE,SAMPLE_TYPE,PRIMARY_SITE_RDN_MAP,PRIMARY_SITE_RDN_MAP_MAIN,PRIMARY_SITE_RDN_MAP_SECONDARY,METASTATIC_SITE_RDN_MAP,METASTATIC_SITE_RDN_MAP_MAIN,METASTATIC_SITE_RDN_MAP_SECONDARY,LYMPH_SPREAD,LOCAL_EXTENSION,hematogenous_grouping,METASTATIC_SITE_ONCOTREE_RDN,METASTATIC_SITE_BILLING_RDN
0,P-0000012,P-0000012-T03-IM3,Female,Non-Small Cell Lung Cancer,Lung,Neck,Metastasis,chest_lung,chest,lung,neck_unknown,neck,unknown,,,NON_PORTAL,Head and Neck,HEAD_AND_NECK
1,P-0000015,P-0000015-T01-IM3,Female,Breast Cancer,Breast,Liver,Metastasis,chest_breast,chest,breast,abdomen_liver,abdomen,liver,,,LIVER,Liver,LIVER
2,P-0000024,P-0000024-T01-IM3,Female,Endometrial Cancer,Uterus,Lung,Metastasis,pelvis_uterus,pelvis,uterus,chest_lung,chest,lung,,,LUNG,Lung,LUNG
3,P-0000025,P-0000025-T02-IM5,Female,Endometrial Cancer,Uterus,Peritoneum,Metastasis,pelvis_uterus,pelvis,uterus,abdomen_peritoneum.serosa,abdomen,peritoneum.serosa,,LOCAL,PORTAL,Peritoneum,PERITONEUM
4,P-0000025,P-0000025-T03-IM6,Female,Endometrial Cancer,Uterus,Liver,Metastasis,pelvis_uterus,pelvis,uterus,abdomen_liver,abdomen,liver,,,LIVER,Liver,LIVER


In [19]:
# Save RDN annotations to the first output CSV; index=False leaves the
# DataFrame index out of the file.
df_met_sites_impact.to_csv(pathfilename_save1, index=False)

## Create binary matrix from mapping

In [20]:
# Build the per-sample site matrix from the annotated table, keyed on
# SAMPLE_ID and spread over the billing-level RDN site column (the output
# below shows the resulting HAS_MET_* columns).
df_binary = obj_mapping.create_binary_met_sites(
    df=df_met_sites_impact,
    col_index='SAMPLE_ID',
    col_count='DMP_ID',
    col_met_site='METASTATIC_SITE_BILLING_RDN',
)

In [21]:
df_binary.head(50)

METASTATIC_SITE_BILLING_RDN,SAMPLE_ID,HAS_MET_ADRENAL_GLAND,HAS_MET_BILIARY_TRACT,HAS_MET_BLADDER_OR_URINARY_TRACT,HAS_MET_BONE,HAS_MET_BOWEL,HAS_MET_BREAST,HAS_MET_CNS_BRAIN,HAS_MET_DIST_LYMPH,HAS_MET_GENITAL_FEMALE,HAS_MET_GENITAL_MALE,HAS_MET_HEAD_AND_NECK,HAS_MET_KIDNEY,HAS_MET_LIVER,HAS_MET_LUNG,HAS_MET_LYMPH,HAS_MET_MEDIASTINUM,HAS_MET_OTHER,HAS_MET_OVARY,HAS_MET_PERITONEUM,HAS_MET_PLEURA,HAS_MET_REGIONAL_LYMPH,HAS_MET_SKIN
0,P-0000012-T03-IM3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,P-0000015-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,P-0000024-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,P-0000025-T02-IM5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,P-0000025-T03-IM6,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
5,P-0000026-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,P-0000030-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,P-0000037-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8,P-0000043-T02-IM3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9,P-0000058-T01-IM3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


## Save annotations

In [22]:
# Save the binary met-site matrix to the second output CSV; index=False leaves
# the DataFrame index out of the file.
df_binary.to_csv(pathfilename_save2, index=False)

In [14]:
# Fraction of rows in df_binary flagged for each HAS_* column, sorted from
# most to least frequent.
is_site_flag = df_binary.columns.str.contains('HAS_')
cols = df_binary.columns[is_site_flag].tolist()
site_counts = df_binary[cols].sum()
n_samples = len(df_binary)
(site_counts / n_samples).sort_values(ascending=False)

METASTATIC_SITE_BILLING_RDN
HAS_MET_LIVER                       0.229387
HAS_MET_LYMPH                       0.218182
HAS_MET_LUNG                        0.119873
HAS_MET_OTHER                       0.110359
HAS_MET_BONE                        0.076744
HAS_MET_PERITONEUM                  0.045455
HAS_MET_CNS_BRAIN                   0.041015
HAS_MET_PLEURA                      0.036998
HAS_MET_BOWEL                       0.021776
HAS_MET_SKIN                        0.019027
HAS_MET_OVARY                       0.017548
HAS_MET_ADRENAL_GLAND               0.012474
HAS_MET_MEDIASTINUM                 0.009725
HAS_MET_HEAD_AND_NECK               0.008457
HAS_MET_BLADDER_OR_URINARY_TRACT    0.006765
HAS_MET_REGIONAL_LYMPH              0.006554
HAS_MET_GENITAL_FEMALE              0.005920
HAS_MET_BILIARY_TRACT               0.004228
HAS_MET_DIST_LYMPH                  0.003805
HAS_MET_KIDNEY                      0.003805
HAS_MET_BREAST                      0.003594
HAS_MET_GENITAL_MALE       