# met_site_data_creation_impact_scaled.py

### By Chris Fong - MSKCC 2022

This script will create a standardized table of metastatic sites for a given:
- Clinical sample table downloaded from the cohort tab in cBioPortal 

This script will leverage RDN's metastatic mapping from IMPACT patient cancer types to distant, lymphatic, and regional metastatic disease

This script will ONLY cover metastatic samples from the IMPACT cohort. Not all metastatic disease sites are covered



## Load Libraries

In [1]:
import sys  
sys.path.insert(0, '../mappings')
sys.path.insert(0, '../analysis')
sys.path.insert(0, '../')
import os
import pandas as pd
import constants_o_sites as const
from organ_mapping_analysis import OrganMappingAnalysisRND
from organ_mapping_rdn_processing import MetastaticSpreadMappingRND

from minio import Minio
import urllib3
sys.path.insert(0, '/mind_data/fongc2/data-curation/')
from utils import set_debug_console, print_df_without_index, drop_cols, mrn_zero_pad, convert_to_int

from dotenv import load_dotenv, find_dotenv, dotenv_values

from pathlib import Path


In [2]:
# Apply the project's console/debug settings via the helper imported from `utils`
# (presumably pandas display options -- confirm in utils.set_debug_console).
set_debug_console()

## Minio Setup

In [3]:
# Load variables from the .env file located by find_dotenv() into the process
# environment before reading the MinIO credentials.
load_dotenv(find_dotenv())

# Either value is None when the corresponding variable is not set.
ACCESS_KEY = os.environ.get("ACCESS_KEY")
SECRET_KEY = os.environ.get('SECRET_KEY')

# Bucket that holds the input tables read by this notebook.
BUCKET = 'cdm-data'

In [4]:
# The server presents a self-signed certificate, so TLS verification is pointed
# at a locally stored CA file instead of the system trust store.
httpClient = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs='/mind_data/fongc2/certificate.crt')

In [5]:
# Authenticated HTTPS MinIO client; it is handed the custom pool manager so the
# self-signed certificate is accepted.
client = Minio("tllihpcmind6:9000", access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=True, http_client=httpClient)

In [6]:
# Print the key of every object in the bucket whose name starts with "summ".
for obj in client.list_objects(bucket_name=BUCKET, prefix="summ", recursive=True):
    print(obj.object_name)


summary/.ipynb_checkpoints/IMPACT_Darwin_Patient_Summary-checkpoint.tsv
summary/.ipynb_checkpoints/IMPACT_Darwin_Sample_Summary-checkpoint.tsv
summary/IMPACT_Darwin_Patient_Summary.tsv
summary/IMPACT_Darwin_Sample_Summary.tsv


## Outputs

This script will create a cBioPortal formatted timeline file for metastatic site events
Events will be derived from four sources:
- Billing codes
- IMPACT sample info
- ICD-O registry
- Radiology reports (NLP predictions)

cBioPortal timeline file columns required:
- PATIENT_ID
- START_DATE
- STOP_DATE (For this file, can be left blank)
- EVENT_TYPE (Always "Diagnosis")
- SUBTYPE (For metastatic events, always "Metastasis")
- SOURCE (ICD Billing, MSK-IMPACT, Radiology Report)
- SOURCE_SPECIFIC (Imaging modality: CT, PET, MRI for radiology reports. Pathology for MSK-IMPACT. ICD Billing for billing codes.)
- DX_DESCRIPTION (ICD billing description, "Radiology Report", Pathology report specimen description (?))
- ANATOMIC_LOCATION (MSK-MET derived info)


In [78]:
# Column layout of the cBioPortal timeline file, in output order.
col_order = [
    'PATIENT_ID',
    'START_DATE',
    'STOP_DATE',
    'EVENT_TYPE',
    'SUBTYPE',
    'SOURCE',
    'SOURCE_SPECIFIC',
    'DX_DESCRIPTION',
    'ANATOMIC_LOCATION',
]

### Filename for output

In [7]:
# Directory and file name of the annotation table written in the save cell.
path, fname_save_anno = '../demo_data', 'impact2017_met_site_annotations_impact_scaled.csv'
pathfilename_out = os.path.join(path, fname_save_anno)

## Load Data
- Demographics (for gender)
- ID mapping between dmp-id and mrn
- IMPACT sample data
- IMPACT sample summary (for DOP)
- Dx Timeline from Darwin
- Radiology report predictions

### Load cbioportal clinical data file

In [8]:
# Read the cBioPortal sample ID/clinical table. The MinIO response wraps a
# pooled HTTP connection, so it is closed and released once pandas has
# consumed it (successfully or not).
obj = client.get_object(BUCKET, 'cbioportal/mskimpact_ids.tsv')
try:
    df_samples1 = pd.read_csv(obj, sep='\t', low_memory=False)
finally:
    obj.close()
    obj.release_conn()

# Column names reused by later cells.
col_id = 'SAMPLE_ID'
col_id2 = 'DMP_ID'
col_sex = 'GENDER'
col_sample_type = 'SAMPLE_TYPE'
col_prim_site = 'PRIMARY_SITE'
col_met_site = 'METASTATIC_SITE'
col_icd_billing = 'ICD-9/10 Dx Code'

# For genie, fix ids: map the camelCase id headers onto this notebook's names.
# DataFrame.rename ignores keys that are not present.
col_rep = {'patientId': col_id2,
           'sampleId': col_id}

cols_keep = ['SAMPLE_ID',
             'DMP_ID',
             'CANCER_TYPE',
             'SAMPLE_TYPE',
             'PRIMARY_SITE',
             'METASTATIC_SITE'
            ]

df_samples = df_samples1.rename(columns=col_rep)

# Keep only samples annotated as metastases, restricted to the columns above.
logic1 = df_samples[col_sample_type] == 'Metastasis'
df_samples = df_samples.loc[logic1, cols_keep]


In [9]:
# Preview the first retained metastatic-sample row.
df_samples.head(1)


Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,SAMPLE_TYPE,PRIMARY_SITE,METASTATIC_SITE
2,P-0000012-T03-IM3,P-0000012,Non-Small Cell Lung Cancer,Metastasis,Lung,Neck


### Load demographics

In [10]:
# Read the demographics table; the MinIO response is closed and released once
# pandas has consumed it so the pooled connection is not leaked.
obj = client.get_object(BUCKET, 'demographics/ddp_demographics.tsv')
try:
    df_demo = pd.read_csv(obj, sep='\t', low_memory=False)
finally:
    obj.close()
    obj.release_conn()
df_demo = mrn_zero_pad(df=df_demo, col_mrn='MRN')

# Keep MRN, birth date and gender. The date is parsed through `assign` so the
# new column is created on a fresh frame rather than written into a
# column-subset of the original (which can trigger SettingWithCopyWarning).
df_demo = df_demo[['MRN', 'PT_BIRTH_DTE', 'GENDER']].assign(
    PT_BIRTH_DTE=lambda d: pd.to_datetime(d['PT_BIRTH_DTE'])
)



In [110]:
# Evaluate the first row; the trailing semicolon suppresses the notebook's output display.
df_demo.head(1);


### Load sample summary data

In [12]:
# Read the IMPACT sample summary; the MinIO response is closed and released
# once pandas has consumed it so the pooled connection is not leaked.
obj = client.get_object(BUCKET, 'summary/IMPACT_Darwin_Sample_Summary.tsv')
try:
    df_sample_summary = pd.read_csv(obj, sep='\t', low_memory=False)
finally:
    obj.close()
    obj.release_conn()

# Normalise MRNs with the project helpers (integer conversion, then zero padding).
df_sample_summary = convert_to_int(df=df_sample_summary, list_cols=['MRN'])
df_sample_summary = mrn_zero_pad(df=df_sample_summary, col_mrn='MRN')

# Keep the identifiers and the estimated procedure date. The date is parsed
# through `assign` so no column is written into a column-subset of another frame.
cols_keep = ['MRN', 'SAMPLE_ID', 'DMP_ID', 'DATE_OF_PROCEDURE_SURGICAL_EST']
df_sample_summary = df_sample_summary[cols_keep].assign(
    DATE_OF_PROCEDURE_SURGICAL_EST=lambda d: pd.to_datetime(d['DATE_OF_PROCEDURE_SURGICAL_EST'])
)

In [111]:
# Evaluate the frame; the trailing semicolon suppresses the notebook's output display.
df_sample_summary;

### Load ID mapping

In [14]:
# Read the MRN <-> DMP_ID mapping; the MinIO response is closed and released
# once pandas has consumed it so the pooled connection is not leaked.
obj = client.get_object(BUCKET, 'id_mapping/ddp_id_mapping_pathology.tsv')
try:
    df_id_map = pd.read_csv(obj, sep='\t', low_memory=False)
finally:
    obj.close()
    obj.release_conn()
df_id_map = mrn_zero_pad(df=df_id_map, col_mrn='MRN')

# One row per distinct (MRN, DMP_ID) pair, discarding rows without a DMP_ID.
df_id_map = df_id_map[['MRN', 'DMP_ID']].drop_duplicates()
df_id_map = df_id_map[df_id_map['DMP_ID'].notnull()]



In [112]:
# Evaluate the first row; the trailing semicolon suppresses the notebook's output display.
df_id_map.head(1);


### Load Dx timeline data

In [16]:
# Read the cleaned diagnosis timeline; the MinIO response is closed and
# released once pandas has consumed it so the pooled connection is not leaked.
obj = client.get_object(BUCKET, 'diagnosis/table_diagnosis_clean.csv')
try:
    df_dx_timeline = pd.read_csv(obj, sep=',', low_memory=False)
finally:
    obj.close()
    obj.release_conn()
df_dx_timeline = mrn_zero_pad(df=df_dx_timeline, col_mrn='MRN')

# Parse diagnosis dates so they can be subtracted from birth dates later.
df_dx_timeline['Diagnosis Date'] = pd.to_datetime(df_dx_timeline['Diagnosis Date'])


#### Tumor Registry (Add this to Primary Dx timeline)

In [71]:
# Tumor Registry rows flagged as stage IV or as distant disease. Only MRN and
# diagnosis date are carried forward; the flag columns are dropped after use.
cols_dx_treg = ['MRN', 'Diagnosis Date', 'IS_STAGE_IV_ICDO_SOLID', 'ICDO_IS_DISTANT_SOLID']
df_dx_treg = df_dx_timeline.loc[df_dx_timeline['Source'].eq('Tumor Registry'), cols_dx_treg]

logic = df_dx_treg['IS_STAGE_IV_ICDO_SOLID'].eq(True) | df_dx_treg['ICDO_IS_DISTANT_SOLID'].eq(True)
df_dx_treg_mets = df_dx_treg[logic]
df_dx_treg_mets = df_dx_treg_mets.drop(columns=['IS_STAGE_IV_ICDO_SOLID', 'ICDO_IS_DISTANT_SOLID'])

In [113]:
# Evaluate the first row; the trailing semicolon suppresses the notebook's output display.
df_dx_treg_mets.head(1);


#### Billing codes

In [19]:
# Billing-sourced rows whose diagnosis type is 'METS', limited to the patient
# key, ICD code, description and date.
cols_dx_billing = ['MRN', 'ICD-9/10 Dx Code', 'Diagnosis Description', 'Diagnosis Date']

logic1 = df_dx_timeline['Diagnosis Type'].eq('METS')
logic2 = df_dx_timeline['Source'].eq('Billing')
logics = logic1 & logic2
df_dx_billing = df_dx_timeline.loc[logics, cols_dx_billing]


In [114]:
# Evaluate the first row; the trailing semicolon suppresses the notebook's output display.
df_dx_billing.head(1);


### Load Radiology report met predictions

In [21]:
# TBD: loading of the radiology report NLP metastasis predictions is not implemented yet


## Merge and clean input data
- Add date of birth and sex/gender to dataframes
- Add impact IDs
- Select important columns

### Sample data

In [22]:
# Attach procedure date (via the sample summary) and demographics to every
# metastatic sample. The right join keeps all rows of df_samples.
df_samples_f = df_sample_summary.merge(right=df_samples, how='right', on=['SAMPLE_ID', 'DMP_ID'])
df_samples_f = df_samples_f.merge(right=df_demo, how='left', on='MRN')

# Age at procedure, in days since birth.
age_dop = (df_samples_f['DATE_OF_PROCEDURE_SURGICAL_EST'] - df_samples_f['PT_BIRTH_DTE']).dt.days
df_samples_f = df_samples_f.assign(AGE_DX=age_dop)

# Drop samples with no computable age. The explicit copy makes the filtered
# frame independent, so the integer cast below is a plain column assignment
# rather than a write into a slice (avoids SettingWithCopyWarning).
df_samples_f = df_samples_f[df_samples_f['AGE_DX'].notnull()].copy()
df_samples_f['AGE_DX'] = df_samples_f['AGE_DX'].astype(int)

# Remove the identifying columns that were only needed to derive the age.
df_samples_f = df_samples_f.drop(columns=['MRN', 'DATE_OF_PROCEDURE_SURGICAL_EST', 'PT_BIRTH_DTE'])


In [23]:
# Report the size of the merged sample frame and preview its first row.
print(df_samples_f.shape)
df_samples_f.head(1)


(25105, 8)


Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,SAMPLE_TYPE,PRIMARY_SITE,METASTATIC_SITE,GENDER,AGE_DX
0,P-0000012-T03-IM3,P-0000012,Non-Small Cell Lung Cancer,Metastasis,Lung,Neck,FEMALE,21180


### Billing code data

In [24]:
# Attach demographics and then DMP_IDs (both keyed on MRN) to each billing row.
# The second merge is a right join from the ID map, so every billing row is
# kept and the ID-map columns come first.
df_dx_billing_f = df_id_map.merge(
    right=df_dx_billing.merge(right=df_demo, how='left', on='MRN'),
    how='right',
    on='MRN',
)

# Age at diagnosis in days since birth; the raw dates and MRN are then removed.
age_dx = df_dx_billing_f['Diagnosis Date'].sub(df_dx_billing_f['PT_BIRTH_DTE']).dt.days
df_dx_billing_f = (
    df_dx_billing_f
    .assign(AGE_DX=age_dx)
    .drop(columns=['Diagnosis Date', 'PT_BIRTH_DTE', 'MRN'])
)

# Rows that could not be matched to a DMP_ID are discarded.
df_dx_billing_f = df_dx_billing_f[df_dx_billing_f['DMP_ID'].notnull()]

In [25]:
# Preview the first billing row after the demographics / DMP_ID merge.
df_dx_billing_f.head(1)

Unnamed: 0,DMP_ID,ICD-9/10 Dx Code,Diagnosis Description,GENDER,AGE_DX
0,P-0015526,C77.0,Miscellaneous,MALE,17267


### Tumor registry data

In [26]:
# Attach demographics and then DMP_IDs (both keyed on MRN) to each tumor
# registry row; the right join from the ID map keeps every registry row.
df_dx_treg_mets_f = df_id_map.merge(
    right=df_dx_treg_mets.merge(right=df_demo, how='left', on='MRN'),
    how='right',
    on='MRN',
)

# Age at diagnosis in days since birth, computed before the date columns are dropped.
age_dx = df_dx_treg_mets_f['Diagnosis Date'].sub(df_dx_treg_mets_f['PT_BIRTH_DTE']).dt.days

df_dx_treg_mets_f = df_dx_treg_mets_f.drop(columns=['MRN', 'GENDER', 'Diagnosis Date', 'PT_BIRTH_DTE'])
df_dx_treg_mets_f = df_dx_treg_mets_f[df_dx_treg_mets_f['DMP_ID'].notnull()]

# `assign` aligns age_dx on the index, so rows removed by the filter above are
# skipped. The organ site is left blank.
df_dx_treg_mets_f = df_dx_treg_mets_f.assign(
    AGE_DX=age_dx,
    ORGAN_SITE='',
    SOURCE='Tumor Registry',
)


In [27]:
# Display the tumor registry frame built in the previous cell.
df_dx_treg_mets_f

Unnamed: 0,DMP_ID,AGE_DX,ORGAN_SITE,SOURCE
0,P-0005017,28466,,Tumor Registry
1,P-0038278,21217,,Tumor Registry
2,P-0005035,18573,,Tumor Registry
3,P-0011298,25025,,Tumor Registry
4,P-0007589,28988,,Tumor Registry
...,...,...,...,...
17837,P-0076963,15204,,Tumor Registry
17839,P-0077419,14929,,Tumor Registry
17840,P-0077735,23428,,Tumor Registry
17843,P-0079065,14531,,Tumor Registry


## Load Mapping Objects

### Load mapping tables

In [28]:
# Load RDN mapping
obj_met_map_rdn = MetastaticSpreadMappingRND(path='../' + const.pathname,
                                             fname_all_sites=const.fname_mapping_rdn_all_sites,
                                             fname_hematogenous=const.fname_mapping_rdn_hematogenous,
                                             fname_localext=const.fname_mapping_rdn_localext,
                                             fname_lymphatic=const.fname_mapping_rdn_lymphatic,
                                             fname_site_map=const.fname_mapping_rdn_site_map,
                                             fname_billing_map=const.fname_mapping_rdn_billing_map,
                                             fname_billing_code_dict=const.fname_mapping_rdn_to_billing_codes)


Loading mapping tables


In [29]:
# Show the mapping directory name configured in the constants module.
const.pathname


'mappings'

### Load mapping object

In [30]:
# Load annotations object, built on top of the RDN mapping tables loaded above
obj_mapping = OrganMappingAnalysisRND(obj_met_map=obj_met_map_rdn)


In [31]:
# Preview the ICD-billing site mapping table held by the mapping object.
obj_met_map_rdn.df_map_icd_mapping.head()


Unnamed: 0,tissue_icd_billing,clean_site,ICD_BILLING_MAPPING
0,ADRENAL_GLAND,site1,retroperitoneum_adrenal
1,BILIARY_TRACT,site1,abdomen_biliary
2,BLADDER_OR_URINARY_TRACT,site1,pelvis_bladder
3,BONE,site1,back_bone.spine
4,BOWEL,site1,abdomen_foregut


## Apply mappings and format frames into cBioPortal timeline format
### Clinical Samples Table from cBioPortal

In [81]:
# Annotate IMPACT sample site data
df_met_sites_impact = obj_mapping.annotate_mapping_impact_met_samples(df_samples=df_samples_f, 
                                                                      col_primary_site=col_prim_site, 
                                                                      col_met_site=col_met_site, 
                                                                      label_dist_ln=True)

cols_keep = ['DMP_ID', 'AGE_DX', 'METASTATIC_SITE_BILLING_RDN']
df_met_sites_impact = df_met_sites_impact[cols_keep]
df_met_sites_impact = df_met_sites_impact.assign(SOURCE='MSK-IMPACT')
df_met_sites_impact = df_met_sites_impact.assign(STOP_DATE='')
df_met_sites_impact = df_met_sites_impact.assign(EVENT_TYPE='Diagnosis')
df_met_sites_impact = df_met_sites_impact.assign(SUBTYPE='Metastasis')
df_met_sites_impact = df_met_sites_impact.assign(SOURCE='MSK-IMPACT')
df_met_sites_impact = df_met_sites_impact.assign(SOURCE_SPECIFIC='Pathology')
df_met_sites_impact = df_met_sites_impact.assign(DX_DESCRIPTION='Specimen Description (Under construction)')

df_met_sites_impact = df_met_sites_impact.rename(columns={'DMP_ID': 'PATIENT_ID',
                                                          'METASTATIC_SITE_BILLING_RDN': 'ANATOMIC_LOCATION',
                                                          'AGE_DX': 'START_DATE'
                                                         })

df_met_sites_impact = df_met_sites_impact[col_order]

In [82]:
# Preview the IMPACT-derived timeline rows.
df_met_sites_impact.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
0,P-0000012,21180,,Diagnosis,Metastasis,MSK-IMPACT,Pathology,Specimen Description (Under construction),HEAD_AND_NECK
1,P-0000015,16226,,Diagnosis,Metastasis,MSK-IMPACT,Pathology,Specimen Description (Under construction),LIVER
2,P-0000024,22281,,Diagnosis,Metastasis,MSK-IMPACT,Pathology,Specimen Description (Under construction),LUNG
3,P-0000025,27264,,Diagnosis,Metastasis,MSK-IMPACT,Pathology,Specimen Description (Under construction),INTRA_ABDOMINAL
4,P-0000025,27600,,Diagnosis,Metastasis,MSK-IMPACT,Pathology,Specimen Description (Under construction),LIVER


### ICD Billing table 

In [100]:
# Annotate diagnosis table of ICD billings with renzo's mapping of metastatic sites
df_met_sites_dx = obj_mapping.annotate_icd_billing_met_dx(df_dx_mets=df_dx_billing_f, 
                                                          col_icd_billing=col_icd_billing, 
                                                          col_sex=col_sex,
                                                          add_msk_met_anno=True)

df_met_sites_dx['Diagnosis Description'] = df_met_sites_dx['ICD-9/10 Dx Code'] + ' - ' + df_met_sites_dx['Diagnosis Description']
df_met_sites_dx = df_met_sites_dx.drop(columns=['ICD-9/10 Dx Code', 'GENDER', 'METASTATIC_SITE_RDN_MAP'])

df_met_sites_dx = df_met_sites_dx.assign(SOURCE='ICD Billing')
df_met_sites_dx = df_met_sites_dx.assign(STOP_DATE='')
df_met_sites_dx = df_met_sites_dx.assign(EVENT_TYPE='Diagnosis')
df_met_sites_dx = df_met_sites_dx.assign(SOURCE_SPECIFIC='ICD Billing')

df_met_sites_dx = df_met_sites_dx.rename(columns={'DMP_ID': 'PATIENT_ID',
                                                  'tissue_icd_billing': 'ANATOMIC_LOCATION',
                                                  'AGE_DX': 'START_DATE',
                                                  'Diagnosis Description': 'DX_DESCRIPTION'
                                                  }
                                        )
df_met_sites_dx

Unnamed: 0,PATIENT_ID,DX_DESCRIPTION,START_DATE,ANATOMIC_LOCATION,SOURCE,STOP_DATE,EVENT_TYPE,SOURCE_SPECIFIC
0,P-0015526,C77.0 - Miscellaneous,17267,LYMPH,ICD Billing,,Diagnosis,ICD Billing
1,P-0001115,198.5 - METS - BONE METS,26857,BONE,ICD Billing,,Diagnosis,ICD Billing
2,P-0001115,196.6 - METS - LYMPH NODE METS-PELVIC,26927,LYMPH,ICD Billing,,Diagnosis,ICD Billing
3,P-0001115,197.6 - METS - MET CA-PERITONEUM,26806,INTRA_ABDOMINAL,ICD Billing,,Diagnosis,ICD Billing
4,P-0001115,198.89 - METS - MET CANCER,26211,OTHER,ICD Billing,,Diagnosis,ICD Billing
...,...,...,...,...,...,...,...,...
239727,P-0079188,C77.2 - Miscellaneous,25925,LYMPH,ICD Billing,,Diagnosis,ICD Billing
239728,P-0079188,C79.89 - Miscellaneous,25925,OTHER,ICD Billing,,Diagnosis,ICD Billing
239729,P-0079188,C78.4 - Miscellaneous,25925,BOWEL,ICD Billing,,Diagnosis,ICD Billing
239730,P-0079189,C79.89 - Miscellaneous,23573,OTHER,ICD Billing,,Diagnosis,ICD Billing


In [101]:
# Compare the shapes of the billing frame before and after annotation.
print(df_dx_billing_f.shape)
print(df_met_sites_dx.shape)


(239732, 5)
(239732, 8)


#### Non-LN organs

In [102]:
# Billing rows whose site is neither 'LYMPH' nor 'OTHER', tagged as metastases
# and arranged in the timeline column order.
df_met_sites_dx_vis = (
    df_met_sites_dx
    .loc[~df_met_sites_dx['ANATOMIC_LOCATION'].isin(['LYMPH', 'OTHER'])]
    .assign(SUBTYPE='Metastasis')
    .loc[:, col_order]
)

In [103]:
# Preview the non-lymph-node billing timeline rows.
df_met_sites_dx_vis.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
1,P-0001115,26857,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.5 - METS - BONE METS,BONE
3,P-0001115,26806,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.6 - METS - MET CA-PERITONEUM,INTRA_ABDOMINAL
5,P-0001115,26221,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.5 - METS - METS TO LG BOWEL,BOWEL
6,P-0001115,26211,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.1 - METS - METS TO URINARY TR,BLADDER_OR_URINARY_TRACT
7,P-0001115,26857,,Diagnosis,Metastasis,ICD Billing,ICD Billing,197.4 - METS - SM BOWEL METS,BOWEL


#### Lymph node annotations

In [104]:
# Billing rows whose site is 'LYMPH' or 'OTHER', arranged in the timeline column order.
# NOTE(review): 'OTHER' rows also receive SUBTYPE='Lymph Nodes' -- confirm this is intended.
df_met_sites_dx_ln = (
    df_met_sites_dx
    .loc[df_met_sites_dx['ANATOMIC_LOCATION'].isin(['LYMPH', 'OTHER'])]
    .assign(SUBTYPE='Lymph Nodes')
    .loc[:, col_order]
)

In [105]:
# NOTE(review): the three commented-out lines below are leftover exploration; they
# reference `df_met_rdn_anno`, which is not defined in the visible notebook.
# df_met_rdn_anno.head(50)
# t = df_met_rdn_anno.groupby('DMP_ID')['CANCER_TYPE'].nunique()
# df_met_rdn_anno[df_met_rdn_anno['DMP_ID'].isin(t[t > 1].index)].head(50)

# Preview the lymph-node / other billing timeline rows.
df_met_sites_dx_ln.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
0,P-0015526,17267,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,C77.0 - Miscellaneous,LYMPH
2,P-0001115,26927,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.6 - METS - LYMPH NODE METS-PELVIC,LYMPH
4,P-0001115,26211,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,198.89 - METS - MET CANCER,OTHER
11,P-0002243,16133,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.3 - METS - LYMPH NODE METS-AXILLA,LYMPH
12,P-0002243,20875,,Diagnosis,Lymph Nodes,ICD Billing,ICD Billing,196.1 - METS - LYMPH NODE METS-INTRATHOR,LYMPH


### Radiology report NLP predictions

## Merge dataframes

In [108]:
# Stack the IMPACT and non-lymph-node billing events, then order them by
# patient, start date and site with a fresh 0..n-1 index.
df_mets_f = (
    pd.concat([df_met_sites_impact, df_met_sites_dx_vis], axis=0, sort=False)
    .sort_values(by=['PATIENT_ID', 'START_DATE', 'ANATOMIC_LOCATION'])
    .reset_index(drop=True)
)

In [109]:
# Preview the first 50 rows of the merged, sorted timeline.
df_mets_f.head(50)

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
0,P-0000000,19332,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.01 - Miscellaneous,LUNG
1,P-0000000,19332,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.1 - Miscellaneous,MEDIASTINUM
2,P-0000000,20935,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.6 - Miscellaneous,INTRA_ABDOMINAL
3,P-0000000,21624,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.82 - METS - GENITAL CA-SECONDARY,GENITAL_FEMALE
4,P-0000000,21624,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.6 - METS - MET CA TO OVARY,OVARY
5,P-0000000,22985,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C79.82 - Miscellaneous,GENITAL_FEMALE
6,P-0000000,23730,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.6 - Miscellaneous,INTRA_ABDOMINAL
7,P-0000000,23730,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.02 - Miscellaneous,LUNG
8,P-0000000,23730,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.01 - Miscellaneous,LUNG
9,P-0000000,23730,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.2 - Miscellaneous,PLEURA


## Save annotations

In [38]:
# Save RDN annotations (written without the index column)
# NOTE(review): only the MSK-IMPACT-derived frame is written here; the merged
# `df_mets_f` (IMPACT + ICD billing) is not saved in any visible cell -- confirm
# that this is intended.
df_met_sites_impact.to_csv(pathfilename_out, index=False)

## Create binary matrix from mapping

In [115]:
# df_binary = obj_mapping.create_binary_met_sites(df=df_met_sites_impact, col_index='SAMPLE_ID', col_count='DMP_ID', col_met_site='METASTATIC_SITE_BILLING_RDN')

In [None]:
# NOTE(review): `df_binary` is never assigned in the visible notebook (the call
# that would create it is commented out in the preceding cell), so this cell
# raises NameError as written.
df_binary.head(2)

In [None]:
# Fraction of rows with each 'HAS_*' flag set, sorted from most to least frequent.
# NOTE(review): depends on `df_binary`, which is not assigned in the visible
# notebook (its creating call is commented out).
cols = list(df_binary.columns[df_binary.columns.str.contains('HAS_')])
(df_binary[cols].sum()/df_binary.shape[0]).sort_values(ascending=False)