# met_site_data_creation_impact_scaled.py

### By Chris Fong - MSKCC 2022

This script will create a standardized table of metastatic sites for a given:
- Clinical sample table downloaded from the cohort tab in cBioPortal 

This script will leverage RDN's metastatic mapping from IMPACT patient cancer types to distant, lymphatic, and regional metastatic disease

This script will ONLY cover metastatic samples from the IMPACT cohort. Not all metastatic disease sites are covered.



## Load Libraries

In [1]:
import sys  
sys.path.insert(0, '../mappings')
sys.path.insert(0, '../analysis')
sys.path.insert(0, '../')
sys.path.insert(0, '/mind_data/fongc2/cdm-utilities/minio_api/')
sys.path.insert(0, '/mind_data/fongc2/cdm-utilities/')
sys.path.insert(0, '/mind_data/fongc2/diagnosis_event_abstraction_icd/cbioportal/')


import os
import pandas as pd
from minio_api import MinioAPI
import constants_o_sites as const
from organ_mapping_analysis import OrganMappingAnalysisRND
from organ_mapping_rdn_processing import MetastaticSpreadMappingRND
from dx_cbioportal_timeline import cBioPortalDiagnosisTimeline


from dotenv import load_dotenv, find_dotenv, dotenv_values


from utils import set_debug_console, print_df_without_index, drop_cols, mrn_zero_pad, convert_to_int, read_minio_api_config


In [2]:
# Configure the interactive session through the project helper from `utils`.
# NOTE(review): presumably adjusts pandas display options — confirm in utils.set_debug_console.
set_debug_console()

## Object testing

In [3]:
# Display the "all sites" RDN mapping filename held in the constants module.
const.fname_mapping_rdn_all_sites

'all_sites_clean.csv'

In [4]:
# MinIO settings file, followed by the object paths handed to the timeline builder.
fname_minio_env = '/mind_data/fongc2/cdm-utilities/minio_env.txt'
fname_cbio = 'cbioportal/mskimpact_ids.tsv'
fname_dx = 'diagnosis/table_diagnosis_clean.csv'
fname_demo = 'demographics/ddp_demographics.tsv'
fname_impact_map = 'summary/IMPACT_Darwin_Sample_Summary.tsv'

# Gather the constructor arguments in one mapping so the call stays short.
# No column config and no radiology predictions are supplied (both None).
dx_timeline_kwargs = dict(
    fname_minio_config=fname_minio_env,
    config_cols=None,
    const_organ_map=const,
    fname_impact_map=fname_impact_map,
    fname_demo=fname_demo,
    fname_dx=fname_dx,
    fname_cbio=fname_cbio,
    fname_rad_pred=None,
)
obj_dx_timeline = cBioPortalDiagnosisTimeline(**dx_timeline_kwargs)

Loading mapping tables


In [5]:
# Run the timeline object's clean-and-merge step for metastatic diagnoses.
# NOTE(review): presumably populates internal frames such as `_df_dx_ln_impact` — confirm in dx_cbioportal_timeline.
obj_dx_timeline.metastatic_clean_and_merge()

In [8]:
# Tally DX_DESCRIPTION values on the object's private frame (debug inspection only);
# the trailing semicolon keeps the notebook from echoing the result.
obj_dx_timeline._df_dx_ln_impact['DX_DESCRIPTION'].value_counts();

## Minio Setup

In [13]:
# Parse the MinIO settings file with the project helper and keep a plain-dict
# copy for key lookups. (Loading this file directly with `load_dotenv` was
# tried earlier and is intentionally left out.)
minio_env_path = '/mind_data/fongc2/cdm-utilities/minio_env.txt'
config = read_minio_api_config(minio_env_path)
dict_config = dict(config)

In [14]:
# Show the MINIO_ENV entry of the parsed settings.
dict_config['MINIO_ENV']

'/mind_data/fongc2/.env'

In [15]:
# Load the .env file located by `find_dotenv` into the process environment,
# then read the MinIO credentials from it.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

ACCESS_KEY = os.getenv('ACCESS_KEY')
SECRET_KEY = os.getenv('SECRET_KEY')

# Connection settings: target bucket, server address and CA certificate.
BUCKET = 'cdm-data'
url_port = 'tllihpcmind6:9000'
ca_certs = '/mind_data/fongc2/certificate.crt'

In [16]:
# Client used for every read and write against the MinIO bucket.
obj_minio = MinioAPI(
    ACCESS_KEY=ACCESS_KEY,
    SECRET_KEY=SECRET_KEY,
    ca_certs=ca_certs,
    url_port=url_port,
)

In [17]:
# List the objects under the "sandbox" prefix of the bucket, recursing into sub-prefixes.
obj_minio.print_list_objects(bucket_name=BUCKET, recursive=True, prefix="sandbox")

sandbox/.ipynb_checkpoints/
sandbox/fongc2/dx_met_timeline.txt


In [18]:
# Read back the timeline file stored in the sandbox (tab-separated) and preview it.
test_object_path = 'sandbox/fongc2/dx_met_timeline.txt'
df_test = obj_minio.load_obj(
    bucket_name=BUCKET,
    path_object=test_object_path,
    sep='\t',
)
df_test.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
0,P-0000000,19332,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.01 - Miscellaneous,LUNG
1,P-0000000,19332,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.1 - Miscellaneous,MEDIASTINUM
2,P-0000000,20935,,Diagnosis,Metastasis,ICD Billing,ICD Billing,C78.6 - Miscellaneous,INTRA_ABDOMINAL
3,P-0000000,21624,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.82 - METS - GENITAL CA-SECONDARY,GENITAL_FEMALE
4,P-0000000,21624,,Diagnosis,Metastasis,ICD Billing,ICD Billing,198.6 - METS - MET CA TO OVARY,OVARY


## Outputs

This script will create a cBioPortal formatted timeline file for metastatic site events
Events will be derived from four sources:
- Billing codes
- IMPACT sample info
- ICD-O registry
- Radiology reports (NLP predictions)

cBioPortal timeline file columns required:
- PATIENT_ID
- START_DATE
- STOP_DATE (For this file, can be left blank)
- EVENT_TYPE (Always "Diagnosis")
- SUBTYPE (For metastatic events, always "Metastasis")
- SOURCE (ICD Billing, MSK-IMPACT, Radiology Report)
- SOURCE_SPECIFIC (Imaging modality: CT, PET, MRI for radiology reports. Pathology for MSK-IMPACT. ICD Billing for billing codes.)
- DX_DESCRIPTION (ICD billing description, "Radiology Report", Pathology report specimen description (?))
- ANATOMIC_LOCATION (MSK-MET derived info)


In [19]:
# Column layout of the cBioPortal timeline file, in the order it is written out.
col_order = [
    'PATIENT_ID',
    'START_DATE',
    'STOP_DATE',
    'EVENT_TYPE',
    'SUBTYPE',
    'SOURCE',
    'SOURCE_SPECIFIC',
    'DX_DESCRIPTION',
    'ANATOMIC_LOCATION',
]

### Filename for output

In [20]:
# Local CSV destination for the annotations (directory + file name).
path, fname_save_anno = (
    '../demo_data',
    'impact2017_met_site_annotations_impact_scaled.csv',
)
pathfilename_out = os.path.join(path, fname_save_anno)

## Load Data
- Demographics (for gender)
- ID mapping between dmp-id and mrn
- IMPACT sample data
- IMPACT sample summary (for DOP)
- Dx Timeline from Darwin
- Radiology report predictions

### Load cbioportal clinical data file

In [21]:
# Raw clinical sample table exported from cBioPortal (tab-separated).
df_samples1 = obj_minio.load_obj(
    bucket_name=BUCKET,
    path_object='cbioportal/mskimpact_ids.tsv',
    sep='\t',
)

# Standardized column names used by the rest of the script.
# (Original author's note: the ID columns need this fix-up for GENIE exports.)
col_id, col_id2 = 'SAMPLE_ID', 'DMP_ID'
col_sex = 'GENDER'
col_sample_type = 'SAMPLE_TYPE'
col_prim_site = 'PRIMARY_SITE'
col_met_site = 'METASTATIC_SITE'
col_icd_billing = 'ICD-9/10 Dx Code'

# cBioPortal's camelCase ID headers mapped onto the names above.
col_rep = {'sampleId': col_id, 'patientId': col_id2}

# Columns carried forward from the sample table.
cols_keep = [
    col_id,
    col_id2,
    'CANCER_TYPE',
    col_sample_type,
    col_prim_site,
    col_met_site,
]

# Rename the ID columns, then keep only rows whose sample type is 'Metastasis'.
df_samples = df_samples1.rename(columns=col_rep)
logic1 = df_samples[col_sample_type].eq('Metastasis')
df_samples = df_samples.loc[logic1, cols_keep]


In [22]:
# Preview the first row of the unmodified cBioPortal table (original column names).
df_samples1.head(1)


Unnamed: 0,sampleId,patientId,AGE_AT_SEQ_REPORTED_YEARS,ARCHER,CANCER_TYPE,CANCER_TYPE_DETAILED,CVR_TMB_COHORT_PERCENTILE,CVR_TMB_SCORE,CVR_TMB_TT_COHORT_PERCENTILE,DATE_ADDED,FRACTION_GENOME_ALTERED,GENE_PANEL,INSTITUTE,METASTATIC_SITE,MGMT_STATUS,MONTH_ADDED,MSI_COMMENT,MSI_SCORE,MSI_TYPE,MSK_SLIDE_ID,MUTATION_COUNT,ONCOTREE_CODE,PATH_SLIDE_EXISTS,PRIMARY_SITE,SAMPLE_CLASS,SAMPLE_COVERAGE,SAMPLE_TYPE,SOMATIC_STATUS,SO_COMMENTS,TUMOR_PURITY,WEEK_ADDED,WHO_GRADE
0,P-0000004-T01-IM3,P-0000004,40,NO,Breast Cancer,Breast Invasive Ductal Carcinoma,58.6,4.5,67.8,2015/04/07,0.2782,IMPACT341,MSKCC,,,2015/04,,2.5,Stable,,4,IDC,NO,Breast,Tumor,428.0,Primary,Matched,,50,"2015, Wk. 15",


### Load demographics

In [35]:
# Demographics table (tab-separated); supplies birth date and gender per MRN.
df_demo = obj_minio.load_obj(bucket_name=BUCKET,
                               path_object='demographics/ddp_demographics.tsv',
                               sep='\t')

# NOTE(review): presumably left-pads MRN so it matches the other tables' merge key — confirm in utils.mrn_zero_pad.
df_demo = mrn_zero_pad(df=df_demo, col_mrn='MRN')

# Take an explicit copy of the column subset: the assignment below would
# otherwise write to a frame derived from a slice, the pattern pandas flags
# with SettingWithCopyWarning.
df_demo = df_demo[['MRN', 'PT_BIRTH_DTE', 'GENDER']].copy()

# Parse birth dates so they can be subtracted from event dates later.
df_demo['PT_BIRTH_DTE'] = pd.to_datetime(df_demo['PT_BIRTH_DTE'])



In [36]:
# Smoke check that the demographics frame loaded; the semicolon suppresses the echo.
df_demo.head(1);


### Load sample summary data

In [24]:
# IMPACT sample summary (tab-separated); supplies the estimated procedure date per sample.
df_sample_summary = obj_minio.load_obj(bucket_name=BUCKET,
                               path_object='summary/IMPACT_Darwin_Sample_Summary.tsv',
                               sep='\t')

# Normalize MRN with the project helpers before it is used as a merge key.
# NOTE(review): exact behavior of convert_to_int / mrn_zero_pad lives in utils — confirm there.
df_sample_summary = convert_to_int(df=df_sample_summary, list_cols=['MRN'])
df_sample_summary = mrn_zero_pad(df=df_sample_summary, col_mrn='MRN')

# Take an explicit copy of the column subset: the assignment below would
# otherwise write to a frame derived from a slice, the pattern pandas flags
# with SettingWithCopyWarning.
cols_keep = ['MRN', 'SAMPLE_ID', 'DMP_ID', 'DATE_OF_PROCEDURE_SURGICAL_EST']
df_sample_summary = df_sample_summary[cols_keep].copy()

# Parse procedure dates so an age at procedure can be computed later.
df_sample_summary['DATE_OF_PROCEDURE_SURGICAL_EST'] = pd.to_datetime(df_sample_summary['DATE_OF_PROCEDURE_SURGICAL_EST'])

In [25]:
# Evaluate the frame as a smoke check; the semicolon suppresses the echo.
df_sample_summary;

### Load ID mapping

In [26]:
# MRN <-> DMP_ID lookup built from the pathology ID-mapping table (tab-separated).
id_map_raw = obj_minio.load_obj(
    bucket_name=BUCKET,
    path_object='id_mapping/ddp_id_mapping_pathology.tsv',
    sep='\t',
)
id_map_raw = mrn_zero_pad(df=id_map_raw, col_mrn='MRN')

# One row per distinct (MRN, DMP_ID) pair, dropping pairs with no DMP_ID.
df_id_map = (
    id_map_raw[['MRN', 'DMP_ID']]
    .drop_duplicates()
    .dropna(subset=['DMP_ID'])
)



In [27]:
# Smoke check that the ID map loaded; the semicolon suppresses the echo.
df_id_map.head(1);


### Load Dx timeline data

In [28]:
# Cleaned diagnosis table (comma-separated), with MRN normalized by the project helper.
df_dx_timeline = mrn_zero_pad(
    df=obj_minio.load_obj(
        bucket_name=BUCKET,
        path_object='diagnosis/table_diagnosis_clean.csv',
        sep=',',
    ),
    col_mrn='MRN',
)

# Parse diagnosis dates so ages at diagnosis can be derived later.
dx_dates = pd.to_datetime(df_dx_timeline['Diagnosis Date'])
df_dx_timeline['Diagnosis Date'] = dx_dates


#### Tumor Registry (Add this to Primary Dx timeline)

In [29]:
# Tumor Registry rows, reduced to MRN, date and the two metastatic flags.
flag_cols = ['IS_STAGE_IV_ICDO_SOLID', 'ICDO_IS_DISTANT_SOLID']
cols_dx_treg = ['MRN', 'Diagnosis Date'] + flag_cols
from_registry = df_dx_timeline['Source'] == 'Tumor Registry'
df_dx_treg = df_dx_timeline.loc[from_registry, cols_dx_treg]

# Keep a row when either flag is exactly True. The explicit `== True`
# comparison is deliberate: missing values compare as False.
logic = (df_dx_treg[flag_cols] == True).any(axis=1)

# The flags are only needed for the filter, so drop them afterwards.
df_dx_treg_mets = df_dx_treg[logic].drop(columns=flag_cols)

In [30]:
# Smoke check on the registry-derived metastatic rows; the semicolon suppresses the echo.
df_dx_treg_mets.head(1);


#### Billing codes

In [31]:
# Billing-sourced rows whose diagnosis type is 'METS', reduced to the
# code, description and date columns.
cols_dx_billing = [
    'MRN',
    'ICD-9/10 Dx Code',
    'Diagnosis Description',
    'Diagnosis Date',
]

logic1 = df_dx_timeline['Diagnosis Type'].eq('METS')
logic2 = df_dx_timeline['Source'].eq('Billing')
logics = logic1 & logic2

df_dx_billing = df_dx_timeline.loc[logics, cols_dx_billing]


In [32]:
# Smoke check on the billing-derived metastatic rows; the semicolon suppresses the echo.
df_dx_billing.head(1);


### Load Radiology report met predictions

In [33]:
# TBD


## Merge and clean input data
- Add date of birth and sex/gender to dataframes
- Add impact IDs
- Select important columns

### Sample data

In [37]:
# Attach procedure date (via sample summary) and birth date / gender (via
# demographics) to every metastatic sample. The right join keeps all samples
# even when the summary table has no matching row.
df_samples_f = df_sample_summary.merge(right=df_samples, how='right', on=['SAMPLE_ID', 'DMP_ID'])
df_samples_f = df_samples_f.merge(right=df_demo, how='left', on='MRN')

# Age at procedure in days (procedure date minus birth date).
age_dop = (df_samples_f['DATE_OF_PROCEDURE_SURGICAL_EST'] - df_samples_f['PT_BIRTH_DTE']).dt.days
df_samples_f = df_samples_f.assign(AGE_DX=age_dop)

# Drop samples whose age could not be computed. The explicit copy makes the
# filtered result an independent frame, so the dtype conversion below is not a
# write to a slice (the pattern pandas flags with SettingWithCopyWarning).
df_samples_f = df_samples_f[df_samples_f['AGE_DX'].notnull()].copy()
df_samples_f['AGE_DX'] = df_samples_f['AGE_DX'].astype(int)

# MRN and the raw dates are not carried into the output frame.
df_samples_f = df_samples_f.drop(columns=['MRN', 'DATE_OF_PROCEDURE_SURGICAL_EST', 'PT_BIRTH_DTE'])


In [38]:
# Report the size of the merged sample frame and preview its first row.
print(df_samples_f.shape)
df_samples_f.head(1)


(25105, 8)


Unnamed: 0,SAMPLE_ID,DMP_ID,CANCER_TYPE,SAMPLE_TYPE,PRIMARY_SITE,METASTATIC_SITE,GENDER,AGE_DX
0,P-0000012-T03-IM3,P-0000012,Non-Small Cell Lung Cancer,Metastasis,Lung,Neck,FEMALE,21180


### Billing code data

In [None]:
# Add birth date / gender to each billing row, then attach DMP_ID by MRN.
# The right join keeps every billing row, matched or not.
# (Unused alternative ID map: df_sample_summary[['DMP_ID', 'MRN']].drop_duplicates())
billing_with_demo = df_dx_billing.merge(right=df_demo, how='left', on='MRN')
df_dx_billing_f = df_id_map.merge(right=billing_with_demo, how='right', on='MRN')

# Age at diagnosis in days (diagnosis date minus birth date).
age_dx = (df_dx_billing_f['Diagnosis Date'] - df_dx_billing_f['PT_BIRTH_DTE']).dt.days

# Store the age, drop the raw date/MRN columns, and keep only rows with a DMP_ID.
df_dx_billing_f = (
    df_dx_billing_f
    .assign(AGE_DX=age_dx)
    .drop(columns=['Diagnosis Date', 'PT_BIRTH_DTE', 'MRN'])
    .dropna(subset=['DMP_ID'])
)

In [None]:
# Preview the first row of the merged billing frame.
df_dx_billing_f.head(1)

### Tumor registry data

In [None]:
# Add birth date / gender to each registry row, then attach DMP_ID by MRN.
# The right join keeps every registry row, matched or not.
# (Unused alternative ID map: df_sample_summary[['DMP_ID', 'MRN']].drop_duplicates())
treg_with_demo = df_dx_treg_mets.merge(right=df_demo, how='left', on='MRN')
df_dx_treg_mets_f = df_id_map.merge(right=treg_with_demo, how='right', on='MRN')

# Age at diagnosis in days, computed before the date columns are dropped.
age_dx = (df_dx_treg_mets_f['Diagnosis Date'] - df_dx_treg_mets_f['PT_BIRTH_DTE']).dt.days

# Drop identifying/raw columns and keep only rows with a DMP_ID.
df_dx_treg_mets_f = (
    df_dx_treg_mets_f
    .drop(columns=['MRN', 'GENDER', 'Diagnosis Date', 'PT_BIRTH_DTE'])
    .dropna(subset=['DMP_ID'])
)

# `assign` aligns age_dx on the index, so only the surviving rows receive a
# value. The registry rows carry no organ site, hence the empty ORGAN_SITE.
df_dx_treg_mets_f = df_dx_treg_mets_f.assign(
    AGE_DX=age_dx,
    ORGAN_SITE='',
    SOURCE='Tumor Registry',
)


In [None]:
# Display the merged tumor-registry frame.
df_dx_treg_mets_f

## Load Mapping Objects

### Load mapping tables

In [45]:
# Load RDN mapping
obj_met_map_rdn = MetastaticSpreadMappingRND(path=const.pathname,
                                             fname_all_sites=const.fname_mapping_rdn_all_sites,
                                             fname_hematogenous=const.fname_mapping_rdn_hematogenous,
                                             fname_localext=const.fname_mapping_rdn_localext,
                                             fname_lymphatic=const.fname_mapping_rdn_lymphatic,
                                             fname_site_map=const.fname_mapping_rdn_site_map,
                                             fname_billing_map=const.fname_mapping_rdn_billing_map,
                                             fname_billing_code_dict=const.fname_mapping_rdn_to_billing_codes)


Loading mapping tables


In [46]:
# Display the base directory the mapping files are read from.
const.pathname


'/mind_data/fongc2/organ-site-mapping/mappings'

### Load mapping object

In [47]:
# Load annoations object
obj_mapping = OrganMappingAnalysisRND(obj_met_map=obj_met_map_rdn)


In [48]:
# Preview the ICD-billing site mapping table held by the RDN mapping object.
obj_met_map_rdn.df_map_icd_mapping.head()


Unnamed: 0,tissue_icd_billing,clean_site,ICD_BILLING_MAPPING
0,ADRENAL_GLAND,site1,retroperitoneum_adrenal
1,BILIARY_TRACT,site1,abdomen_biliary
2,BLADDER_OR_URINARY_TRACT,site1,pelvis_bladder
3,BONE,site1,back_bone.spine
4,BOWEL,site1,abdomen_foregut


## Apply mappings and format frames into cBioPortal timeline format
### Clinical Samples Table from cBioPortal

In [49]:
# Annotate IMPACT sample site data
df_met_sites_impact = obj_mapping.annotate_mapping_impact_met_samples(df_samples=df_samples_f, 
                                                                      col_primary_site=col_prim_site, 
                                                                      col_met_site=col_met_site, 
                                                                      label_dist_ln=True)

cols_keep = ['DMP_ID', 'AGE_DX', 'METASTATIC_SITE_BILLING_RDN', 'METASTATIC_SITE']
df_met_sites_impact = df_met_sites_impact[cols_keep]
df_met_sites_impact = df_met_sites_impact.assign(SOURCE='MSK-IMPACT')
df_met_sites_impact = df_met_sites_impact.assign(STOP_DATE='')
df_met_sites_impact = df_met_sites_impact.assign(EVENT_TYPE='Diagnosis')
df_met_sites_impact = df_met_sites_impact.assign(SUBTYPE='Metastasis')
df_met_sites_impact = df_met_sites_impact.assign(SOURCE='MSK-IMPACT')
df_met_sites_impact = df_met_sites_impact.assign(SOURCE_SPECIFIC='Pathology')

df_met_sites_impact = df_met_sites_impact.rename(columns={'DMP_ID': 'PATIENT_ID',
                                                          'METASTATIC_SITE_BILLING_RDN': 'ANATOMIC_LOCATION',
                                                          'AGE_DX': 'START_DATE',
                                                          'METASTATIC_SITE': 'DX_DESCRIPTION'
                                                         })


In [50]:
# Preview the IMPACT-derived events in timeline naming (columns not yet in `col_order`).
df_met_sites_impact.head()

Unnamed: 0,PATIENT_ID,START_DATE,ANATOMIC_LOCATION,DX_DESCRIPTION,SOURCE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE_SPECIFIC
0,P-0000012,21180,HEAD_AND_NECK,Neck,MSK-IMPACT,,Diagnosis,Metastasis,Pathology
1,P-0000015,16226,LIVER,Liver,MSK-IMPACT,,Diagnosis,Metastasis,Pathology
2,P-0000024,22281,LUNG,Lung,MSK-IMPACT,,Diagnosis,Metastasis,Pathology
3,P-0000025,27264,INTRA_ABDOMINAL,Peritoneum,MSK-IMPACT,,Diagnosis,Metastasis,Pathology
4,P-0000025,27600,LIVER,Liver,MSK-IMPACT,,Diagnosis,Metastasis,Pathology


#### Non-LN organs

In [51]:
# IMPACT events outside the lymph category, labelled 'Metastasis' and put in
# timeline column order.
# NOTE(review): the billing split further down treats 'OTHER' as well as
# 'LYMPH' as lymph-node rows; this one uses 'LYMPH' only — confirm which is intended.
impact_is_lymph = df_met_sites_impact['ANATOMIC_LOCATION'].isin(['LYMPH'])
df_met_sites_impact_vis = (
    df_met_sites_impact.loc[~impact_is_lymph]
    .assign(SUBTYPE='Metastasis')
    [col_order]
)

#### Lymph node annotations

In [53]:
# IMPACT events in the lymph category, relabelled 'Lymph Nodes' and put in
# timeline column order.
# NOTE(review): the billing split further down treats 'OTHER' as well as
# 'LYMPH' as lymph-node rows; this one uses 'LYMPH' only — confirm which is intended.
impact_ln_mask = df_met_sites_impact['ANATOMIC_LOCATION'].isin(['LYMPH'])
df_met_sites_impact_ln = (
    df_met_sites_impact.loc[impact_ln_mask]
    .assign(SUBTYPE='Lymph Nodes')
    [col_order]
)

In [55]:
# Preview the IMPACT lymph-node events.
df_met_sites_impact_ln.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,SOURCE,SOURCE_SPECIFIC,DX_DESCRIPTION,ANATOMIC_LOCATION
5,P-0000026,26146,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Pelvis,OTHER
6,P-0000030,24885,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Lymph Node,LYMPH
13,P-0000066,22617,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Parasternal Mass,OTHER
16,P-0000067,21377,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Pelvis,OTHER
17,P-0000068,28081,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Lymph Node,LYMPH
19,P-0000077,26774,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Lymph Node,LYMPH
23,P-0000085,26891,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Lymph Node,LYMPH
25,P-0000088,22599,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Lymph Node,LYMPH
27,P-0000100,20636,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Pelvis,OTHER
29,P-0000103,16858,,Diagnosis,Lymph Nodes,MSK-IMPACT,Pathology,Lymph Node,LYMPH


### ICD Billing table 

In [None]:
# Annotate diagnosis table of ICD billings with renzo's mapping of metastatic sites
df_met_sites_dx = obj_mapping.annotate_icd_billing_met_dx(df_dx_mets=df_dx_billing_f, 
                                                          col_icd_billing=col_icd_billing, 
                                                          col_sex=col_sex,
                                                          add_msk_met_anno=True)

df_met_sites_dx['Diagnosis Description'] = df_met_sites_dx['ICD-9/10 Dx Code'] + ' - ' + df_met_sites_dx['Diagnosis Description']
df_met_sites_dx = df_met_sites_dx.drop(columns=['ICD-9/10 Dx Code', 'GENDER', 'METASTATIC_SITE_RDN_MAP'])

df_met_sites_dx = df_met_sites_dx.assign(SOURCE='ICD Billing')
df_met_sites_dx = df_met_sites_dx.assign(STOP_DATE='')
df_met_sites_dx = df_met_sites_dx.assign(EVENT_TYPE='Diagnosis')
df_met_sites_dx = df_met_sites_dx.assign(SOURCE_SPECIFIC='ICD Billing')

df_met_sites_dx = df_met_sites_dx.rename(columns={'DMP_ID': 'PATIENT_ID',
                                                  'tissue_icd_billing': 'ANATOMIC_LOCATION',
                                                  'AGE_DX': 'START_DATE',
                                                  'Diagnosis Description': 'DX_DESCRIPTION'
                                                  }
                                        )
df_met_sites_dx

In [None]:
# Compare frame sizes before and after annotation to see how the mapping changed the row/column counts.
print(df_dx_billing_f.shape)
print(df_met_sites_dx.shape)


#### Non-LN organs

In [None]:
# Billing events whose site is neither 'LYMPH' nor 'OTHER', labelled
# 'Metastasis' and put in timeline column order.
billing_is_ln = df_met_sites_dx['ANATOMIC_LOCATION'].isin(['LYMPH', 'OTHER'])
df_met_sites_dx_vis = (
    df_met_sites_dx.loc[~billing_is_ln]
    .assign(SUBTYPE='Metastasis')
    [col_order]
)

In [None]:
# Preview the non-lymph-node billing events.
df_met_sites_dx_vis.head()

#### Lymph node annotations

In [None]:
# Billing events whose site is 'LYMPH' or 'OTHER', relabelled 'Lymph Nodes'
# and put in timeline column order.
billing_ln_mask = df_met_sites_dx['ANATOMIC_LOCATION'].isin(['LYMPH', 'OTHER'])
df_met_sites_dx_ln = (
    df_met_sites_dx.loc[billing_ln_mask]
    .assign(SUBTYPE='Lymph Nodes')
    [col_order]
)

In [None]:
# Disabled exploration (references `df_met_rdn_anno`, which this notebook does
# not define): looked for patients with more than one CANCER_TYPE.
# df_met_rdn_anno.head(50)
# t = df_met_rdn_anno.groupby('DMP_ID')['CANCER_TYPE'].nunique()
# df_met_rdn_anno[df_met_rdn_anno['DMP_ID'].isin(t[t > 1].index)].head(50)

# Preview the lymph-node billing events.
df_met_sites_dx_ln.head()

### Radiology report NLP predictions

## Merge dataframes

In [None]:
# Stack the IMPACT-derived and billing-derived events into one timeline.
# NOTE(review): the IMPACT frame used here still contains its 'LYMPH' rows,
# while the billing frame is the non-lymph-node subset — confirm whether
# `df_met_sites_impact_vis` was intended instead.
df_mets_f = pd.concat([df_met_sites_impact, df_met_sites_dx_vis], axis=0, sort=False)

# `concat` keeps the column order of the first frame, which is not in timeline
# order; re-select so the saved file follows the layout in `col_order`.
# Both inputs hold exactly these columns, so nothing is dropped.
df_mets_f = df_mets_f[col_order]

# Order events per patient by start date, then site, and renumber the rows.
df_mets_f = df_mets_f.sort_values(by=['PATIENT_ID', 'START_DATE', 'ANATOMIC_LOCATION']).reset_index(drop=True)

In [None]:
# Preview the first 50 rows of the combined timeline.
df_mets_f.head(50)

## Save annotations

In [None]:
# Save the combined timeline to the MinIO sandbox as a tab-separated file.
# This is the same object path read back as `df_test` earlier, so each run overwrites it.
# The local CSV export to `pathfilename_out` is disabled:
# df_mets_f.to_csv(pathfilename_out, index=False)
# NOTE(review): the reloaded file shows an 'Unnamed: 0' column, so the index
# appears to be written too — check whether save_obj can omit it.
obj_minio.save_obj(df=df_mets_f, bucket_name=BUCKET, path_object='sandbox/fongc2/dx_met_timeline.txt', sep='\t')


## Create binary matrix from mapping

In [None]:
# df_binary = obj_mapping.create_binary_met_sites(df=df_met_sites_impact, col_index='SAMPLE_ID', col_count='DMP_ID', col_met_site='METASTATIC_SITE_BILLING_RDN')

In [None]:
# NOTE(review): `df_binary` is not defined anywhere in this notebook — the
# `create_binary_met_sites` call that would build it is commented out, and it
# references columns (SAMPLE_ID, METASTATIC_SITE_BILLING_RDN) that
# `df_met_sites_impact` no longer has. This line raises NameError as written.
df_binary.head(2)

In [None]:
# Fraction of rows with each 'HAS_*' column set (column sum / row count), highest first.
# NOTE(review): depends on `df_binary`, which is never created in this notebook.
cols = list(df_binary.columns[df_binary.columns.str.contains('HAS_')])
(df_binary[cols].sum()/df_binary.shape[0]).sort_values(ascending=False)