## Download microsatellite instability groups

We download pre-computed MSI status information from Firebrowse, as described in the supplement [of this paper](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008878).

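This information exists for 4 cancer types: COAD, READ, STAD, and UCEC. Firebrowse distributes COAD and READ together as a single combined COADREAD dataset, so there are three archives to download.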

In [1]:
import os
import pandas as pd
import urllib.request
import tarfile

import mpmp.config as cfg

In [2]:
# Firebrowse .tar.gz archives containing clinical info, keyed by cohort name.
# Every URL has the same layout and differs only in the cohort, so the
# mapping is generated from a single template.
firebrowse_url_template = (
    'http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/{cohort}/20160128/'
    'gdac.broadinstitute.org_{cohort}.Clinical_Pick_Tier1.Level_4.2016012800.0.0.tar.gz'
)
clinical_zip_files = {
    cohort: firebrowse_url_template.format(cohort=cohort)
    for cohort in ('COADREAD', 'STAD', 'UCEC')
}

# directory that receives the extracted clinical tsv files
# (created on first run, left untouched if it already exists)
msi_data_dir = os.path.join(cfg.data_dir, 'msi_data')
os.makedirs(msi_data_dir, exist_ok=True)

In [3]:
def download_and_extract_firebrowse(cancer_type):
    """Download and extract the Firebrowse clinical data for one cancer type.

    The archive listed for ``cancer_type`` in ``clinical_zip_files`` is
    downloaded into ``msi_data_dir``, the single file
    ``<cancer_type>.clin.merged.picked.txt`` is extracted from it and moved
    directly into ``msi_data_dir``, and the archive and the (now empty)
    extraction directory are removed.

    Parameters
    ----------
    cancer_type : str
        Key of ``clinical_zip_files`` (e.g. ``'COADREAD'``).

    Returns
    -------
    str
        Path of the extracted clinical tsv file inside ``msi_data_dir``.

    Raises
    ------
    KeyError
        If ``cancer_type`` is not a key of ``clinical_zip_files``, or if the
        expected clinical file is not a member of the downloaded archive.
    """

    # set filenames for target cancer type
    zip_file_url = clinical_zip_files[cancer_type]
    archive_name = os.path.split(zip_file_url)[-1]
    download_file = os.path.join(msi_data_dir, archive_name)
    download_dir = archive_name.replace('.tar.gz', '')
    print(download_file, download_dir)

    clinical_name = '{}.clin.merged.picked.txt'.format(cancer_type)
    # the archive holds one top-level directory named after the archive
    # itself; tar member names always use '/' as separator, whatever the OS
    clinical_member = '{}/{}'.format(download_dir, clinical_name)

    try:
        # retrieve compressed file from firebrowse
        urllib.request.urlretrieve(zip_file_url, download_file)
        # extract clinical data file from .tar.gz; the context manager
        # closes the archive even if the extraction fails
        with tarfile.open(download_file, 'r:gz') as tar_file:
            tar_file.extract(clinical_member, msi_data_dir)
    finally:
        # never leave a (possibly partial) archive behind, even on failure
        if os.path.exists(download_file):
            os.remove(download_file)

    # move clinical data up one dir, and remove tar dir
    clinical_untar = os.path.join(msi_data_dir, download_dir, clinical_name)
    clinical_move_to = os.path.join(msi_data_dir, clinical_name)

    # os.replace overwrites a file left by a previous run on every platform
    # (os.rename raises on Windows when the target already exists)
    os.replace(clinical_untar, clinical_move_to)
    os.rmdir(os.path.join(msi_data_dir, download_dir))

    # return downloaded tsv filename
    return clinical_move_to
   

In [4]:
# fetch the combined colon/rectal (COADREAD) clinical table and show where
# the extracted file was saved
coadread_clinical_file = download_and_extract_firebrowse(cancer_type='COADREAD')
print(coadread_clinical_file)

/home/jake/research/mpmp_2/data/msi_data/gdac.broadinstitute.org_COADREAD.Clinical_Pick_Tier1.Level_4.2016012800.0.0.tar.gz gdac.broadinstitute.org_COADREAD.Clinical_Pick_Tier1.Level_4.2016012800.0.0
/home/jake/research/mpmp_2/data/msi_data/COADREAD.clin.merged.picked.txt


In [5]:
# read the tab-separated clinical file using its first column as the index,
# then flip it so that each row is a sample and each column a clinical attribute
coadread_clinical_df = pd.read_csv(
    coadread_clinical_file,
    sep='\t',
    index_col=0,
).T
print(coadread_clinical_df.shape)
coadread_clinical_df.head()

(629, 19)


Hybridization REF,Composite Element REF,years_to_birth,vital_status,days_to_death,days_to_last_followup,tumor_tissue_site,pathologic_stage,pathology_T_stage,pathology_N_stage,pathology_M_stage,gender,date_of_initial_pathologic_diagnosis,days_to_last_known_alive,radiation_therapy,histological_type,tumor_stage,residual_tumor,number_of_lymph_nodes,ethnicity
tcga-a6-2677,value,68,1,740.0,,colon,stage iiic,t3,n2,m0,female,2009,,no,colon adenocarcinoma,,r0,5,not hispanic or latino
tcga-a6-2681,value,73,0,,1387.0,colon,stage iia,t3,n0,m0,female,2009,,no,colon adenocarcinoma,,r0,0,not hispanic or latino
tcga-a6-2682,value,70,1,424.0,,colon,stage iv,t4b,n1,m1,male,2009,,no,colon adenocarcinoma,,r0,2,not hispanic or latino
tcga-a6-4105,value,79,1,442.0,,colon,stage iia,t3,n0,m0,male,2010,,no,colon adenocarcinoma,,,0,not hispanic or latino
tcga-a6-6649,value,66,0,,735.0,colon,stage iiib,t3,n1b,m0,male,2010,,no,colon adenocarcinoma,,,2,not hispanic or latino
