In [1]:
import requests
import json
import pandas as pd

In [2]:
# GDC REST endpoint that every file query in this notebook is sent to.
files_endpt = "https://api.gdc.cancer.gov/files"

# The API takes 'fields' as one comma-separated string of field names, so the
# names are joined in a single step.
# Fields that are currently switched off: analysis.workflow_type,
# experimental_strategy, cases.project.project_id
fields = ",".join((
    "file_name",
    "cases.project.primary_site",
    "cases.project.disease_type",
    "cases.diagnoses.primary_diagnosis",
    "cases.submitter_id",
))

In [None]:
# miRNA filters.
# Every (field, accepted values) pair below becomes one "in" clause, and all
# clauses are combined with "and". This cell assigns the notebook-wide
# `filters` name, which the other filter cells assign as well.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("files.data_type", ["miRNA Expression Quantification"]),
            ("files.experimental_strategy", ["miRNA-Seq"]),
            ("files.data_format", ["TXT"]),
            (
                "cases.project.primary_site",
                [
                    "Breast",
                    "Brain",
                    "Kidney",
                    "Bronchus and lung",
                    "Thyroid gland",
                    "Corpus uteri",
                    "Prostate gland",
                    "Ovary",
                    "Stomach",
                ],
            ),
        )
    ],
}

In [18]:
# Tissue project filters.
# The mapping lists, per field, the values a file may have; each entry is
# turned into one "in" clause and the clauses are joined with "and".
# This cell assigns the notebook-wide `filters` name, which the other filter
# cells assign as well.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in {
            "files.data_type": ["Gene Expression Quantification"],
            "files.analysis.workflow_type": ["HTSeq - Counts"],
            "files.data_format": ["TXT"],
            "cases.project.project_id": ["TCGA-COAD"],
        }.items()
    ],
}

In [30]:
# all files RNA-seq
# Four "in" clauses combined with "and": data type, workflow type, data
# format and program name. This cell assigns the notebook-wide `filters`
# name, which the other filter cells assign as well.
filters = dict(
    op="and",
    content=[
        dict(op="in", content=dict(field="files.data_type", value=["Gene Expression Quantification"])),
        dict(op="in", content=dict(field="files.analysis.workflow_type", value=["HTSeq - Counts"])),
        dict(op="in", content=dict(field="files.data_format", value=["TXT"])),
        dict(op="in", content=dict(field="cases.project.program.name", value=["TCGA"])),
    ],
)

In [31]:
# The query is sent as a POST, so the filters can be handed over directly as
# a dict object.
# "return_type": "manifest" requests a manifest; remove that entry to get the
# columns named in `fields` instead.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [32]:
# The parameters are passed to 'json' rather than 'params' in this case.
# A timeout keeps the cell from blocking forever if the server stops responding.
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=120,
)

# Stop here on an HTTP error instead of printing (and later saving) an error
# body as if it were a manifest.
response.raise_for_status()

# Preview only the header and the first rows; the full body is too large to
# be useful as cell output.
print("\n".join(response.content.decode("utf-8").splitlines()[:8]))

id	filename	md5	size	state
5dd56b1e-be8e-46a2-9bd2-c7da4bb7e30d	4810fb0a-881e-40be-9705-9c41b143e83b.htseq.counts.gz	3e8b98ef4a0aa9b205c8ca5c7cf1b946	255934	released
59eddb0a-a7ff-4bfe-b62a-331438574630	d5fd50e4-2068-4cfc-9b04-3976f9fd7940.htseq.counts.gz	402d975171ce5bdc09b200d997babb04	256072	released
1ee599de-8bba-4130-a633-a6024dcfbff0	89658cba-f8bd-49a1-99a7-265f7d5195c8.htseq.counts.gz	629f6b7f006fae2dcf2b3f3360e9402f	254889	released
092c1a6f-018a-4acf-8a42-2b0b2944d9e8	1dc8075b-ed0c-471f-8947-967d23ff686b.htseq.counts.gz	f7e8cf47789c33d94d44ecf649923349	256516	released
c908223e-d41c-4d11-841b-da4dfc35c83d	4f2b6b76-3a34-4cb6-9ec3-e343171ff2c2.htseq.counts.gz	e670573cd49c9c728c9bb71f887c2ec8	249106	released
b84b9118-3a5a-41c0-bc25-0d83c7a82655	94823ae2-8fb8-4be7-b2ca-88dff791b29b.htseq.counts.gz	47cb1de12d28c9040217c73b089de320	256377	released
f0a6a65a-9ee6-4df5-9f02-d1120b1efd9e	ee6fc916-052f-4fab-974f-119ab34078d6.htseq.counts.gz	e3380e0e176e4eb7913f10e2acef14b6	257724	released


In [33]:
# Number of characters in the UTF-8-decoded response body.
len(str(response.content, encoding="utf-8"))

1541954

In [34]:
# Save the decoded response body as manifest.txt.
# The encoding is given explicitly so the file is written as UTF-8 on every
# platform, matching how the body was decoded. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

## Files

In [26]:
# Same query without "return_type", so the response carries the columns
# requested in `fields` rather than a manifest.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    # Same row limit as the manifest request, so this table is not cut off
    # at fewer files than the manifest lists for the same filters.
    "size": "15000"
    }
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=120,
)

# Stop here on an HTTP error instead of saving an error body as files.txt.
response.raise_for_status()

# Written as UTF-8 explicitly to match the decoding; the `with` block closes
# the file on exit.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

In [27]:
# Load the table saved in files.txt, index it by file name and give the
# remaining columns short labels.
#
# Labels are assigned by header name, not by position, so a different column
# order in the TSV cannot silently attach a label to the wrong data.
# NOTE(review): assumes each TSV header ends with the last dotted component
# of the requested field (e.g. "...primary_site") - confirm against files.txt.
column_labels = {
    "primary_site": "primary_site",
    "submitter_id": "tcga_id",
    "disease_type": "disease_type",
    "primary_diagnosis": "primary_diagnosis",
}

df_files = (
    pd.read_csv("files.txt", sep='\t')
    .drop("id", axis=1)
    .set_index("file_name")
    # Look up the label by the part of the header after the last dot;
    # headers with no known label are left unchanged so the check below
    # reports them.
    .rename(columns=lambda name: column_labels.get(name.rsplit(".", 1)[-1], name))
)

# Require exactly one column per label (no extras, no duplicates, none missing).
expected_columns = list(column_labels.values())
if sorted(df_files.columns) != sorted(expected_columns):
    raise ValueError(
        "files.txt columns {} do not map one-to-one onto {}".format(
            list(df_files.columns), expected_columns
        )
    )

# Fix the column order so the displayed and saved layout stays the same.
df_files = df_files[expected_columns]
df_files.head()

Unnamed: 0_level_0,primary_site,tcga_id,disease_type,primary_diagnosis
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3abbd2b5-04db-4fe0-8dd1-ea2b48caa4c1.htseq.counts.gz,Colorectal,TCGA-A6-6654,Colon Adenocarcinoma,"Adenocarcinoma, NOS"
087666cd-47ae-4f56-b947-d6aa1c25e8a7.htseq.counts.gz,Colorectal,TCGA-DM-A1D4,Colon Adenocarcinoma,"Adenocarcinoma, NOS"
13abc91e-fbfc-4c55-bf54-fbd134979ccc.htseq.counts.gz,Colorectal,TCGA-A6-5657,Colon Adenocarcinoma,"Adenocarcinoma, NOS"
d334512c-d092-459a-aab7-a3a0e281d9d4.htseq.counts.gz,Colorectal,TCGA-A6-2683,Colon Adenocarcinoma,"Adenocarcinoma, NOS"
9bf28651-4429-4fe5-9e9e-dda3a2aaa221.htseq.counts.gz,Colorectal,TCGA-A6-6140,Colon Adenocarcinoma,"Adenocarcinoma, NOS"


In [28]:
# Write the labelled table to files.dat as CSV, including the header row and
# the file_name index column.
df_files.to_csv(path_or_buf="files.dat", index=True, header=True)

In [11]:
# Show the column labels currently set on df_files.
df_files.columns

Index(['primary_site', 'tcga_id', 'disease_type', 'primary_diagnosis'], dtype='object')