In [14]:
import requests
import json
import pandas as pd

In [15]:
# GDC REST endpoint used for all file-level queries in this notebook.
files_endpt = "https://api.gdc.cancer.gov/files"

# The API expects 'fields' as one comma-separated string, so the individual
# field names are joined right away. Commented-out entries are optional
# fields that are currently not requested.
fields = ",".join(
    (
        "file_name",
        # "analysis.workflow_type",
        # "experimental_strategy",
        "cases.project.primary_site",
        "cases.project.disease_type",
        # "cases.project.project_id",
    )
)

In [3]:
# miRNA filters: every clause is an "in" test on one field, and all clauses
# are combined with "and". The clauses are generated from (field, values)
# pairs so the nested boilerplate is written only once.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("files.data_type", ["miRNA Expression Quantification"]),
            ("files.experimental_strategy", ["miRNA-Seq"]),
            ("files.data_format", ["TXT"]),
            (
                "cases.project.primary_site",
                [
                    "Breast",
                    "Brain",
                    "Kidney",
                    "Bronchus and lung",
                    "Thyroid gland",
                    "Corpus uteri",
                    "Prostate gland",
                    "Ovary",
                    "Stomach",
                ],
            ),
        )
    ],
}

In [None]:
# Colon filters: gene-expression HTSeq count files (TXT) belonging to the
# TCGA-COAD project. All clauses are "in" tests joined with "and"; they are
# generated from (field, values) pairs.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - Counts"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ["TCGA-COAD"]),
        )
    ],
}

In [177]:
# RNA-seq filters: gene-expression HTSeq count files (TXT) from the TCGA
# program, restricted to the primary site listed below. Swap the
# "cases.primary_site" values to query other sites. All clauses are "in"
# tests joined with "and"; they are generated from (field, values) pairs.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - Counts"]),
            ("files.data_format", ["TXT"]),
            ("cases.primary_site", ["Hematopoietic and reticuloendothelial systems"]),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [None]:
# Scratch cell: a bare tuple of primary-site names. It assigns nothing and
# does not affect any other cell; the same names also appear in the
# "cases.project.primary_site" list of the miRNA filter cell.
# NOTE(review): presumably kept as a copy/paste aid for the RNA-seq filter's
# "cases.primary_site" value - confirm, otherwise this cell can be removed.
'Bronchus and lung', "Thyroid gland", "Corpus uteri", "Prostate gland", "Ovary", "Stomach"

In [178]:
# Request body for the POST call. Because the request is a POST, the filter
# can be handed over as a plain dict (no manual JSON encoding needed).
# With return_type="manifest" the API answers with a download manifest;
# leave that key out to get the requested 'fields' back instead.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="100",
)

In [179]:
# The parameters are passed to 'json' rather than 'params' in this case,
# because the request is a POST with a JSON body.
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    # Without a timeout requests waits indefinitely on a stalled connection.
    timeout=120,
)
# Stop here on a 4xx/5xx answer, so that an error body is never mistaken for
# a manifest by the cells below (which write the response to disk).
response.raise_for_status()

print(response.content.decode("utf-8"))

id	filename	md5	size	state
a2dc4c89-9ea1-4faa-b51e-914ef373f32e	fe4e1221-8143-485e-8010-87787d40fa95.htseq.counts.gz	ffa9fe4aca674478a590402f1471b24b	259750	released
b5b95505-4b21-4db6-bf4d-1e189d0971b8	4619b035-e95d-4267-9d74-e58bdbaf5d43.htseq.counts.gz	b994c18406e8a896e04d73f952fd95c3	256538	released
0ecdf0e9-6a1f-42da-a42f-51bc69cc814a	6fc4ab63-6afc-4de5-a774-2ae0badb4f55.htseq.counts.gz	981f2fc401a79aea6b46f9e855185c6a	259259	released
58cb229f-403f-4730-a07c-2a72837d3b9c	93905cf5-5bfa-476a-9684-2b86c0062bf1.htseq.counts.gz	079b73164d7c93557a4f14633737bb1b	256717	released
f7abe018-46a4-4588-957e-36c31225182f	d9e24fe8-8d7e-4481-a14e-d2092a6739a4.htseq.counts.gz	61e3bb9e0ab954f33c76371013e491e6	258402	released
84e5923d-510e-4eb6-8acd-d613e92f585a	34fa50c4-df8f-4592-a118-5dcf94909d24.htseq.counts.gz	d7e65cb02c62b0a3928f7d1924b76c0b	257899	released
3b7ced7a-aff1-4c52-9b18-8b37a57d852f	08287c2e-6b09-4902-baf1-c5062eb1da6c.htseq.counts.gz	1e9329fe198f583fccfe9a9dd91aff95	256996	released


In [180]:
# Number of characters in the UTF-8 decoded response body.
len(str(response.content, "utf-8"))

13927

In [181]:
# Append the decoded response to manifest.txt.
# - The file is opened in append mode, so every run of this cell adds another
#   copy of the response (header line included) to the existing file.
# - The encoding is given explicitly so the UTF-8 decoded text is written back
#   as UTF-8 regardless of the platform default encoding.
# - The 'with' block closes the file; no explicit close() is needed.
with open("manifest.txt", "a", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

## Files

In [182]:
# Same query as the manifest request, but without "return_type", so the API
# returns the requested 'fields' as a TSV table.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "100"
    }
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    # Without a timeout requests waits indefinitely on a stalled connection.
    timeout=120,
)
# Do not append an HTTP error body to files.txt: fail on 4xx/5xx instead.
response.raise_for_status()

# Append mode is kept: every run adds the rows of the current query to
# files.txt, and each appended chunk starts with its own TSV header line.
# The encoding is explicit so the UTF-8 decoded text is written as UTF-8 on
# every platform; the 'with' block closes the file on exit.
with open("files.txt", "a", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

In [183]:
# Load the accumulated TSV with its first column as the index, discard the
# "id" column, and give the two remaining columns short names (assigned by
# position).
df_files = pd.read_csv("files.txt", sep="\t", index_col=0).drop(columns=["id"])
df_files.columns = ["disease_type", "primary_site"]
df_files.head()

Unnamed: 0_level_0,disease_type,primary_site
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1ede5c59-96ae-486b-ae1f-5f7c7909cf6a.htseq.counts.gz,Glioblastoma Multiforme,Brain
026527d4-007c-4c4e-9bd5-855c44bbe7b0.htseq.counts.gz,Glioblastoma Multiforme,Brain
c8460566-8199-4a4d-93fd-67692611992d.htseq.counts.gz,Glioblastoma Multiforme,Brain
2afdb646-75ca-4bc9-9c12-30a27f994ecd.htseq.counts.gz,Glioblastoma Multiforme,Brain
bcbb79d8-1d4a-4fbb-b16c-4df86839773e.htseq.counts.gz,Glioblastoma Multiforme,Brain


In [187]:
# Drop the rows that are really repeated TSV header lines (files.txt is
# written in append mode, so each appended chunk carries its own header).
# Indexing with a boolean DataFrame (df[df != value]) only masks the matching
# cells to NaN and keeps the row, so a boolean row mask on the column is used.
is_header_row = df_files["primary_site"] == "cases.0.project.primary_site"
df_files[~is_header_row].to_csv("files.dat")

In [None]:
# Write the full table, index and header row included (the to_csv defaults).
df_files.to_csv("files_new.dat")