In [1]:
import requests
import json
import pandas as pd

In [30]:
# GDC endpoint that serves file-level records.
files_endpt = "https://api.gdc.cancer.gov/files"

# The API takes 'fields' as one comma-separated string, so the names are
# joined right away. Every entry ends with a comma so that enabling one of
# the optional names below cannot silently merge two adjacent strings.
fields = ",".join((
    "file_name",
    # "analysis.workflow_type",
    # "experimental_strategy",
    "cases.project.primary_site",
    "cases.project.disease_type",
    # "cases.diagnoses.primary_diagnosis",
    "cases.submitter_id",
    "cases.samples.portions.analytes.aliquots.submitter_id",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.tumor_grade",
    "cases.diagnoses.progression_or_recurrence",
    # "cases.diagnoses.vital_status",
    # "cases.diagnoses.days_to_birth",
    # "cases.project.project_id",
))

In [None]:
# miRNA filters
# Each pair below becomes one "in" clause on a single field; the outer
# "and" requires a record to satisfy every clause.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("files.data_type", ["miRNA Expression Quantification"]),
            ("files.experimental_strategy", ["miRNA-Seq"]),
            ("files.data_format", ["TXT"]),
            (
                "cases.primary_site",
                [
                    "Brain", "Breast", "Kidney", "Lung", "Thyroid", "Uterus",
                    "Prostate", "Ovary", "Lymph Nodes", "Soft Tissue",
                    "Esophagus", "Stomach", "Bone Marrow",
                ],
            ),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [31]:
# Tissue project filters
# One "in" clause per (field, accepted values) pair, all combined with "and".
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - FPKM"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ["TCGA-BRCA"]),
        )
    ],
}

In [None]:
# all files RNA-seq
# One "in" clause per (field, accepted values) pair, all combined with "and".
# Running this cell rebinds `filters`, so whichever filter cell ran last
# decides what the requests further down actually query.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed}}
        for field_name, allowed in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - Counts"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [32]:
# The request is sent as a POST, so the filters can stay a plain dict.
# "return_type": "manifest" makes the response a download manifest
# (id, filename, md5, size, state); leave that key out to get the
# requested 'fields' back instead.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [33]:
# The parameters are passed to 'json' rather than 'params' in this case,
# so they travel as a JSON request body.
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=300,  # seconds; without it a stalled connection blocks the kernel indefinitely
)
# Stop here on an HTTP error so an error body is never mistaken for a manifest.
response.raise_for_status()

# Preview only the header and the first few rows; the full manifest is
# large and is written to disk in a later cell.
print(*response.content.decode("utf-8").splitlines()[:10], sep="\n")

id	filename	md5	size	state
c5bab183-ecef-499d-b9da-551bbc6dfb4d	22c16f46-950f-4389-bb16-b5cec4d8b94e.FPKM.txt.gz	a60687589674e51ceb8000ec33dfd18a	511279	released
63ac6bf9-595f-4dd5-9139-fddcf4683c0c	90d19c5c-f80b-454b-b776-99d13f01617e.FPKM.txt.gz	52602c237b87295db93a9dc78264023c	511738	released
b6e34218-c074-4076-af02-9518ac0c8a9b	c443bbe4-b053-47fa-a7a9-97457e44e2a0.FPKM.txt.gz	f6235c9d55f8bedc5cb9baa8d9e26009	509928	released
fe16c6ef-a6c1-44bb-984c-b6c249712707	7bcc02ad-a4c7-44e1-8654-10c4598f936d.FPKM.txt.gz	7af16936f4213f64c9758fc413e3cb5f	571366	released
2b306b09-4509-49c9-81de-df73294360af	26e8feed-d934-4dbc-8b04-d771b2a65915.FPKM.txt.gz	6819fb4aba17598f95439531f4caf238	531151	released
4fda5b25-c7e3-4123-bc85-464932ad4c5d	fe07bbb2-6d46-44b4-8363-1e1b8d648660.FPKM.txt.gz	0edb1d5681ed466994d5c620aff4d50c	525201	released
2e026464-05a8-41f5-a6f0-103c57beaf86	7e89a428-9132-4efa-b119-10241f0ecdcc.FPKM.txt.gz	d6324d0864c798bd6330c16c8474b6a7	513951	released
c0019c36-8416-4f77-9680-f105

In [34]:
# Number of characters in the UTF-8 decoded response body.
len(str(response.content, "utf-8"))

164997

In [35]:
# Save the manifest text to disk. The encoding is given explicitly so the
# file is UTF-8 on every platform rather than the locale default, matching
# how the response body is decoded. The with-block closes the file on exit,
# so no explicit close() is needed.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

## Files

In [36]:
# Same query as the manifest request but without "return_type", so the
# response is a TSV table of the requested 'fields'.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "15000"
    }
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=300,  # seconds; without it a stalled connection blocks the kernel indefinitely
)
# Stop here on an HTTP error instead of saving the error body as files.txt.
response.raise_for_status()

# Explicit UTF-8 keeps the file independent of the platform's default
# encoding; the with-block closes the file on exit.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

In [37]:
# Load the tab-separated table from files.txt, discard the "id" column and
# index the rows by file name. Built as one chain, so re-running the cell
# always starts again from the file on disk.
df_files = (
    pd.read_csv("files.txt", sep="\t")
    .drop("id", axis=1)
    .set_index("file_name")
)
df_files.head()

Unnamed: 0_level_0,cases.0.diagnoses.0.tumor_grade,cases.0.project.primary_site,cases.0.project.disease_type,cases.0.diagnoses.0.progression_or_recurrence,cases.0.diagnoses.0.tumor_stage,cases.0.submitter_id,cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22c16f46-950f-4389-bb16-b5cec4d8b94e.FPKM.txt.gz,not reported,Breast,Breast Invasive Carcinoma,not reported,stage iia,TCGA-A2-A0T5,TCGA-A2-A0T5-01A-21R-A084-07
90d19c5c-f80b-454b-b776-99d13f01617e.FPKM.txt.gz,not reported,Breast,Breast Invasive Carcinoma,not reported,stage ia,TCGA-A2-A0EM,TCGA-A2-A0EM-01A-11R-A034-07
c443bbe4-b053-47fa-a7a9-97457e44e2a0.FPKM.txt.gz,not reported,Breast,Breast Invasive Carcinoma,not reported,stage iib,TCGA-BH-A0BJ,TCGA-BH-A0BJ-01A-11R-A056-07
7bcc02ad-a4c7-44e1-8654-10c4598f936d.FPKM.txt.gz,not reported,Breast,Breast Invasive Carcinoma,not reported,stage ia,TCGA-A7-A0DC,TCGA-A7-A0DC-01B-04R-A22O-07
26e8feed-d934-4dbc-8b04-d771b2a65915.FPKM.txt.gz,not reported,Breast,Breast Invasive Carcinoma,not reported,stage iia,TCGA-D8-A146,TCGA-D8-A146-01A-31R-A115-07


In [38]:
# Distinct values present in the progression/recurrence column.
df_files.loc[:, "cases.0.diagnoses.0.progression_or_recurrence"].unique()

array(['not reported', nan], dtype=object)

In [29]:
# Export the table (index included) as CSV; the header row is written by default.
df_files.to_csv("files.dat")

In [None]:
# Query the cases endpoint with a GET. Query-string values must be strings,
# so the filter dict is serialised with json.dumps first.
params = {
    "filters": json.dumps(filters),
    "fields": "primary_site,disease_type,files.cases.demographic.vital_status,diagnoses.vital_status",
    "format": "TSV",
    "size": "10"
    }
response = requests.get(
    "https://api.gdc.cancer.gov/cases",
    headers={"Content-Type": "application/json"},
    params=params,
    timeout=300,  # seconds; without it a stalled connection blocks the kernel indefinitely
)
# Stop here on an HTTP error instead of saving the error body to disk.
response.raise_for_status()

# NOTE(review): this writes to "files.txt", the same path the files-endpoint
# cell writes and the DataFrame cell reads, so running this cell replaces
# that table with case records. Confirm whether a separate file was intended.
# Explicit UTF-8 keeps the file independent of the platform's default
# encoding; the with-block closes the file on exit.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))