In [1]:
import requests
import json
import pandas as pd

In [2]:
# Base URL of the GDC "files" endpoint queried throughout this notebook.
files_endpt = "https://api.gdc.cancer.gov/files"

# Names of the fields requested from the API, one per entry.
# Every entry ends with a comma (including the last one): a missing comma makes
# Python silently concatenate two adjacent string literals into a single bogus
# field name, which previously merged "progression_or_recurrence" with
# "vital_status" so that neither was requested.
# NOTE(review): the saved `df_files.head()` output in this notebook shows no
# vital_status / days_to_birth / days_to_death columns -- confirm that these
# `cases.diagnoses.*` names are valid for this endpoint.
field_names = [
    "file_name",
    #"analysis.workflow_type",
    #"experimental_strategy",
    "cases.project.primary_site",
    "cases.project.disease_type",
    "cases.diagnoses.primary_diagnosis",
    "cases.submitter_id",
    "cases.samples.portions.analytes.aliquots.submitter_id",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.tumor_grade",
    "cases.diagnoses.progression_or_recurrence",
    "cases.diagnoses.vital_status",
    "cases.diagnoses.days_to_birth",
    "cases.diagnoses.days_to_death",
    "cases.diagnoses.morphology",
    "cases.diagnoses.tissue_or_organ_of_origin",
    "cases.samples.longest_dimension",
    #"cases.project.project_id"
]

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = ','.join(field_names)

In [None]:
# miRNA filters
# Each (field, accepted values) pair below becomes one "in" clause; the clauses
# are combined with a top-level "and". Running this cell rebinds `filters`.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field, "value": accepted}}
        for field, accepted in (
            ("files.data_type", ["miRNA Expression Quantification"]),
            ("files.experimental_strategy", ["miRNA-Seq"]),
            ("files.data_format", ["TXT"]),
            (
                "cases.primary_site",
                [
                    'Brain', 'Breast', 'Kidney', 'Lung', 'Thyroid', 'Uterus',
                    'Prostate', 'Ovary', 'Lymph Nodes', 'Soft Tissue', 'Esophagus',
                    'Stomach', 'Bone Marrow',
                ],
            ),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [24]:
# Tissue project filters
# Selects gene-expression quantification files (HTSeq - FPKM workflow, TXT
# format) from the two listed project ids. Each (field, accepted values) pair
# becomes one "in" clause under a top-level "and". Running this cell rebinds
# `filters`.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field, "value": accepted}}
        for field, accepted in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - FPKM"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ['TCGA-LUSC', 'TCGA-LUAD']),
        )
    ],
}

In [4]:
# all files RNA-seq
# Same data-type / workflow / format clauses as the project-level filter, but
# restricted by program name instead of project id. Each (field, accepted
# values) pair becomes one "in" clause under a top-level "and". Running this
# cell rebinds `filters`.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field, "value": accepted}}
        for field, accepted in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - FPKM"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [25]:
# Request body for the POST issued in the next cell. Because the body is sent
# as JSON, `filters` can stay a plain dict instead of being serialised first.
# With "return_type": "manifest" the response is a download manifest
# (id / filename / md5 / size / state); leave that key out to receive the
# requested `fields` instead.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [26]:
# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
# Stop here on an HTTP error status: otherwise the error payload would be
# printed below (and saved by later cells) as if it were a valid manifest.
response.raise_for_status()

print(response.content.decode("utf-8"))

id	filename	md5	size	state
26311e43-dfa3-4b45-b337-58cbe08e484f	1435d047-8a6f-4bda-a658-07d6ba071300.FPKM.txt.gz	3f00a3b870c02b0750d620e3ee16b7a9	532743	released
6bf5b079-6d78-4ef5-b225-3f5493a751f4	73848cca-19bf-4de1-90ea-1725f617c550.FPKM.txt.gz	f1c856a35cf66e42ae9426dd164166e9	533044	released
b701206a-db47-4c5f-9825-8eebc831d1bc	ff47bedc-e4c7-4775-8e8e-ed4d2eecda83.FPKM.txt.gz	5ee0814a0e51742a1f55aa9d661db03f	534936	released
810bed03-efda-43e9-954b-e78fc995c7c8	7844b74a-2846-4f42-8874-edcc90261fef.FPKM.txt.gz	451ebb87a1c47f933d28d3b2c115fb30	558295	released
55a58234-84d1-4fd1-b314-f9301ae92f34	f9415abb-ec64-4d0f-ad7e-26f2071b2c24.FPKM.txt.gz	abc059472fcbf497f4fe536229085867	533464	released
9b52a4c6-787a-43b4-8528-6ba084d693a4	fe8a279b-cfbb-4a96-9e30-5589bddb7911.FPKM.txt.gz	34d2d4b1ba57621452a38aa0b3dc8643	521694	released
621e56b2-5669-4dbd-b6df-b0d40c994a7d	1d381b27-88c0-4ddd-87cc-31fbb24525c0.FPKM.txt.gz	9f9d4472144b08546106dd3ac1c27a57	536318	released
4ff04c8c-70f8-4c08-b4df-1136

In [27]:
# Number of characters in the response body once decoded as UTF-8.
len(str(response.content, "utf-8"))

154602

In [28]:
# Save the manifest text to disk. The body was decoded as UTF-8, so the file is
# written as UTF-8 too instead of relying on the platform's default encoding.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

## Files

In [29]:
# Same query as the manifest request but without "return_type", so the
# response carries the requested `fields` as a TSV table.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "15000"
    }
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
# Stop on an HTTP error status rather than saving the error payload as data.
response.raise_for_status()
# Write as UTF-8 explicitly: the body is decoded as UTF-8 and pandas reads the
# file back with its default UTF-8 encoding, so the platform default must not
# be used here.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

In [30]:
# Load the tab-separated "files.txt", discard its "id" column and index the
# rows by file name. Built as one chain so re-running the cell always starts
# from the file on disk.
df_files = (
    pd.read_csv("files.txt", sep='\t')
    .drop(columns="id")
    .set_index("file_name")
)
#df_files.columns=['primary_site','tcga_id','disease_type','primary_diagnosis']
#df_files=df_files.reindex(columns=['primary_site','disease_type','primary_diagnosis','case_id'], copy=False)
df_files.head()

Unnamed: 0_level_0,cases.0.diagnoses.0.morphology,cases.0.diagnoses.0.primary_diagnosis,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.diagnoses.0.tumor_grade,cases.0.diagnoses.0.tumor_stage,cases.0.project.disease_type,cases.0.project.primary_site,cases.0.samples.0.longest_dimension,cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id,cases.0.submitter_id
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1435d047-8a6f-4bda-a658-07d6ba071300.FPKM.txt.gz,8072/3,"Squamous cell carcinoma, large cell, nonkerati...","Lower lobe, lung",not reported,stage ib,Lung Squamous Cell Carcinoma,Lung,,TCGA-85-A4PA-01A-11R-A24Z-07,TCGA-85-A4PA
73848cca-19bf-4de1-90ea-1725f617c550.FPKM.txt.gz,8070/3,"Squamous cell carcinoma, NOS","Lower lobe, lung",not reported,stage ia,Lung Squamous Cell Carcinoma,Lung,1.0,TCGA-56-8309-01A-11R-2296-07,TCGA-56-8309
ff47bedc-e4c7-4775-8e8e-ed4d2eecda83.FPKM.txt.gz,8071/3,"Squamous cell carcinoma, keratinizing, NOS","Lower lobe, lung",not reported,stage iiib,Lung Squamous Cell Carcinoma,Lung,,TCGA-NC-A5HF-01A-11R-A26W-07,TCGA-NC-A5HF
7844b74a-2846-4f42-8874-edcc90261fef.FPKM.txt.gz,8140/3,"Adenocarcinoma, NOS","Upper lobe, lung",not reported,stage ia,Lung Adenocarcinoma,Lung,2.0,TCGA-44-3918-01A-01R-A278-07,TCGA-44-3918
f9415abb-ec64-4d0f-ad7e-26f2071b2c24.FPKM.txt.gz,8070/3,"Squamous cell carcinoma, NOS","Upper lobe, lung",not reported,stage ib,Lung Squamous Cell Carcinoma,Lung,,TCGA-63-A5MR-01A-31R-A27Q-07,TCGA-63-A5MR


In [16]:
# Distinct morphology codes present in the table (missing values included).
df_files.loc[:, 'cases.0.diagnoses.0.morphology'].unique()

array(['9440/3', '8130/3', '8500/3', '8260/3', '8340/3', '9061/3',
       '8140/3', '8072/3', '8070/3', '8071/3', '8480/3', '8720/3',
       '8290/3', '8441/3', '8380/3', '8520/3', '8584/1', '8586/3',
       '8581/1', '8581/3', '9070/3', '9382/3', '9400/3', '8575/3',
       '8310/3', nan, '8582/3', '8584/3', '8252/3', '9450/3', '9451/3',
       '8482/3', '8461/3', '8083/3', '8344/3', '8811/3', '8854/3',
       '8858/3', '8255/3', '8253/3', '8550/3', '8805/3', '8890/3',
       '8770/3', '8721/3', '8230/3', '8145/3', '8120/3', '8170/3',
       '9401/3', '8384/3', '8211/3', '8490/3', '8700/0', '8522/3',
       '8523/3', '8246/3', '8317/3', '8510/3', '8174/3', '8950/3',
       '8370/3', '9052/3', '9053/3', '8180/3', '8583/3', '9861/3',
       '9085/3', '8700/3', '8680/1', '8503/3', '8585/3', '8013/3',
       '8693/1', '8560/3', '8680/3', '8771/3', '8742/3', '8693/3',
       '8460/3', '9680/3', '8265/3', '9080/0', '8524/3', '8263/3',
       '8582/1', '8774/3', '8144/3', '8370/1', '8896/3', 

In [31]:
# Persist the table (indexed by file name) as comma-separated text, with a header row.
df_files.to_csv(path_or_buf="files.dat", header=True)

In [None]:
# Query the "cases" endpoint with a GET. The parameters travel in the URL
# query string here, so `filters` is serialised to a JSON string first.
params = {
    "filters": json.dumps(filters),
    "fields": "primary_site,disease_type,files.cases.demographic.vital_status,diagnoses.vital_status",
    "format": "TSV",
    "size": "10"
    }
response = requests.get("https://api.gdc.cancer.gov/cases", headers = {"Content-Type": "application/json"}, params = params)
# Stop on an HTTP error status rather than saving the error payload as data.
response.raise_for_status()
# NOTE(review): this overwrites the same "files.txt" that the files-endpoint
# cell writes and that `df_files` is loaded from -- confirm this is intended,
# or use a separate file name for the cases table.
# The `with` block closes the file on exit, so no explicit close() is needed;
# UTF-8 is set explicitly to match the decoding of the response body.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))