In [206]:
import requests
import json
import pandas as pd

In [151]:
# GDC REST endpoint that serves file-level records.
files_endpt = "https://api.gdc.cancer.gov/files"

# Metadata fields requested for every file. The API takes them as one
# comma-separated string, so the names are joined right away.
# Fields tried earlier and currently switched off:
#   analysis.workflow_type, experimental_strategy, cases.project.project_id
# NOTE(review): the saved df_files output further down shows only four metadata
# columns, so the two cases.diagnoses.vital_status / days_to_birth requests
# appear to come back empty -- confirm against the GDC field list.
fields = ",".join((
    "file_name",
    "cases.project.primary_site",
    "cases.project.disease_type",
    "cases.diagnoses.primary_diagnosis",
    "cases.submitter_id",
    "cases.diagnoses.vital_status",
    "cases.diagnoses.days_to_birth",
))

In [178]:
# miRNA filters: TCGA-BRCA miRNA-Seq expression quantification files in TXT format.
# Every condition is an "in" clause and all of them are AND-ed together.
# Note: the other filter cells in this notebook rebind `filters` too, so the
# query cells use whichever filter cell was executed last.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("files.data_type", ["miRNA Expression Quantification"]),
            ("files.experimental_strategy", ["miRNA-Seq"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ["TCGA-BRCA"]),
        )
    ],
}

In [171]:
# Tissue project filters: HTSeq count files (gene expression quantification,
# TXT format) restricted to a fixed list of primary sites.
# Note: the other filter cells in this notebook rebind `filters` too, so the
# query cells use whichever filter cell was executed last.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - Counts"]),
            ("files.data_format", ["TXT"]),
            (
                "cases.primary_site",
                [
                    'Brain', 'Breast', 'Kidney', 'Lung', 'Thyroid', 'Uterus',
                    'Prostate', 'Ovary', 'Lymph Nodes', 'Soft Tissue',
                    'Esophagus', 'Stomach', 'Bone Marrow',
                ],
            ),
        )
    ],
}

In [152]:
# All RNA-seq files: HTSeq count files (gene expression quantification,
# TXT format) from every project of the TCGA program.
# Note: the other filter cells in this notebook rebind `filters` too, so the
# query cells use whichever filter cell was executed last.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - Counts"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [164]:
# Request body for the manifest query. Because the request is a POST, the
# filter dict is embedded as-is (no JSON string encoding needed).
# "return_type": "manifest" asks for a download manifest; remove that entry to
# receive the requested `fields` instead.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [165]:
# The parameters are passed to 'json' rather than 'params' in this case,
# i.e. they travel in the POST body instead of the query string.
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=300,  # seconds; without it a stalled connection hangs the kernel forever
)
# Stop here on an HTTP error, so an error body is never printed (or saved by
# the following cells) as if it were a manifest.
response.raise_for_status()

print(response.content.decode("utf-8"))

id	filename	md5	size	state
f0c94748-968a-4308-a95f-aaec4b5051c0	014ac73d-9fdb-43f9-8495-8dab01f24859.mirbase21.mirnas.quantification.txt	cc75dbb5b0096a256bd0d89f771363da	50130	released
2db0a3fd-2399-4bfb-b676-2b19c197f391	a6d451b4-086a-446b-9048-11ececd8ba66.mirbase21.mirnas.quantification.txt	e9845a671254e0afabedb5368d57e97b	50150	released
b9bb9e55-ac52-4966-8dfa-879cefbc2cb2	52829b22-1c4e-4874-8062-51ed7bb1b997.mirbase21.mirnas.quantification.txt	f0c4d8baec472fd40506990e0825925f	50209	released
a6996338-7dab-4816-898a-a20430141131	0ba140b1-a028-46ee-8d80-ec4795274fe8.mirbase21.mirnas.quantification.txt	9e12e8e020d417a095ecbc874b45ab34	50484	released
8753cc4b-0f5d-409a-a3b2-acf39fc7358b	130cd04a-2901-4607-a539-0e1fe966e367.mirbase21.mirnas.quantification.txt	25b64529dfacdcddc1b83dc9963a757e	50291	released
6ab1f388-28e9-4b2c-bbdf-8383ef18b83c	b2279a60-8da0-4184-924a-ed810d00a751.mirbase21.mirnas.quantification.txt	efa132b358f4c4f7ab01341513f93714	50379	released
7843d077-04e2-41d2-bd64-7

In [166]:
# Size of the response body in characters, after decoding it as UTF-8.
len(str(response.content, "utf-8"))

190733

In [167]:
# Save the manifest to disk byte-for-byte as the API returned it.
# Binary mode avoids depending on the platform's default text encoding and
# newline translation; the `with` block closes the file, so no explicit
# close() is needed.
with open("manifest.txt", "wb") as manifest:
    manifest.write(response.content)

## Files

In [179]:
# Query the files endpoint for the metadata columns listed in `fields`
# (no "return_type", so the response is a TSV table rather than a manifest).
# NOTE: `filters` is whichever filter cell was executed last.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "10000"
    }
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=300,  # seconds; without it a stalled connection hangs the kernel forever
)
# Abort on an HTTP error instead of saving the error body as the metadata table.
response.raise_for_status()

# Store the table byte-for-byte; binary mode avoids platform-dependent text
# encoding / newline translation, and `with` closes the file on exit.
with open("files.txt", "wb") as files:
    files.write(response.content)

In [180]:
# Load the file-level metadata saved in "files.txt" and build a table indexed
# by file name with four short column labels.
files_raw = pd.read_csv("files.txt", sep='\t')

# Short label for each metadata column, keyed by the field name that the
# column's TSV header ends with. Labelling by header rather than by position
# keeps the data under the right label whatever column order the API returns,
# and keeps working if extra columns are present.
column_labels = {
    "primary_site": "primary_site",
    "submitter_id": "tcga_id",
    "disease_type": "disease_type",
    "primary_diagnosis": "primary_diagnosis",
}
header_to_label = {}
for field_suffix, label in column_labels.items():
    matching = [name for name in files_raw.columns if name.endswith(field_suffix)]
    if not matching:
        raise KeyError(
            "files.txt has no column ending in {!r}; columns are {}".format(
                field_suffix, list(files_raw.columns)
            )
        )
    # If several headers match (presumably more than one entry of a nested
    # record -- verify against the API output), keep the first one.
    header_to_label[matching[0]] = label

# Selecting only the labelled columns also drops the "id" column.
df_files = (
    files_raw
    .set_index("file_name")
    .loc[:, list(header_to_label)]
    .rename(columns=header_to_label)
)
df_files.head()

Unnamed: 0_level_0,primary_site,tcga_id,disease_type,primary_diagnosis
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
014ac73d-9fdb-43f9-8495-8dab01f24859.mirbase21.mirnas.quantification.txt,Breast,TCGA-A7-A26F,Breast Invasive Carcinoma,"Metaplastic carcinoma, NOS"
a6d451b4-086a-446b-9048-11ececd8ba66.mirbase21.mirnas.quantification.txt,Breast,TCGA-E2-A158,Breast Invasive Carcinoma,"Infiltrating duct carcinoma, NOS"
52829b22-1c4e-4874-8062-51ed7bb1b997.mirbase21.mirnas.quantification.txt,Breast,TCGA-E9-A1R2,Breast Invasive Carcinoma,"Infiltrating duct carcinoma, NOS"
0ba140b1-a028-46ee-8d80-ec4795274fe8.mirbase21.mirnas.quantification.txt,Breast,TCGA-BH-A201,Breast Invasive Carcinoma,"Infiltrating duct carcinoma, NOS"
130cd04a-2901-4607-a539-0e1fe966e367.mirbase21.mirnas.quantification.txt,Breast,TCGA-D8-A1JC,Breast Invasive Carcinoma,"Infiltrating duct carcinoma, NOS"


In [181]:
# Write the labelled metadata table (index included) as CSV with a header row.
df_files.to_csv(path_or_buf="files.dat", header=True)

In [None]:
# Display the column labels currently set on df_files.
df_files.columns

In [149]:
# Query the cases endpoint for a handful of case-level fields.
# This is a GET, so the filter dict has to be serialised to a JSON string and
# everything is sent as URL query parameters.
# NOTE(review): the saved output of the next cell shows only primary_site and
# disease_type, so the two vital_status fields appear to return nothing --
# confirm the field names against the GDC field list.
params = {
    "filters": json.dumps(filters),
    "fields": "primary_site,disease_type,files.cases.demographic.vital_status,diagnoses.vital_status",
    "format": "TSV",
    "size": "10"
    }
response = requests.get(
    "https://api.gdc.cancer.gov/cases",
    headers={"Content-Type": "application/json"},
    params=params,
    timeout=300,  # seconds; without it a stalled connection hangs the kernel forever
)
# Abort on an HTTP error instead of saving / parsing the error body.
response.raise_for_status()

# Case-level data goes to its own file: writing it to "files.txt" would
# overwrite the file-level table that the df_files cell reads back.
# Binary mode stores the bytes exactly as received; `with` closes the file.
with open("cases.txt", "wb") as files:
    files.write(response.content)

In [150]:
# Turn the TSV text of the cases response into a DataFrame indexed by case id.
# Line endings are normalised first so both "\r\n" and "\n" responses parse,
# and blank lines (such as the one after the final newline) are skipped, so
# no dropna() is needed to discard an empty trailing row.
arr = [
    row.split("\t")
    for row in response.text.replace("\r\n", "\n").split("\n")
    if row
]
# First row is the header, the rest are the records.
pd.DataFrame(data=arr[1:], columns=arr[0]).set_index('id')

Unnamed: 0_level_0,primary_site,disease_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
fdf83fdf-dfbb-4306-9a1b-b4487d18b402,Ovary,"Cystic, Mucinous and Serous Neoplasms"
71faa2c1-0d5b-4dcc-bdf9-f2405f29907c,Ovary,"Cystic, Mucinous and Serous Neoplasms"
07fbdb3c-8337-4319-8255-f2363f8a031e,Ovary,"Cystic, Mucinous and Serous Neoplasms"
e978a457-92ff-4eab-b759-ec6e74d973e8,Ovary,"Cystic, Mucinous and Serous Neoplasms"
46395c3d-5e51-477f-946e-e58b87cc7baa,Ovary,"Cystic, Mucinous and Serous Neoplasms"
13f5814c-1f99-4ffa-84a4-3bbd8979faae,Ovary,"Cystic, Mucinous and Serous Neoplasms"
9bbc01b4-056c-4ddc-aca4-ff20718646d0,Ovary,"Cystic, Mucinous and Serous Neoplasms"
c183d3fb-2eee-44f8-890e-b9bf907141e6,Ovary,"Cystic, Mucinous and Serous Neoplasms"
1d192835-524e-429d-bf74-3c4727acb446,Ovary,"Cystic, Mucinous and Serous Neoplasms"
56a30462-2819-4c18-95be-8e73880a4921,Ovary,"Cystic, Mucinous and Serous Neoplasms"
