In [1]:
import requests
import json
import pandas as pd

In [2]:
# GDC "files" endpoint that every request in this notebook is sent to.
files_endpt = "https://api.gdc.cancer.gov/files"

# The 'fields' parameter is passed as a comma-separated string of single names.
# Every entry must end with a comma: two adjacent string literals are silently
# concatenated by Python into one (invalid) field name.
fields = ",".join([
    "file_name",
    #"analysis.workflow_type",
    #"experimental_strategy",
    "cases.project.primary_site",
    "cases.project.disease_type",
    "cases.diagnoses.primary_diagnosis",
    "cases.submitter_id",
    "cases.samples.portions.analytes.aliquots.submitter_id",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.tumor_grade",
    "cases.diagnoses.progression_or_recurrence",
    "cases.diagnoses.vital_status",
    "cases.diagnoses.days_to_birth",
    "cases.diagnoses.days_to_death",
    "cases.diagnoses.morphology",
    "cases.diagnoses.tissue_or_organ_of_origin",
    "cases.samples.longest_dimension",
    #"cases.project.project_id",
])

In [3]:
# miRNA filters: each (field, accepted values) pair below becomes one "in"
# clause, and all clauses are combined under a single top-level "and".
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("files.data_type", ["miRNA Expression Quantification"]),
            ("files.experimental_strategy", ["miRNA-Seq"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ['TCGA-BRCA']),
            ("cases.project.program.name", ["TCGA"]),
        )
    ],
}

In [None]:
# Tissue project filters: each (field, accepted values) pair below becomes one
# "in" clause, and all clauses are combined under a single top-level "and".
# This assigns the same `filters` name as the miRNA cell, so whichever of the
# two cells ran last decides what the request cells download.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - FPKM"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ["TCGA-BRCA"]),
        )
    ],
}

In [4]:
# Request body for the manifest download. It is sent with a POST, so the
# filters can be handed over directly as a dict.
# Remove the "return_type" entry to receive the requested file fields
# instead of a manifest.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [5]:
# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=params)

# Stop here on an HTTP error status; otherwise the error body would be
# written to disk further down as if it were a valid manifest.
response.raise_for_status()

In [6]:
# Number of characters in the UTF-8 decoded response body (quick sanity check).
len(str(response.content, "utf-8"))

190733

In [7]:
# Save the response body as manifest.txt. The body is decoded as UTF-8, so the
# file is written as UTF-8 too instead of relying on the platform default.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

## Files

In [None]:
# Same query as the manifest request but without "return_type", so the
# response is a TSV table of the requested fields.
params = {
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "15000"
    }
response = requests.post(files_endpt, headers={"Content-Type": "application/json"}, json=params)

# Stop here on an HTTP error status rather than saving an error body as data.
response.raise_for_status()

# The body is decoded as UTF-8, so write it back out as UTF-8 instead of
# relying on the platform default encoding.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

In [None]:
# Load the downloaded TSV, discard the "id" column and index rows by file name.
df_files = (
    pd.read_csv("files.txt", sep='\t')
    .drop("id", axis=1)
    .set_index("file_name")
)
#df_files.columns=['primary_site','tcga_id','disease_type','primary_diagnosis']
#df_files=df_files.reindex(columns=['primary_site','disease_type','primary_diagnosis','case_id'], copy=False)
df_files.head()

In [None]:
# Persist the indexed file table as CSV (the header row is written by default).
# NOTE(review): the file name says FPKM, but the content depends on which
# `filters` cell was active when files.txt was downloaded — confirm before running.
df_files.to_csv(path_or_buf="files_fpkm.dat")

In [None]:
# GET variant of the files query: the filters have to be serialised to a JSON
# string because they travel in the URL query string.
# NOTE(review): "primary_site" and "samples.portions.analytes.aliquots.submitter_id"
# lack the "cases." prefix that the other entries use — confirm these are valid
# field names for the files endpoint.
params = {
    "filters": json.dumps(filters),
    "fields": ",".join([
        "primary_site",
        "cases.project.project_id",
        "file_name",
        "cases.demographic.vital_status",
        "cases.demographic.gender",
        "cases.diagnoses.age_at_diagnosis",
        "cases.diagnoses.days_to_last_follow_up",
        "cases.demographic.days_to_death",
        "cases.demographic.days_to_birth",
        "cases.submitter_id",
        "samples.portions.analytes.aliquots.submitter_id",
        "cases.diagnoses.last_known_disease_status",
        "cases.diagnoses.tumor_stage",
        "cases.exposures.years_smoked",
        "cases.exposures.cigarettes_per_day",
        "cases.samples.portions.analytes.aliquots.submitter_id",
    ]),
    "format": "TSV",
    "size": "50000"
    }
response = requests.get("https://api.gdc.cancer.gov/files", headers={"Content-Type": "application/json"}, params=params)

# Stop here on an HTTP error status rather than saving an error body as data.
response.raise_for_status()

# The body is decoded as UTF-8, so write it back out as UTF-8 instead of
# relying on the platform default encoding. This overwrites files.txt.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

# Merge

In [8]:
# Reload the comma-separated file table written to files_fpkm.dat
# and preview its first two rows.
df_messangers = pd.read_csv("files_fpkm.dat", sep=",")
df_messangers.head(n=2)

Unnamed: 0,file_name,cases.0.diagnoses.0.morphology,cases.0.diagnoses.0.primary_diagnosis,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.diagnoses.0.tumor_grade,cases.0.diagnoses.0.tumor_stage,cases.0.project.disease_type,cases.0.project.primary_site,cases.0.samples.0.longest_dimension,cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id,cases.0.submitter_id
0,44caf0b5-d05f-49fd-b8ec-c32d0003c5f4.FPKM.txt.gz,8500/3,"Infiltrating duct carcinoma, NOS","Breast, NOS",not reported,stage iia,Breast Invasive Carcinoma,Breast,,TCGA-A2-A1FZ-01A-51R-A14D-07,TCGA-A2-A1FZ
1,de704076-e915-4749-9729-e6ee06a0d359.FPKM.txt.gz,8520/3,"Lobular carcinoma, NOS","Breast, NOS",not reported,stage iiia,Breast Invasive Carcinoma,Breast,,TCGA-LQ-A4E4-01A-11R-A266-07,TCGA-LQ-A4E4


In [9]:
# Load the miRNA file table and preview its first two rows.
# NOTE(review): no cell visible in this notebook writes files_miRNA.dat —
# presumably it was saved by hand after a run with the miRNA filters; confirm.
df_miRNA = pd.read_csv("files_miRNA.dat", sep=",")
df_miRNA.head(n=2)

Unnamed: 0,file_name,cases.0.diagnoses.0.morphology,cases.0.diagnoses.0.primary_diagnosis,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.diagnoses.0.tumor_grade,cases.0.diagnoses.0.tumor_stage,cases.0.project.disease_type,cases.0.project.primary_site,cases.0.samples.0.longest_dimension,cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id,cases.0.submitter_id
0,732ee0df-c0fa-4fa3-aa77-989a11172285.mirbase21...,8480/3,Mucinous adenocarcinoma,"Breast, NOS",not reported,stage i,Breast Invasive Carcinoma,Breast,,TCGA-A7-A3IY-01A-21R-A21U-13,TCGA-A7-A3IY
1,cd88d4aa-99b6-495e-97fb-1a062aed48a2.mirbase21...,8500/3,"Infiltrating duct carcinoma, NOS","Breast, NOS",not reported,stage iv,Breast Invasive Carcinoma,Breast,,TCGA-A8-A08O-01A-21R-A057-13,TCGA-A8-A08O


In [10]:
# Keep only the first file listed for each case, and give the file-name column
# a distinct name in each table so both can sit side by side later.
df_messangers = (
    df_messangers
    .drop_duplicates(subset="cases.0.submitter_id", keep="first")
    .rename(columns={"file_name": "file_fpkm"})
)
df_miRNA = (
    df_miRNA
    .drop_duplicates(subset="cases.0.submitter_id", keep="first")
    .rename(columns={"file_name": "file_miRNA"})
)

In [11]:
# Put the miRNA and FPKM file metadata side by side, one row per case.
# Both tables are indexed by case id and transposed so that cases become
# columns, stacked on top of each other, and transposed back.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; for two frames it produces the same stacked result.
df_files = pd.concat(
    [
        df_miRNA.set_index("cases.0.submitter_id").transpose(),
        df_messangers.set_index("cases.0.submitter_id").transpose(),
    ]
).transpose()
df_files.head(2)

Unnamed: 0_level_0,file_miRNA,cases.0.diagnoses.0.morphology,cases.0.diagnoses.0.primary_diagnosis,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.diagnoses.0.tumor_grade,cases.0.diagnoses.0.tumor_stage,cases.0.project.disease_type,cases.0.project.primary_site,cases.0.samples.0.longest_dimension,cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id,file_fpkm,cases.0.diagnoses.0.morphology,cases.0.diagnoses.0.primary_diagnosis,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.diagnoses.0.tumor_grade,cases.0.diagnoses.0.tumor_stage,cases.0.project.disease_type,cases.0.project.primary_site,cases.0.samples.0.longest_dimension,cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id
cases.0.submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
TCGA-A7-A3IY,732ee0df-c0fa-4fa3-aa77-989a11172285.mirbase21...,8480/3,Mucinous adenocarcinoma,"Breast, NOS",not reported,stage i,Breast Invasive Carcinoma,Breast,,TCGA-A7-A3IY-01A-21R-A21U-13,2c9671e1-e001-4565-862a-a4845c9bad4f.FPKM.txt.gz,8480/3,Mucinous adenocarcinoma,"Breast, NOS",not reported,stage i,Breast Invasive Carcinoma,Breast,,TCGA-A7-A3IY-01A-21R-A21T-07
TCGA-A8-A08O,cd88d4aa-99b6-495e-97fb-1a062aed48a2.mirbase21...,8500/3,"Infiltrating duct carcinoma, NOS","Breast, NOS",not reported,stage iv,Breast Invasive Carcinoma,Breast,,TCGA-A8-A08O-01A-21R-A057-13,f6170d9d-53fb-478d-b305-ab3fd5c4ecc8.FPKM.txt.gz,8500/3,"Infiltrating duct carcinoma, NOS","Breast, NOS",not reported,stage iv,Breast Invasive Carcinoma,Breast,,TCGA-A8-A08O-01A-21R-A056-07


In [12]:
# Write the merged per-case table (miRNA and FPKM columns) to disk as CSV.
df_files.to_csv(path_or_buf="files_manifest.dat")

In [13]:
# Read one tab-separated miRNA quantification file and save only its
# "miRNA_ID" column (with a header row, without the index) to miRNA.txt.
# NOTE(review): the path points at one specific file under data_miRNA/; the
# recorded run of this cell failed with FileNotFoundError, so that file must
# be present before this cell is executed.
(
    pd.read_csv(
        "data_miRNA/002f73df-6c1d-4187-bab8-09908ed64a75/09a8fbb6-a236-4c64-be34-932db3fd3f06.mirbase21.mirnas.quantification.txt",
        sep="\t",
    )["miRNA_ID"]
    .to_csv("miRNA.txt", index=False, header=True)
)

FileNotFoundError: [Errno 2] No such file or directory: 'data_miRNA/002f73df-6c1d-4187-bab8-09908ed64a75/09a8fbb6-a236-4c64-be34-932db3fd3f06.mirbase21.mirnas.quantification.txt'