In [3]:
import json
from pathlib import Path

import pandas as pd
import requests

# Endpoint of the GDC API that is queried for file records.
FILES = "https://api.gdc.cancer.gov/files"

# Every (field, value) pair below becomes one "in" clause; the clauses are
# combined with a top-level "and", so a file must satisfy all of them.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field, "value": [value]}}
        for field, value in (
            ("cases.project.project_id", "TCGA-DLBC"),
            ("data_category", "Transcriptome Profiling"),
            ("data_type", "Gene Expression Quantification"),
            ("analysis.workflow_type", "STAR - Counts"),
        )
    ],
}

# Query-string parameters: the filter is sent JSON-encoded, and the requested
# fields are sent as a single comma-separated string.
params = {
    "filters": json.dumps(filters),
    "fields": ",".join(
        (
            "file_id",
            "file_name",
            "md5sum",
            "file_size",
            "cases.submitter_id",
            "cases.samples.sample_type",
            "analysis.workflow_type",
        )
    ),
    "format": "JSON",
    "size": "2000",
}

# Send the query; raise_for_status() converts an HTTP error status into an
# exception instead of letting an error body be parsed as results.
r = requests.get(FILES, params=params, timeout=60)
r.raise_for_status()
data = r.json()["data"]
hits = data["hits"]

# An empty result would otherwise only surface later as a KeyError when the
# manifest columns are selected from an empty frame.
if not hits:
    raise RuntimeError("GDC query returned no files; check the filters")

# The request caps the number of returned records via the "size" parameter.
# If the response reports more matches than were returned, the manifest would
# be silently incomplete, so fail loudly instead.
total = data.get("pagination", {}).get("total")
if total is not None and total > len(hits):
    raise RuntimeError(
        f"GDC query matched {total} files but only {len(hits)} were returned; "
        "increase 'size' or page through the results"
    )

# Flatten the nested JSON records into a table, one row per file.
df = pd.json_normalize(hits)

# Build a GDC manifest (minimum required columns): select the four file
# attributes and rename them to the manifest header names.
manifest = df[["file_id", "file_name", "md5sum", "file_size"]].copy()
manifest.columns = ["id", "filename", "md5", "size"]

# manifest_path is the path reported at the end of the cell; the file itself
# is written one directory above the working directory. The write target is
# derived from manifest_path so the two can never drift apart.
manifest_path = "manifests/tcga_dlbc_star_counts_manifest.tsv"
output_file = Path("..") / manifest_path

# Create the target directory if needed so the write does not fail on a
# fresh checkout.
output_file.parent.mkdir(parents=True, exist_ok=True)
manifest.to_csv(output_file, sep="\t", index=False)

# Notebook display: number of files in the manifest and its reported path.
len(manifest), manifest_path


(48, 'manifests/tcga_dlbc_star_counts_manifest.tsv')