In [1]:
import requests
import json
import sys
import pandas as pd
from io import StringIO

In [2]:
# Cohort selection used by the GDC filter: primary site and sample type.
primary_site, sample_type = "breast", "primary tumor"

In [3]:
# Comma-separated field names sent as the `fields` query parameter.
fields = ",".join(
    (
        "cases.case_id",
        "annotations.entity_id",
        "file_name",
    )
)

In [4]:
# Endpoint URLs on api.gdc.cancer.gov. `files_endpt` is the one queried for
# file metadata; `manifest_endpt` is only defined in the cells visible here.
manifest_endpt = "https://api.gdc.cancer.gov/manifest"
files_endpt = "https://api.gdc.cancer.gov/files"

In [5]:
# Filter: every clause below must match ("and"), and each clause is an "in"
# test of one field against a list of accepted values.
# Clause order matters: later cells address clauses by position
# (index 2 = workflow type, index 3 = data type).
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("cases.project.primary_site", [primary_site]),
            ("cases.samples.sample_type", [sample_type]),
            ("files.analysis.workflow_type", ["ASCAT2"]),
            ("files.data_type", ["Gene Level Copy Number"]),
        )
    ],
}


In [6]:
# Query-string parameters for the files endpoint: the filter serialized as
# JSON, the requested fields, TSV output, and a cap of 2000 returned records.
params = dict(
    filters=json.dumps(filters),
    fields=fields,
    format="tsv",
    size="2000",
)

In [7]:
# Getting ASCAT2 data
# The timeout (seconds) keeps the cell from hanging forever on a stalled
# connection. raise_for_status() stops here on an HTTP error instead of letting
# an error body be parsed as TSV by the next cell.
response = requests.get(files_endpt, params=params, timeout=60)
response.raise_for_status()
data = response.content.decode("utf-8")

In [8]:
# Parse the tab-separated response text into a DataFrame.
df_ascat = pd.read_csv(
    StringIO(data),
    delimiter="\t",
)

In [9]:
# Preview the first five rows of the ASCAT2 file listing.
df_ascat.head(n=5)

Unnamed: 0,annotations.0.entity_id,annotations.1.entity_id,cases.0.case_id,file_name,id
0,,,0f64edec-0f1f-4025-8a53-75f9534f7828,TCGA-BRCA.2db50b93-b544-42c8-9c6b-39bcfe870324...,9db19d5d-e73f-4b28-8770-721d54c3d1a1
1,,,8db69f1d-72e2-40ba-ad8a-cf228499840d,TCGA-BRCA.03fd168b-3ada-472c-89a9-33dae1531fc3...,b2308547-14bd-4ea2-9572-27631c4f4b0d
2,,,da70cf7e-0e61-4c72-b4c5-c408569d11b8,TCGA-BRCA.23d1f5b1-6b5f-45f3-b30a-b6b04692ba68...,d662dc08-1ac2-4b2e-bc0a-3268d0f6179e
3,,,4b0d295c-e185-4b52-9752-178e5bc1d47d,TCGA-BRCA.0f68f8f1-514a-41b6-bf5a-87792ef5a50f...,6e458ea0-7e2b-43ef-8108-80369f9d05b6
4,,,2c86c3ea-d926-4d39-a5ae-39ece4774287,TCGA-BRCA.6c71c171-be2a-4eac-bae9-841c1192ab61...,7f6dd6a3-9798-4507-8fba-f3a84002a736


In [23]:
# (rows, columns) of the ASCAT2 file listing.
df_ascat.shape

(1077, 5)

In [16]:
# Rows whose first annotation entity id is populated.
df_ascat[df_ascat["annotations.0.entity_id"].notna()]

Unnamed: 0,annotations.0.entity_id,annotations.1.entity_id,cases.0.case_id,file_name,id
52,a9b81daa-0f0b-450a-b02b-b936bddb95e2,,39de7761-e762-4811-b95c-8216b79ae06b,TCGA-BRCA.742b1f3f-a199-4dac-8d56-82db36855633...,9770b998-8816-4d41-8dce-b32f00d044a9
56,6255148d-b5ba-4987-99bb-7d3533409e73,,955d4263-61f7-42e8-8a6e-772a0d6c209d,TCGA-BRCA.e52f1f1f-dfed-49b5-a7f2-f81e0af3c810...,c585518c-6193-40f1-9cfa-f63f41d19373
57,c1cfdf6c-d30b-49c8-9875-d39d65e925b5,,89128dba-403f-4a96-bb3b-23ed0d5e2147,TCGA-BRCA.9ca918d1-dde6-4812-b6b8-05a6fb2332c3...,a4653b72-9c2c-4a90-afdc-d8953a0d53fc
60,0cc10a1e-a90a-4c55-87a2-a74d47474a4a,,1502c7d7-1535-4e56-9f34-30623acd50d5,TCGA-BRCA.89a93ef7-73a7-451f-9c83-251b827f02c6...,d5f1da45-d7df-41ae-941e-00611dccbbc9
92,14b95463-2108-4921-afc2-e29eef52b18f,14b95463-2108-4921-afc2-e29eef52b18f,14b95463-2108-4921-afc2-e29eef52b18f,TCGA-BRCA.0b6fed27-b3fc-4078-a9d3-e766d718c19b...,41c1a603-6565-4b56-99f1-fb8df39c7bcc
...,...,...,...,...,...
1022,5f903bcb-f85c-498c-9ca1-08f3962aded2,,9e3de467-fb0d-4021-89f8-b34968a7c7bc,TCGA-BRCA.101abe3f-d317-4f8b-9b20-042da3d22457...,1d19d9c1-345c-4015-939f-630466fc1eef
1024,3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9,3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9,3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9,TCGA-BRCA.5b2d8ac0-cdb6-486f-bbad-3555c216ab0a...,d134e790-82b5-4e78-817f-46a0e753ae80
1031,02f5ae33-a563-4ecb-9e33-dfa500a44931,,02f5ae33-a563-4ecb-9e33-dfa500a44931,TCGA-BRCA.66c6efd3-406c-4b96-ac62-b570d161c825...,04354a53-dc96-4b45-b841-d2f370d90cfe
1039,44e34ec8-b65b-451c-aea9-89a6537bc689,,398fb71b-ca83-44e7-bf0d-b1ca464b0283,TCGA-BRCA.98c2fa71-63aa-4bfe-b49e-41aa4abdb391...,cecbd337-ad19-46b0-90fa-720a04b8c4fa


In [13]:
# Boolean mask: True where the first annotation entity id is missing.
df_ascat["annotations.0.entity_id"].isna()

0        True
1        True
2        True
3        True
4        True
        ...  
1072    False
1073     True
1074     True
1075     True
1076     True
Name: annotations.0.entity_id, Length: 1077, dtype: bool

In [21]:
# Shape of the rows whose case id repeats an earlier row's case id.
df_ascat[df_ascat["cases.0.case_id"].duplicated()].shape

(10, 5)

In [22]:
# Re-point the workflow-type and data-type clauses of the existing filter at
# the HTSeq count / gene expression quantification files.
# The "in" operator takes a list of values, so each value is wrapped in a
# list, matching every other clause in `filters`.
filters["content"][2]["content"]["value"] = ["HTSeq - Counts"]
filters["content"][3]["content"]["value"] = ["Gene Expression Quantification"]

In [24]:
# Rebuild the query parameters so they serialize `filters` as it stands now,
# then fetch the matching file records as TSV.
params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "tsv",
    "size": "2000"
    }

# The timeout (seconds) keeps the cell from hanging forever on a stalled
# connection. raise_for_status() stops here on an HTTP error instead of letting
# an error body be parsed as TSV by the next cell.
response = requests.get(files_endpt, params=params, timeout=60)
response.raise_for_status()
data = response.content.decode("utf-8")

In [25]:
# Parse the tab-separated response text into a DataFrame.
df_rna = pd.read_csv(
    StringIO(data),
    delimiter="\t",
)

In [26]:
# Preview the first five rows of the HTSeq counts file listing.
df_rna.head(n=5)

Unnamed: 0,annotations.0.entity_id,annotations.1.entity_id,cases.0.case_id,file_name,id
0,,,65077db7-dd59-4362-9d28-d5a9154e9be0,a8a58442-78f5-4876-b25e-c04339eb6f26.htseq.cou...,dce06602-a4bc-4103-915a-c9475f157006
1,,,a8f5c479-8685-4e2d-bb60-63f1cc651083,3c631c4d-ec26-4f29-abcb-bed2221f3da5.htseq.cou...,5395a29c-2349-44c1-bd68-b3989d4799b2
2,,,5580b21a-2cdb-4777-ad79-6e06654144f5,1c903a79-fbee-47a4-8847-79d9242a2fcf.htseq.cou...,aad765b7-ce4e-4c72-8518-3aff0dced551
3,,,3a711140-1b89-4611-9992-3861e3d0e01a,ca5d0721-99cf-4b7c-b708-6acea2ae86bd.htseq.cou...,0cbfcb99-93ca-46bb-ab4d-3dcccee62cd0
4,,,cdd8c046-fd97-482a-a11f-2217b321c4d1,47af78ae-c41a-401f-a88b-d7e35831f816.htseq.cou...,2edb7e13-2356-4654-a324-6c032ec2acd4


In [27]:
# (rows, columns) of the HTSeq counts file listing.
df_rna.shape

(1102, 5)

In [29]:
# Shape of the rows whose first annotation entity id is populated.
df_rna[df_rna["annotations.0.entity_id"].notna()].shape

(32, 5)

In [31]:
# Shape of the rows whose case id repeats an earlier row's case id.
df_rna[df_rna["cases.0.case_id"].duplicated()].shape

(11, 5)

In [36]:
# Shape after keeping only the first row seen for each case id.
df_rna.drop_duplicates(subset="cases.0.case_id").shape

(1091, 5)

In [34]:
# Distinct case ids, in order of first appearance.
pd.unique(df_rna["cases.0.case_id"])

array(['65077db7-dd59-4362-9d28-d5a9154e9be0',
       'a8f5c479-8685-4e2d-bb60-63f1cc651083',
       '5580b21a-2cdb-4777-ad79-6e06654144f5', ...,
       '994ca1f5-ad10-44ec-aa21-71fc2940653b',
       '955d4263-61f7-42e8-8a6e-772a0d6c209d',
       'f9eb88f7-9293-46bf-ace4-a746e4ff80dc'], dtype=object)