In [1]:
import logging

import gcsfs
import pyarrow as pa
import pyarrow.csv
import pyarrow.dataset as ds
from google.cloud import bigquery  # , storage
from pyarrow.fs import FSSpecHandler, PyFileSystem

In [2]:
# Turn on DEBUG output for the GCS and BigQuery client libraries and for this
# notebook's own logger.
logging.getLogger("gcsfs").setLevel(logging.DEBUG)
logging.getLogger("google.cloud.bigquery").setLevel(logging.DEBUG)
logging.getLogger(__name__).setLevel(logging.DEBUG)

# Re-running this cell must not stack another StreamHandler on the root logger
# (that would print every log record once per run), so the handler is named and
# any handler left behind by a previous run of this cell is removed first.
_NOTEBOOK_HANDLER_NAME = "notebook-stream-handler"
root_logger = logging.getLogger()
for stale_handler in [h for h in root_logger.handlers if h.get_name() == _NOTEBOOK_HANDLER_NAME]:
    root_logger.removeHandler(stale_handler)

handler = logging.StreamHandler()
handler.set_name(_NOTEBOOK_HANDLER_NAME)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
root_logger.addHandler(handler)

In [3]:
# BigQuery client created with no explicit arguments
# (presumably project and credentials come from the environment — confirm).
bq_client = bigquery.Client()

In [4]:
# Metadata for TCGA-SKCM "Metastatic" RNA-Seq TXT files in the hg19 metadata
# table, restricted to file names ending in "rsem.genes.results".
query_hg19 = """
SELECT sample_barcode, data_category, file_type, data_format, file_name_key, file_gdc_id
FROM `isb-cgc-bq.TCGA.per_sample_file_metadata_hg19_gdc_current`
where project_short_name = "TCGA-SKCM"
    and experimental_strategy = "RNA-Seq"
    and sample_type_name = "Metastatic"
    and data_format = "TXT"
    and file_name_key like "%rsem.genes.results"
order by sample_barcode
;
"""

# Run the query and pull the result into a pandas DataFrame (tqdm progress bar).
file_metadata_hg19 = (
    bq_client
    .query(query_hg19)
    .to_dataframe(progress_bar_type="tqdm")
)

2022-04-13 01:45:51,610 - google.cloud.bigquery.opentelemetry_tracing - DEBUG - This service is instrumented using OpenTelemetry. OpenTelemetry or one of its components could not be imported; please add compatible versions of opentelemetry-api and opentelemetry-instrumentation packages in order to get BigQuery Tracing data.
Query complete after 0.29s: 100%|██████████| 1/1 [00:00<00:00,  3.53query/s]
Downloading:   0%|          | 0/368 [00:00<?, ?rows/s]2022-04-13 01:45:53,007 - google.cloud.bigquery._pandas_helpers - DEBUG - Started reading table 'keen-dispatch-316219._37d4cdf3349bb7a5059aac014cff35ac3e47f49a.anonfffb76591271d437f6c19699caa9bc79a0337745' with BQ Storage API session 'projects/keen-dispatch-316219/locations/us/sessions/CAISDHRmd0dvRHA0VExpRxoCamQaAmly'.
Downloading: 100%|██████████| 368/368 [00:01<00:00, 280.76rows/s]


In [6]:
# Show the hg19 file-metadata frame returned by the query.
file_metadata_hg19

Unnamed: 0,sample_barcode,data_category,file_type,data_format,file_name_key,file_gdc_id
0,TCGA-3N-A9WB-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/0b8b258e-1671-4f8...,0b8b258e-1671-4f86-82e7-59b12ad40d9c
1,TCGA-3N-A9WC-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/c8ee8367-c529-4dd...,c8ee8367-c529-4dd6-98b4-fde57991134b
2,TCGA-3N-A9WD-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/78354f8d-5ce8-461...,78354f8d-5ce8-4617-bba4-79614f232e97
3,TCGA-BF-AAP0-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/55b07297-ac13-428...,55b07297-ac13-428d-9aa9-5349f6d3b444
4,TCGA-D3-A1Q1-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/bc296bbf-3bd5-4cc...,bc296bbf-3bd5-4ccf-a277-aea5a6454fbd
...,...,...,...,...,...,...
363,TCGA-YG-AA3O-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/87babc42-ce66-4a3...,87babc42-ce66-4a31-9195-ca824bc6f715
364,TCGA-YG-AA3P-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/ac890ffb-a4ca-460...,ac890ffb-a4ca-4606-a1d3-eb2c2531317f
365,TCGA-Z2-A8RT-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/12c644b3-6040-490...,12c644b3-6040-490d-9129-e927fca317b5
366,TCGA-Z2-AA3S-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/25469262-4e2b-439...,25469262-4e2b-4399-bde8-d9d17f3f6a02


In [7]:
# Arrow schema for the expression dataset: a string file id, a string gene id,
# and two float32 measurement columns.
schema = pa.schema(
    [
        # one field per line (keeps the formatter from collapsing the list)
        pa.field("file_gdc_id", pa.string()),
        pa.field("gene_id", pa.string()),
        pa.field("raw_count", pa.float32()),
        pa.field("scaled_estimate", pa.float32()),
    ]
)

In [8]:
# fsspec-style handle to Google Cloud Storage, created with default settings.
fs = gcsfs.GCSFileSystem()

2022-04-13 01:46:06,307 - gcsfs.credentials - DEBUG - Connected with method google_default


In [9]:
# Short alias for the hg19 metadata frame (same object, not a copy).
df = file_metadata_hg19

In [10]:
# Plain Python list of the `file_name_key` values, in frame order.
paths = df["file_name_key"].tolist()

In [11]:
# One `file_gdc_id == <id>` dataset expression per metadata row, in row order.
pyarrow_partition_expressions = [
    ds.field("file_gdc_id") == gdc_id for gdc_id in df["file_gdc_id"]
]

In [12]:
# Build one dataset over the explicit list of file paths: each file is parsed
# as tab-delimited text, accessed through the gcsfs filesystem wrapped for
# pyarrow, and paired with its partition expression from
# `pyarrow_partition_expressions`.
dataset_from_paths = ds.FileSystemDataset.from_paths(
    paths,
    schema=schema,
    format=ds.CsvFileFormat(
        parse_options=pa.csv.ParseOptions(delimiter="\t"),
    ),
    filesystem=PyFileSystem(FSSpecHandler(fs)),
    partitions=pyarrow_partition_expressions,
)

In [13]:
%%time
# Read the whole dataset into one Arrow table; %%time reports how long it takes.
table = dataset_from_paths.to_table()

2022-04-13 01:46:18,875 - gcsfs - DEBUG - GET: b/{}/o/{}, ('gdc-tcga-phs000178-open', '0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results'), None
2022-04-13 01:46:18,878 - gcsfs.credentials - DEBUG - GCS refresh
2022-04-13 01:46:18,929 - gcsfs - DEBUG - GET: b/{}/o/{}, ('gdc-tcga-phs000178-open', '0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results'), None
2022-04-13 01:46:18,959 - gcsfs - DEBUG - GET: b/{}/o/{}, ('gdc-tcga-phs000178-open', 'c8ee8367-c529-4dd6-98b4-fde57991134b/unc.edu.a64ae1f5-a189-4173-be13-903bd7637869.2476757.rsem.genes.results'), None
2022-04-13 01:46:18,961 - gcsfs - DEBUG - GET: https://storage.googleapis.com/download/storage/v1/b/gdc-tcga-phs000178-open/o/0b8b258e-1671-4f86-82e7-59b12ad40d9c%2Func.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results?alt=media, (), {'Range': 'bytes=0-1500491'}
2022-04-13 01:46:19,008 - gcsfs - DEBUG -

CPU times: user 6.86 s, sys: 1.1 s, total: 7.96 s
Wall time: 14.6 s


In [15]:
# Preview the first 10 rows as a pandas frame. `slice` is used rather than
# `take(list(range(10)))`: it needs no index list and returns fewer rows
# instead of raising when the table has fewer than 10.
table.slice(0, 10).to_pandas()

Unnamed: 0,file_gdc_id,gene_id,raw_count,scaled_estimate
0,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|100130426,0.0,0.0
1,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|100133144,11.07,2.887527e-07
2,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|100134869,15.93,3.024025e-07
3,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|10357,355.480011,2.085068e-05
4,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|10431,3032.0,8.241797e-05
5,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|136542,0.0,0.0
6,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|155060,274.0,2.776287e-06
7,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|26823,2.0,1.458671e-07
8,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|280660,0.0,0.0
9,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|317712,0.0,0.0


In [17]:
# Metadata for TCGA-SKCM "Metastatic" RNA-Seq TXT files in the hg38 metadata
# table, restricted to file names ending in "FPKM.txt.gz".
query_hg38 = """
SELECT sample_barcode, data_category, file_type, data_format, file_name_key, file_gdc_id
FROM `isb-cgc-bq.TCGA.per_sample_file_metadata_hg38_gdc_current`
where project_short_name = "TCGA-SKCM"
    and experimental_strategy = "RNA-Seq"
    and sample_type_name = "Metastatic"
    and data_format = "TXT"
    and file_name_key like "%FPKM.txt.gz"
order by sample_barcode
;
"""

# Submit the query, then pull the result into a pandas DataFrame (tqdm progress bar).
query_job = bq_client.query(query_hg38)
file_metadata_hg38 = query_job.to_dataframe(progress_bar_type="tqdm")

Query complete after 0.61s: 100%|██████████| 1/1 [00:00<00:00,  1.65query/s]                                  
Downloading:   0%|          | 0/367 [00:00<?, ?rows/s]2022-04-13 01:47:58,893 - google.cloud.bigquery._pandas_helpers - DEBUG - Started reading table 'keen-dispatch-316219._37d4cdf3349bb7a5059aac014cff35ac3e47f49a.anon44539dff6ba8401a62c755bfc2a6678dbe8e4990' with BQ Storage API session 'projects/keen-dispatch-316219/locations/us/sessions/CAISDEFZWXg2Vjc2SGs0ShoCamQaAmly'.
Downloading: 100%|██████████| 367/367 [00:01<00:00, 259.33rows/s]


In [18]:
# Show the hg38 file-metadata frame returned by the query.
file_metadata_hg38

Unnamed: 0,sample_barcode,data_category,file_type,data_format,file_name_key,file_gdc_id
0,TCGA-3N-A9WB-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/6f0f09df-cb9e-4ee...,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001
1,TCGA-3N-A9WC-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/571093c2-7bb3-4fb...,571093c2-7bb3-4fb9-8b91-84a9a438bdda
2,TCGA-3N-A9WD-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/0610b28d-8712-4ad...,0610b28d-8712-4ad0-a7f8-b9d9e4590cb8
3,TCGA-BF-AAP0-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/accf05f6-f841-43a...,accf05f6-f841-43ad-8d11-0568c0a77a2b
4,TCGA-D3-A1Q1-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/2507b44b-1dab-436...,2507b44b-1dab-436c-a7dc-be74b93ed978
...,...,...,...,...,...,...
362,TCGA-YG-AA3O-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/58fd2219-6153-48d...,58fd2219-6153-48dd-82e3-9fdfcc2f930d
363,TCGA-YG-AA3P-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/9533e842-d47f-476...,9533e842-d47f-4766-8fbc-29a1e8d3f364
364,TCGA-Z2-A8RT-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/f4824934-5951-493...,f4824934-5951-4938-851d-7223c0f54890
365,TCGA-Z2-AA3S-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/a50ae196-9baa-4eb...,a50ae196-9baa-4ebd-ab38-dc4388f7e221


In [None]:
# Inner-join the hg19 and hg38 file metadata on sample_barcode; columns that
# exist on both sides get the "_hg19" / "_hg38" suffixes.
# The DataFrame method is used instead of `pd.merge` because `pd` is only
# imported in a later cell, so the module-level call fails on a fresh kernel.
file_metadata_hg19.merge(
    right=file_metadata_hg38,
    how="inner",
    on=["sample_barcode"],
    suffixes=("_hg19", "_hg38"),
)

In [None]:
import gcsfs

In [None]:
# New GCS filesystem handle; rebinds `fs` (the same constructor call also
# appears in an earlier cell).
fs = gcsfs.GCSFileSystem()

In [None]:
# fs.ls("gs://liulab/")
# List what is stored under one prefix of the gdc-tcga-phs000178-open bucket.
fs.ls("gs://gdc-tcga-phs000178-open/c8ee8367-c529-4dd6-98b4-fde57991134b")

## pyarrow

In [None]:
# `x` was never defined in any cell of this notebook, so `x.to_table()` raises
# NameError on a fresh kernel. `dataset_from_paths` is the only dataset built
# above and carries the gene_id / scaled_estimate columns used by the cells
# below — NOTE(review): confirm this is the dataset that was intended.
y = dataset_from_paths.to_table()

In [None]:
# Show the table's repr.
y

In [None]:
# Group the table's rows by gene_id; the aggregation itself happens in the next cell.
z = y.group_by("gene_id")

In [None]:
# Mean and standard deviation of scaled_estimate per gene_id group, converted
# to a pandas frame indexed by gene_id.
z.aggregate([("scaled_estimate", "mean"), ("scaled_estimate", "stddev")]).to_pandas().set_index("gene_id")

In [None]:
import pandas as pd

In [None]:
# Two-element list holding the same sample file URI twice.
uri = [
    "gs://gdc-tcga-phs000178-open/0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results",
] * 2

# Peek at the first 10 rows of the first URI, parsed as tab-separated text.
pd.read_csv(uri[0], sep="\t", nrows=10)

In [None]:
import pyarrow as pa
import pyarrow.dataset as ds

In [None]:
gdc_tcga_bucket = ""

# dataset = ds.dataset("