In [1]:
import logging

import gcsfs
import pyarrow as pa
import pyarrow.csv
import pyarrow.dataset as ds
from google.cloud import bigquery  # , storage
from pyarrow.fs import FSSpecHandler, PyFileSystem

In [2]:
logging.getLogger("gcsfs").setLevel("DEBUG")
logging.getLogger("google.cloud.bigquery").setLevel("DEBUG")
logging.getLogger(__name__).setLevel("DEBUG")

handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)

In [3]:
# bq_client = bigquery.Client()
bq_client = bigquery.Client.from_service_account_json(
    "credentials-william-bigquery-sa.json"
)

In [4]:
query_hg19 = """
SELECT sample_barcode, data_category, file_type, data_format, file_name_key, file_gdc_id
FROM `isb-cgc-bq.TCGA.per_sample_file_metadata_hg19_gdc_current`
where project_short_name = "TCGA-SKCM"
    and experimental_strategy = "RNA-Seq"
    and sample_type_name = "Metastatic"
    and data_format = "TXT"
    and file_name_key like "%rsem.genes.results"
order by sample_barcode
;
"""

file_metadata_hg19 = bq_client.query(query_hg19).to_dataframe(progress_bar_type="tqdm")

Query complete after 0.41s: 100%|██████████████| 1/1 [00:00<00:00,  2.64query/s]
Downloading:   0%|                                    | 0/368 [00:00<?, ?rows/s]2022-04-13 22:22:28,272 - google.cloud.bigquery._pandas_helpers - DEBUG - Started reading table 'keen-dispatch-316219._3390f5fead31fe4257c155ce95c2f1f18d7e0416.anonfffb76591271d437f6c19699caa9bc79a0337745' with BQ Storage API session 'projects/keen-dispatch-316219/locations/us/sessions/CAISDG9YbmM5UzNpQ3BoVhoCamQaAmly'.
Downloading: 100%|█████████████████████████| 368/368 [00:01<00:00, 259.07rows/s]


In [5]:
file_metadata_hg19

Unnamed: 0,sample_barcode,data_category,file_type,data_format,file_name_key,file_gdc_id
0,TCGA-3N-A9WB-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/0b8b258e-1671-4f8...,0b8b258e-1671-4f86-82e7-59b12ad40d9c
1,TCGA-3N-A9WC-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/c8ee8367-c529-4dd...,c8ee8367-c529-4dd6-98b4-fde57991134b
2,TCGA-3N-A9WD-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/78354f8d-5ce8-461...,78354f8d-5ce8-4617-bba4-79614f232e97
3,TCGA-BF-AAP0-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/55b07297-ac13-428...,55b07297-ac13-428d-9aa9-5349f6d3b444
4,TCGA-D3-A1Q1-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/bc296bbf-3bd5-4cc...,bc296bbf-3bd5-4ccf-a277-aea5a6454fbd
...,...,...,...,...,...,...
363,TCGA-YG-AA3O-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/87babc42-ce66-4a3...,87babc42-ce66-4a31-9195-ca824bc6f715
364,TCGA-YG-AA3P-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/ac890ffb-a4ca-460...,ac890ffb-a4ca-4606-a1d3-eb2c2531317f
365,TCGA-Z2-A8RT-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/12c644b3-6040-490...,12c644b3-6040-490d-9129-e927fca317b5
366,TCGA-Z2-AA3S-06A,Gene expression,file,TXT,gs://gdc-tcga-phs000178-open/25469262-4e2b-439...,25469262-4e2b-4399-bde8-d9d17f3f6a02


In [28]:
# fs = gcsfs.GCSFileSystem()
fs = gcsfs.GCSFileSystem(
    #
    project="keen-dispatch-316219",
    access="read_only",
    token="credentials-william-bigquery-sa.json",
)

In [32]:
df = file_metadata_hg19.iloc[0:3]
paths = df["file_name_key"].tolist()
schema = pa.schema(
    [
        #
        ("file_gdc_id", pa.string()),
        ("gene_id", pa.string()),
        ("raw_count", pa.float32()),
        ("scaled_estimate", pa.float32()),
    ]
)
partition_expressions = (
    df["file_gdc_id"].apply(lambda value: ds.field("file_gdc_id") == value).tolist()
)

In [33]:
dataset_from_paths = ds.FileSystemDataset.from_paths(
    paths,
    schema,
    partitions=partition_expressions,
    format=ds.CsvFileFormat(parse_options=pa.csv.ParseOptions(delimiter="\t")),
    filesystem=PyFileSystem(FSSpecHandler(fs)),
)

In [34]:
dataset_from_paths.to_table().to_pandas()

2022-04-13 22:43:29,962 - gcsfs - DEBUG - GET: b/{}/o/{}, ('gdc-tcga-phs000178-open', '0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results'), None
2022-04-13 22:43:29,963 - gcsfs.credentials - DEBUG - GCS refresh
2022-04-13 22:43:30,278 - gcsfs - DEBUG - GET: b/{}/o/{}, ('gdc-tcga-phs000178-open', '0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results'), None
2022-04-13 22:43:30,335 - gcsfs - DEBUG - GET: https://storage.googleapis.com/download/storage/v1/b/gdc-tcga-phs000178-open/o/0b8b258e-1671-4f86-82e7-59b12ad40d9c%2Func.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results?alt=media, (), {'Range': 'bytes=0-1500491'}
2022-04-13 22:43:30,340 - gcsfs - DEBUG - GET: b/{}/o/{}, ('gdc-tcga-phs000178-open', 'c8ee8367-c529-4dd6-98b4-fde57991134b/unc.edu.a64ae1f5-a189-4173-be13-903bd7637869.2476757.rsem.genes.results'), None
2022-04-13 22:43:30,461 - gcsfs - DEBUG -

Unnamed: 0,file_gdc_id,gene_id,raw_count,scaled_estimate
0,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|100130426,0.000000,0.000000e+00
1,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|100133144,11.070000,2.887527e-07
2,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|100134869,15.930000,3.024025e-07
3,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|10357,355.480011,2.085068e-05
4,0b8b258e-1671-4f86-82e7-59b12ad40d9c,?|10431,3032.000000,8.241797e-05
...,...,...,...,...
61588,78354f8d-5ce8-4617-bba4-79614f232e97,ZYX|7791,4611.000000,7.764625e-05
61589,78354f8d-5ce8-4617-bba4-79614f232e97,ZZEF1|23140,1559.000000,4.961625e-06
61590,78354f8d-5ce8-4617-bba4-79614f232e97,ZZZ3|26009,1085.000000,1.091091e-05
61591,78354f8d-5ce8-4617-bba4-79614f232e97,psiTPTE22|387590,278.000000,7.126971e-06


In [None]:
# doesn't work

# df = file_metadata_hg19.iloc[0:3]
# paths = df["file_name_key"].tolist()
# part = ds.partitioning(field_names=["file_gdc_id"])

# dataset = ds.dataset(
#     # "gdc-tcga-phs000178-open/",
#     paths,
#     filesystem=fs,
#     partitioning=part,
#     format=ds.CsvFileFormat(parse_options=pa.csv.ParseOptions(delimiter="\t")),
# )

In [None]:
%%time
table = dataset_from_paths.to_table()

In [None]:
table.take(list(range(10))).to_pandas()

In [49]:
query_hg38 = """
SELECT sample_barcode, data_category, file_type, data_format, file_name_key, file_gdc_id
FROM `isb-cgc-bq.TCGA.per_sample_file_metadata_hg38_gdc_current`
where project_short_name = "TCGA-SKCM"
    and experimental_strategy = "RNA-Seq"
    and sample_type_name = "Metastatic"
    and data_format = "TXT"
    and file_name_key like "%FPKM.txt.gz"
order by sample_barcode
;
"""

query_job = bq_client.query(query_hg38)
file_metadata_hg38 = query_job.to_dataframe(progress_bar_type="tqdm")

Query complete after 0.24s: 100%|██████████████| 2/2 [00:00<00:00,  8.30query/s]
Downloading:   0%|                                    | 0/367 [00:00<?, ?rows/s]2022-04-14 08:25:00,217 - google.cloud.bigquery._pandas_helpers - DEBUG - Started reading table 'keen-dispatch-316219._3390f5fead31fe4257c155ce95c2f1f18d7e0416.anon44539dff6ba8401a62c755bfc2a6678dbe8e4990' with BQ Storage API session 'projects/keen-dispatch-316219/locations/us/sessions/CAISDHZyRXBSOXZOa0JpbxoCamQaAmly'.
Downloading: 100%|█████████████████████████| 367/367 [00:01<00:00, 299.05rows/s]


In [50]:
file_metadata_hg38

Unnamed: 0,sample_barcode,data_category,file_type,data_format,file_name_key,file_gdc_id
0,TCGA-3N-A9WB-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/6f0f09df-cb9e-4ee...,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001
1,TCGA-3N-A9WC-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/571093c2-7bb3-4fb...,571093c2-7bb3-4fb9-8b91-84a9a438bdda
2,TCGA-3N-A9WD-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/0610b28d-8712-4ad...,0610b28d-8712-4ad0-a7f8-b9d9e4590cb8
3,TCGA-BF-AAP0-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/accf05f6-f841-43a...,accf05f6-f841-43ad-8d11-0568c0a77a2b
4,TCGA-D3-A1Q1-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/2507b44b-1dab-436...,2507b44b-1dab-436c-a7dc-be74b93ed978
...,...,...,...,...,...,...
362,TCGA-YG-AA3O-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/58fd2219-6153-48d...,58fd2219-6153-48dd-82e3-9fdfcc2f930d
363,TCGA-YG-AA3P-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/9533e842-d47f-476...,9533e842-d47f-4766-8fbc-29a1e8d3f364
364,TCGA-Z2-A8RT-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/f4824934-5951-493...,f4824934-5951-4938-851d-7223c0f54890
365,TCGA-Z2-AA3S-06A,Transcriptome Profiling,gene_expression,TXT,gs://gdc-tcga-phs000178-open/a50ae196-9baa-4eb...,a50ae196-9baa-4ebd-ab38-dc4388f7e221


In [51]:
file_metadata_hg38.iloc[0]["file_name_key"]

'gs://gdc-tcga-phs000178-open/6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001/9cc93369-3cc7-4293-9f42-a997c26243c9.FPKM.txt.gz'

In [53]:
import pandas as pd

In [58]:
pd.read_csv(
    #
    # file_metadata_hg38.iloc[0]["file_name_key"],
    "9cc93369-3cc7-4293-9f42-a997c26243c9.FPKM.txt.gz",
    sep="\t",
    # header,
).sample(10)

Unnamed: 0,ENSG00000242268.2,0.0
39141,ENSG00000174498.12,0.020178
928,ENSG00000181371.3,0.0
9337,ENSG00000254937.1,0.0
48345,ENSG00000182612.9,45.58754
14023,ENSG00000254667.2,0.422559
10175,ENSG00000153044.8,10.126376
52258,ENSG00000197728.8,94.78128
37572,ENSG00000273145.1,2.506075
44160,ENSG00000265794.4,0.033283
59471,ENSG00000187583.9,1.087486


In [32]:
df = file_metadata_hg38.iloc[0:3]
paths = df["file_name_key"].tolist()
schema = pa.schema(
    [
        #
        ("file_gdc_id", pa.string()),
        ("gene_id", pa.string()),
        ("raw_count", pa.float32()),
        ("scaled_estimate", pa.float32()),
    ]
)
partition_expressions = (
    df["file_gdc_id"].apply(lambda value: ds.field("file_gdc_id") == value).tolist()
)

In [None]:
pd.merge(
    left=file_metadata_hg19,
    right=file_metadata_hg38,
    how="inner",
    on=["sample_barcode"],
    suffixes=["_hg19", "_hg38"],
)

In [None]:
import gcsfs

In [None]:
fs = gcsfs.GCSFileSystem()

In [None]:
# fs.ls("gs://liulab/")
fs.ls("gs://gdc-tcga-phs000178-open/c8ee8367-c529-4dd6-98b4-fde57991134b")

## pyarrow

In [None]:
y = x.to_table()

In [None]:
y

In [None]:
z = y.group_by("gene_id")

In [None]:
z.aggregate(
    [("scaled_estimate", "mean"), ("scaled_estimate", "stddev")]
).to_pandas().set_index("gene_id")

In [None]:
import pandas as pd

In [None]:
# uri = "gs://gdc-tcga-phs000178-open/0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results"
# uri = "gs://gdc-tcga-phs000178-open/*/*.rsem.genes.results"

uri = [
    "gs://gdc-tcga-phs000178-open/0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results",
    "gs://gdc-tcga-phs000178-open/0b8b258e-1671-4f86-82e7-59b12ad40d9c/unc.edu.4c243ea9-dfe1-42f0-a887-3c901fb38542.2477720.rsem.genes.results",
]

pd.read_csv(
    uri[0],
    nrows=10,
    sep="\t",
)

In [None]:
import pyarrow as pa
import pyarrow.dataset as ds

In [None]:
gdc_tcga_bucket = ""

# dataset = ds.dataset("