# using duckdb

In [1]:
import upath

import duckdb
import gcsfs
import pyarrow as pa
import pyarrow.dataset as ds

In [2]:
# GCS directory (timestamp-named) under which the parquet files used below
# are discovered by globbing.
path_root = upath.UPath(
    "gs://liulab/differential_composition_and_expression/20230310_21h01m15s"
)

## make pyarrow table

In [13]:
# Collect every gene_stats_*.parquet file below the root (recursively) and
# expose them as one pyarrow dataset. "hive" partitioning turns the key=value
# directory names in each path into columns.
paths = [*path_root.glob("**/gene_stats_*.parquet")]
dataset = ds.dataset(
    source=paths,
    filesystem=gcsfs.GCSFileSystem(),
    format="parquet",
    partitioning="hive",
)

In [11]:
# Read the first 100 rows straight from the dataset. Per the original note this
# works, but the resulting table has no filename column.
table_gene_stats = dataset.head(num_rows=100)

In [34]:
# Original note: doesn't work.
# Project every schema column plus the scanner's special "__filename" field
# (the original also noted "__last_in_fragment" as an alternative to try).
columns = [*dataset.schema.names, "__filename"]
scanner = dataset.scanner(columns=columns)
table_gene_stats = scanner.head(num_rows=100)

In [33]:
# Same projection written as an {output name: expression} mapping, so the
# special "__filename" field would be exposed under the plain name "filename".
columns = {name: ds.field(name) for name in dataset.schema.names}
columns["filename"] = ds.field("__filename")
scanner = dataset.scanner(columns=columns)
# table_gene_stats = scanner.head(100)

ArrowInvalid: No match for FieldRef.Name(__filename) in gene_symbol: string
pval: double
fold_change: double
sparsity_overall: double
-log10_pval: double
log2_fold_change: double
-log10_pval_signed: double
significant_bh_fdr=0.10: bool
significant_bh_fdr=0.25: bool
perturbed: bool
experiment_id: int32
malignant_means: string
log2_fc: string
run_id: int32

In [29]:
# Rename the scanner's special "__filename" column to "filename", matching it
# by name rather than by position. Overwriting the last column unconditionally
# would silently rename a real data column (e.g. "run_id") whenever the table
# was built without "__filename"; here such a table is left unchanged.
# NOTE: cells that still refer to "__filename" must run against the table as it
# was before this rename.
new_columns = [
    "filename" if name == "__filename" else name
    for name in table_gene_stats.column_names
]
table_gene_stats = table_gene_stats.rename_columns(new_columns)

In [5]:
# Keep only the gene symbol and the originating file path, then display the
# result to confirm that "__filename" was populated.
table_gene_stats2 = table_gene_stats.select(
    columns=["gene_symbol", "__filename"],
)
table_gene_stats2

pyarrow.Table
gene_symbol: string
__filename: string
----
gene_symbol: [["A1BG","A2M","A2ML1","A4GALT","A4GNT",...,"ABTB2","ACAA1","ACAA2","ACACA","ACACB"]]
__filename: [["gs://liulab/differential_composition_and_expression/20230310_21h01m15s/experiment_id=000/malignant_means=0.55,0.85/log2_fc=-1.50/run_id=00/deg_analysis/gene_stats_bulk.parquet","gs://liulab/differential_composition_and_expression/20230310_21h01m15s/experiment_id=000/malignant_means=0.55,0.85/log2_fc=-1.50/run_id=00/deg_analysis/gene_stats_bulk.parquet","gs://liulab/differential_composition_and_expression/20230310_21h01m15s/experiment_id=000/malignant_means=0.55,0.85/log2_fc=-1.50/run_id=00/deg_analysis/gene_stats_bulk.parquet","gs://liulab/differential_composition_and_expression/20230310_21h01m15s/experiment_id=000/malignant_means=0.55,0.85/log2_fc=-1.50/run_id=00/deg_analysis/gene_stats_bulk.parquet","gs://liulab/differential_composition_and_expression/20230310_21h01m15s/experiment_id=000/malignant_means=0.55,0.85/l

## use duckdb

### using the "db" API

In [5]:
# Print the OS, Python, and pyarrow versions this notebook was run with.
import platform
import sys

import pyarrow as pa

print(
    platform.platform(),
    " ".join(("Python", sys.version)),
    " ".join(("pyarrow", pa.__version__)),
    sep="\n",
)

Linux-4.19.0-23-cloud-amd64-x86_64-with-glibc2.28
Python 3.11.0 | packaged by conda-forge | (main, Jan 14 2023, 12:27:40) [GCC 11.3.0]
pyarrow 11.0.0


In [30]:
# Open a DuckDB connection backed by a transient in-memory database
# (":memory:" is also what a bare `duckdb.connect()` uses).
con = duckdb.connect(database=":memory:")

In [35]:
# Smoke test: a trivial query confirms the connection is usable.
con.execute("select 1").fetchall()

[(1,)]

In [None]:
# Preview five rows as a pandas DataFrame. No explicit register() call appears
# in this notebook, so `table_gene_stats` in the SQL is presumably resolved from
# the Python variable of the same name -- confirm for the DuckDB version in use.
con.execute(
    "SELECT * FROM table_gene_stats limit 5"
).df()

: 

: 

In [None]:
# Fetch only the source-file column for up to 1000 rows.
con.execute(
    "SELECT __filename FROM table_gene_stats limit 1000"
).df()

In [None]:
# Count rows per (malignant_means, log2_fc, run_id) group.
sql_counts_by_group = """
select
    malignant_means,
    log2_fc,
    run_id,
    --origin,
    count(*)
from table_gene_stats
group by 1, 2, 3
"""
con.execute(sql_counts_by_group).df()

In [None]:
# Pull up to 1000 full rows into a pandas DataFrame.
con.execute(
    "SELECT * FROM table_gene_stats limit 1000"
).df()

### using "relational" API

In [None]:
# Wrap the Arrow table in a DuckDB relation (lazy; uses the default connection).
# `from_arrow` is the explicit name for building a relation from an Arrow object.
rel_from_arrow = duckdb.from_arrow(table_gene_stats)

In [None]:
# Run SQL against the relation by exposing it under a temporary view name,
# then materialise the result as a pandas DataFrame.
rel_from_arrow.query(
    virtual_table_name="table_gene_stats",
    sql_query="SELECT * FROM table_gene_stats LIMIT 100",
).df()