In [1]:
import polars as pl
from IPython.display import display

In [None]:
# Read the experiments parquet file that sits one directory above the
# notebook and render the resulting frame.
experiments_parquet = "../encode_experiments.parquet"
encode_experiments = pl.read_parquet(experiments_parquet)
display(encode_experiments)

In [None]:
# `files` is stored as a JSON string: decode it, then explode so that each
# element of the decoded value gets its own row.
decoded_files = pl.col("files").str.json_decode()
files_col_to_list = encode_experiments.with_columns(decoded_files)
files_col_exploded = files_col_to_list.explode("files")

# Pull the struct's field(s) out of `files` into a column named "@id".
# NOTE(review): the "*" wildcard combined with a single alias presumably
# relies on each struct having exactly one field — confirm against the data.
file_id = pl.col("files").struct.field("*").alias("@id")
unpacked_files = files_col_exploded.with_columns(file_id)

# Keep only the file id plus the experiment-level columns of interest.
kept_columns = ["@id", "assay_title", "biosample_summary", "dbxrefs", "description"]
unpacked_files_shortened = unpacked_files.select(kept_columns)
display(unpacked_files_shortened)

In [None]:
# Read the files parquet file that sits one directory above the notebook
# and render the resulting frame.
files_parquet = "../encode_files.parquet"
encode = pl.read_parquet(files_parquet)
display(encode)

In [None]:
# Keep only the rows whose `dataset` value begins with "/experiments/".
# The predicate is passed to `filter` un-aliased: `filter` only consumes the
# boolean mask, so the former `.alias("files")` had no effect and wrongly
# suggested that a "files" column was being produced.
is_experiment_dataset = pl.col("dataset").str.starts_with("/experiments/")
only_experiments = encode.filter(is_experiment_dataset)
display(only_experiments)

In [None]:
# Inner join on "@id": only rows whose "@id" occurs in both frames survive,
# and each surviving row gains the columns selected from the experiments table.
joined = only_experiments.join(
    unpacked_files_shortened,
    on="@id",
    how="inner",
)
display(joined)

In [None]:
# `output_type` values to keep; matched exactly via `is_in` in the last filter.
output_type_list = ['plus strand signal of unique reads', 'minus strand signal of unique reads', 'signal of unique reads']

# Narrow the joined table one criterion at a time so that every intermediate
# frame can be inspected on its own.
# NOTE(review): the reference counts pasted at the end of this notebook also
# list "long read RNA-seq", which is not in the assay list below — confirm
# whether that omission is intentional.
filter_by_assay = joined.filter(pl.col("assay_term_name").is_in(["RNA-seq", "RAMPAGE", "CAGE", "BruChase-seq", "PRO-cap", "BruUV-seq", "Bru-seq"]))
# `biosample_ontology` is a JSON string; match on its top-level `term_name`.
filter_by_biosample = filter_by_assay.filter(pl.col("biosample_ontology").str.json_path_match("$.term_name").str.contains("K562"))
filter_by_assembly = filter_by_biosample.filter(pl.col("assembly").str.contains("GRCh38"))
filter_by_status = filter_by_assembly.filter(pl.col("status").str.contains("released"))
filter_by_file_type = filter_by_status.filter(pl.col("file_type").str.contains("bigWig"))
# Fixed: this line previously read `.fitler(pl.col("output_type").)`, a typo
# plus an unfinished expression that made the whole cell a syntax error.
filter_by_output_type = filter_by_file_type.filter(pl.col("output_type").is_in(output_type_list))

display(filter_by_output_type.sort("dataset"))

In [None]:
# Tally how many of the remaining rows fall under each `assay_title` value.
assay_titles_sorted = filter_by_output_type["assay_title"].sort()
print(assay_titles_sorted.value_counts())

In [None]:
# Reference: value counts of the `assay` column in /zata/zippy/andrewsg/transcription-vae/data/tabular_data/encode_metadata.tsv

"""
shape: (8, 2)
┌───────────────────┬───────┐
│ assay             ┆ count │
│ ---               ┆ ---   │
│ str               ┆ u32   │
╞═══════════════════╪═══════╡
│ BruUV-seq         ┆ 4     │
│ RAMPAGE           ┆ 4     │
│ Bru-seq           ┆ 8     │
│ CAGE              ┆ 12    │
│ PRO-cap           ┆ 40    │
│ RNA-seq           ┆ 190   │
│ BruChase-seq      ┆ 8     │
│ long read RNA-seq ┆ 7     │
└───────────────────┴───────┘
"""