In [None]:
import polars as pl

In [None]:
# Read the file-metadata table from the relative path `../encode_files.parquet`
# and render the raw frame so its columns can be inspected before cleaning.
encode_files = pl.read_parquet('../encode_files.parquet')
display(encode_files)

In [None]:
# Build `clean_encode_files` from `encode_files` in a single expression chain:
#   1. discard columns that are not used,
#   2. keep rows whose dataset path starts with "/experiments" and whose
#      status is "released",
#   3. decode the JSON-encoded string columns,
#   4. derive the convenience columns (label, biosample, organ_slims,
#      experiments, id, file_size_mb, download_link, software),
#   5. drop the source columns that were only needed for step 4.
# Overwritten columns keep their position; new columns are appended in the
# order they are listed.
clean_encode_files = (
    encode_files
    .drop("@type", "audit", "quality_metrics", "replicate")
    .filter(pl.col("dataset").str.starts_with("/experiments"))
    .filter(pl.col("status") == "released")
    # JSON-array strings -> typed list columns, replaced in place.
    .with_columns(
        [
            pl.col(column).str.json_decode(dtype=pl.List(element_dtype))
            for column, element_dtype in (
                ("biological_replicates", pl.Int64),
                ("technical_replicates", pl.Utf8),
                ("origin_batches", pl.Utf8),
                ("derived_from", pl.Utf8),
                ("index_of", pl.Utf8),
            )
        ]
    )
    # Single fields pulled out of JSON-object strings.
    .with_columns(
        pl.col("award").str.json_path_match("$.project"),
        pl.col("lab").str.json_path_match("$.title"),
        label=pl.col("target").str.json_path_match("$.label"),
        biosample=pl.col("biosample_ontology").str.json_path_match("$.term_name"),
        organ_slims=(
            pl.col("biosample_ontology")
            .str.json_path_match("$.organ_slims")
            .str.json_decode(dtype=pl.List(pl.Utf8))
        ),
    )
    # Path-like strings: keep the element at index 2 of the "/"-split value.
    .with_columns(
        (
            pl.col("step_run")
            .str.json_path_match("$.analysis_step_version")
            .str.split("/")
            .list[2]
        ),
        experiments=pl.col("dataset").str.split("/").list[2],
        id=pl.col("@id").str.split("/").list[2],
    )
    # Type conversion, size divided by 1024**2 and rounded to a whole number,
    # and the download URL built from the site prefix plus `href`.
    .with_columns(
        pl.col("date_created").cast(pl.Datetime),
        file_size_mb=(pl.col("file_size") / (1024**2)).round().cast(pl.Int64),
        download_link="https://www.encodeproject.org" + pl.col("href"),
    )
    # The JSON must be materialised as a struct column before its nested
    # `software_versions` list can be read in the next step.
    .with_columns(
        pl.col("analysis_step_version").str.json_decode(infer_schema_length=None)
    )
    # One "<software name>:<version>" string per entry of `software_versions`.
    .with_columns(
        software=pl.col("analysis_step_version")
        .struct.field("software_versions")
        .list.eval(
            pl.element().struct.field("software").struct.field("name")
            + pl.lit(":")
            + pl.element().struct.field("version")
        )
    )
    # Source columns that have been replaced by the derived ones above.
    .drop("biosample_ontology", "dataset", "@id", "href", "analysis_step_version")
)

In [None]:
# Render `clean_encode_files`, the frame that results from the cleaning steps.
display(clean_encode_files)