In [None]:
from pathlib import Path

import polars as pl
from IPython.display import display

In [None]:
# Read the ENCODE file-metadata parquet export and preview it.
encode_files_path = '../output_06042025_235959/encode_files.parquet'
encode_files = pl.read_parquet(encode_files_path)
display(encode_files)

In [None]:
# Cleaning pipeline for the raw file table. Each statement derives a new,
# named frame from the previous one; the last frame, `clean_step_run`, is the
# one the following cell selects its columns from.


def _json_list(column, inner=pl.Utf8):
    """Expression that JSON-decodes the strings in `column` into a list of `inner`."""
    return pl.col(column).str.json_decode(dtype=pl.List(inner))


def _json_field(column, path):
    """Expression that applies the JSONPath `path` to the JSON strings in `column`."""
    return pl.col(column).str.json_path_match(path)


def _json_field_list(column, path):
    """Like `_json_field`, then JSON-decodes the matched text into a list of strings."""
    return _json_field(column, path).str.json_decode(dtype=pl.List(pl.Utf8))


def _third_path_segment(expr):
    """Expression that splits on "/" and keeps the element at index 2."""
    return expr.str.split("/").list[2]


# --- Drop unused columns; keep only released files attached to experiments ---
drop_cols = encode_files.drop("@type", "audit", "quality_metrics", "title")

_is_experiment_file = pl.col("dataset").str.starts_with("/experiments")
_is_released = pl.col("status") == "released"
only_experiments = drop_cols.filter(_is_experiment_file)
filter_status_released = only_experiments.filter(_is_released)
drop_status = filter_status_released.drop("status")

# --- JSON-array string columns -> list columns (same column names) ---
bio_reps_to_list = drop_status.with_columns(
    _json_list("biological_replicates", pl.Int64)
)
technical_reps_to_list = bio_reps_to_list.with_columns(
    _json_list("technical_replicates")
)
origin_batches_to_list = technical_reps_to_list.with_columns(
    _json_list("origin_batches")
)
derived_from_to_list = origin_batches_to_list.with_columns(
    _json_list("derived_from")
)

# --- Flatten fields out of the `target` and `biosample_ontology` JSON ---
clean_label = derived_from_to_list.with_columns(
    _json_field("target", "$.label").alias("target")
)
clean_biosample = clean_label.with_columns(
    _json_field("biosample_ontology", "$.term_name").alias("biosample")
)
clean_organ_slims = clean_biosample.with_columns(
    _json_field_list("biosample_ontology", "$.organ_slims").alias("organ_slims")
)
clean_cell_slims = clean_organ_slims.with_columns(
    _json_field_list("biosample_ontology", "$.cell_slims").alias("cell_slims")
)
clean_developmental_slims = clean_cell_slims.with_columns(
    _json_field_list("biosample_ontology", "$.developmental_slims").alias("developmental_slims")
)
clean_system_slims = clean_developmental_slims.with_columns(
    _json_field_list("biosample_ontology", "$.system_slims").alias("system_slims")
)
clean_classification = clean_system_slims.with_columns(
    _json_field("biosample_ontology", "$.classification").alias("classification")
)

# --- Remaining JSON columns: index_of, award, platform, replicate, donors ---
drop_biosample_ontology = clean_classification.drop("biosample_ontology")
index_of_to_list = drop_biosample_ontology.with_columns(
    _json_list("index_of").alias("index_of")
)
clean_award = index_of_to_list.with_columns(
    _json_field("award", "$.project").alias("project")
)
clean_rfa = clean_award.with_columns(
    _json_field("award", "$.rfa").alias("rfa")
)
clean_platform = clean_rfa.with_columns(
    _json_field("platform", "$.term_name").alias("platform")
)
assay_slims = clean_platform.with_columns(
    _json_field_list("replicate", "$.experiment.assay_slims").alias("assay_slims")
)
life_stage_age = assay_slims.with_columns(
    _json_field("replicate", "$.experiment.life_stage_age").alias("life_stage_age")
)
donors = life_stage_age.with_columns(
    _json_list("donors").alias("donors")
)

# --- Reduce "/"-separated paths to their third segment, then drop the originals ---
clean_dataset = donors.with_columns(
    _third_path_segment(pl.col("dataset")).alias("experiments")
)
drop_dataset = clean_dataset.drop("dataset")

clean_id = drop_dataset.with_columns(
    _third_path_segment(pl.col("@id")).alias("id")
)
drop_old_id = clean_id.drop("@id")

# --- Type and unit conversions ---
formatted_date = drop_old_id.with_columns(
    pl.col("date_created").cast(pl.Datetime)
)

# file_size divided by 1024**2, rounded and stored as an integer.
_size_divisor = 1024**2
file_size_mb = formatted_date.with_columns(
    (pl.col("file_size") / _size_divisor).round().cast(pl.Int64).alias("file_size_mb")
)

# --- Absolute download URL built from the site prefix and `href` ---
_encode_host = "https://www.encodeproject.org"
href_to_download_link = file_size_mb.with_columns(
    (_encode_host + pl.col("href")).alias("download_link")
)
drop_href = href_to_download_link.drop("href")

# --- Software used: decode the struct, then build "name:version" strings ---
analysis_step_version_extracted = drop_href.with_columns(
    pl.col("analysis_step_version")
    .str.json_decode(infer_schema_length=None)
    .alias("analysis_step_version")
)
_software_entry = pl.element()
_name_and_version = (
    _software_entry.struct.field("software").struct.field("name")
    + pl.lit(":")
    + _software_entry.struct.field("version")
)
software = analysis_step_version_extracted.with_columns(
    pl.col("analysis_step_version")
    .struct.field("software_versions")
    .list.eval(_name_and_version)
    .alias("software")
)
drop_analysis_step_version = software.drop("analysis_step_version")

# --- Lab title and the analysis-step-version id referenced by step_run ---
clean_lab = drop_analysis_step_version.with_columns(
    _json_field("lab", "$.title").alias("lab")
)
clean_step_run = clean_lab.with_columns(
    _third_path_segment(_json_field("step_run", "$.analysis_step_version")).alias("step_run")
)

In [14]:
# Columns kept in the final table, in output order.
output_columns = [
    'id',
    'accession',
    'experiments',
    'assay_term_name',
    'assay_title',
    'assay_slims',
    'cell_slims',
    'developmental_slims',
    'system_slims',
    'classification',
    'biosample',
    'organ_slims',
    'simple_biosample_summary',
    'life_stage_age',
    'donors',
    'output_category',
    'output_type',
    'target',
    'file_format',
    'file_type',
    'file_format_type',
    'download_link',
    'assembly',
    'genome_annotation',
    'biological_replicates',
    'technical_replicates',
    'index_of',
    'derived_from',
    'origin_batches',
    'paired_with',
    'paired_end',
    'platform',
    'read_count',
    'read_length',
    'run_type',
    'read_length_units',
    'mapped_read_length',
    'mapped_run_type',
    'step_run',
    'preferred_default',
    'file_size',
    'file_size_mb',
    'md5sum',
    'date_created',
    'rfa',
    'lab',
    'software',
]

# Project onto those columns and order the rows by file id.
selected_columns = clean_step_run.select(output_columns)
clean_encode_files = selected_columns.sort("id")

display(clean_encode_files)

id,accession,experiments,assay_term_name,assay_title,assay_slims,cell_slims,developmental_slims,system_slims,classification,biosample,organ_slims,simple_biosample_summary,life_stage_age,donors,output_category,output_type,target,file_format,file_type,file_format_type,download_link,assembly,genome_annotation,biological_replicates,technical_replicates,index_of,derived_from,origin_batches,paired_with,paired_end,platform,read_count,read_length,run_type,read_length_units,mapped_read_length,mapped_run_type,step_run,preferred_default,file_size,file_size_mb,md5sum,date_created,rfa,lab,software
str,str,str,str,str,list[str],list[str],list[str],list[str],str,str,list[str],str,str,list[str],str,str,str,str,str,str,str,str,str,list[i64],list[str],list[str],list[str],list[str],str,str,str,i64,i64,str,str,i64,str,str,bool,i64,i64,str,datetime[μs],str,str,list[str]
"""ENCFF000AAS""","""ENCFF000AAS""","""ENCSR000AWO""","""transcription profiling by arr…","""RNA microarray""","[""Transcription""]","[""hematopoietic cell"", ""leukocyte"", ""B cell""]","[""mesoderm""]","[""immune system""]","""cell line""","""GM12878""","[""blood"", ""bodily fluid""]","""""",,"[""/human-donors/ENCDO000AAK/""]","""annotation""","""filtered transcribed fragments""",,"""bigBed""","""bigBed broadPeak""","""broadPeak""","""https://www.encodeproject.org/…","""hg19""",,[1],"[""1_1""]",,"[""/files/ENCFF001SMG/""]","[""/biosamples/ENCBS630AAA/""]",,,,,,,,,,,,18899809,18,"""05dfc69e84c8681206079ed78a65b0…",2008-11-24 00:00:00,"""ENCODE2""","""Thomas Gingeras, CSHL""",
"""ENCFF000AAT""","""ENCFF000AAT""","""ENCSR000AWK""","""transcription profiling by arr…","""RNA microarray""","[""Transcription""]","[""hematopoietic cell"", ""leukocyte"", ""B cell""]","[""mesoderm""]","[""immune system""]","""cell line""","""GM12878""","[""blood"", ""bodily fluid""]","""cytosolic fraction""",,"[""/human-donors/ENCDO000AAK/""]","""annotation""","""filtered transcribed fragments""",,"""bigBed""","""bigBed broadPeak""","""broadPeak""","""https://www.encodeproject.org/…","""hg19""",,[1],"[""1_1""]",,"[""/files/ENCFF001SMH/""]","[""/biosamples/ENCBS217CXJ/""]",,,,,,,,,,,,17267422,16,"""eda0ff0d84c849586fe5a7e387eb79…",2008-11-24 00:00:00,"""ENCODE2""","""Thomas Gingeras, CSHL""",
"""ENCFF000AAU""","""ENCFF000AAU""","""ENCSR000AWY""","""transcription profiling by arr…","""RNA microarray""","[""Transcription""]","[""hematopoietic cell"", ""leukocyte"", ""B cell""]","[""mesoderm""]","[""immune system""]","""cell line""","""GM12878""","[""blood"", ""bodily fluid""]","""nucleolus fraction""",,"[""/human-donors/ENCDO000AAK/""]","""annotation""","""filtered transcribed fragments""",,"""bigBed""","""bigBed broadPeak""","""broadPeak""","""https://www.encodeproject.org/…","""hg19""",,[1],"[""1_1""]",,"[""/files/ENCFF001SMJ/""]","[""/biosamples/ENCBS495SKR/""]",,,,,,,,,,,,19110317,18,"""9c25373f5713819b74f58e0c8e47df…",2008-11-24 00:00:00,"""ENCODE2""","""Thomas Gingeras, CSHL""",
"""ENCFF000AAV""","""ENCFF000AAV""","""ENCSR000AWA""","""transcription profiling by arr…","""RNA microarray""","[""Transcription""]","[""hematopoietic cell"", ""leukocyte"", ""B cell""]","[""mesoderm""]","[""immune system""]","""cell line""","""GM12878""","[""blood"", ""bodily fluid""]","""cytosolic fraction""",,"[""/human-donors/ENCDO000AAK/""]","""annotation""","""filtered transcribed fragments""",,"""bigBed""","""bigBed broadPeak""","""broadPeak""","""https://www.encodeproject.org/…","""hg19""",,[1],"[""1_1""]",,"[""/files/ENCFF001SMI/""]","[""/biosamples/ENCBS217CXJ/""]",,,,,,,,,,,,17691708,17,"""8e3baafd1b91c31df6ac07c0f35b0d…",2008-11-24 00:00:00,"""ENCODE2""","""Thomas Gingeras, CSHL""",
"""ENCFF000AAW""","""ENCFF000AAW""","""ENCSR000AWB""","""transcription profiling by arr…","""RNA microarray""","[""Transcription""]","[""hematopoietic cell"", ""leukocyte"", ""B cell""]","[""mesoderm""]","[""immune system""]","""cell line""","""GM12878""","[""blood"", ""bodily fluid""]","""nuclear fraction""",,"[""/human-donors/ENCDO000AAK/""]","""annotation""","""filtered transcribed fragments""",,"""bigBed""","""bigBed broadPeak""","""broadPeak""","""https://www.encodeproject.org/…","""hg19""",,[1],"[""1_1""]",,"[""/files/ENCFF001SMK/""]","[""/biosamples/ENCBS778MKB/""]",,,,,,,,,,,,22342554,21,"""196aec0307509697edb2f7b98b0e6d…",2008-11-24 00:00:00,"""ENCODE2""","""Thomas Gingeras, CSHL""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""SRR2173350""",,"""ENCSR529CLR""","""microRNA-seq""","""microRNA-seq""","[""Transcription""]","[""embryonic cell"", ""stem cell""]",[],[],"""cell line""","""UCSF-4""","[""embryo""]","""""",,"[""/human-donors/ENCDO499GAU/""]","""raw data""","""reads""",,"""sra""","""sra""",,"""https://www.encodeproject.org/…",,,[1],"[""1_1""]",,,"[""/biosamples/ENCBS311CPE/""]",,,"""Illumina HiSeq 2000""",14799630,31,"""single-ended""","""nt""",,,,,543796448,519,"""5567eacac80b3858ac3ba6cfd733ec…",2017-03-22 22:37:17.176468,"""Roadmap""","""Joseph Costello, UCSF""",
"""SRR2173351""",,"""ENCSR529CLR""","""microRNA-seq""","""microRNA-seq""","[""Transcription""]","[""embryonic cell"", ""stem cell""]",[],[],"""cell line""","""UCSF-4""","[""embryo""]","""""",,"[""/human-donors/ENCDO499GAU/""]","""raw data""","""reads""",,"""sra""","""sra""",,"""https://www.encodeproject.org/…",,,[2],"[""2_1""]",,,"[""/biosamples/ENCBS838AWQ/""]",,,"""Illumina HiSeq 2000""",18449835,31,"""single-ended""","""nt""",,,,,673121696,642,"""d84bbe323229866ac74ac9ca1ba649…",2017-03-22 22:37:29.831138,"""Roadmap""","""Joseph Costello, UCSF""",
"""SRR2173356""",,"""ENCSR282KJZ""","""polyA plus RNA-seq""","""polyA plus RNA-seq""","[""Transcription""]","[""embryonic cell"", ""stem cell""]",[],[],"""cell line""","""UCSF-4""","[""embryo""]","""""",,"[""/human-donors/ENCDO499GAU/""]","""raw data""","""reads""",,"""sra""","""sra""",,"""https://www.encodeproject.org/…",,,[1],"[""1_1""]",,,"[""/biosamples/ENCBS838AWQ/""]",,"""1,2""","""Illumina HiSeq 2000""",251556314,76,"""paired-ended""","""nt""",,,,,27984092576,26688,"""17561ba432a9d3259d3bb9c6d4d430…",2017-03-22 22:37:12.101115,"""Roadmap""","""Joseph Costello, UCSF""",
"""SRR2173357""",,"""ENCSR282KJZ""","""polyA plus RNA-seq""","""polyA plus RNA-seq""","[""Transcription""]","[""embryonic cell"", ""stem cell""]",[],[],"""cell line""","""UCSF-4""","[""embryo""]","""""",,"[""/human-donors/ENCDO499GAU/""]","""raw data""","""reads""",,"""sra""","""sra""",,"""https://www.encodeproject.org/…",,,[1],"[""1_2""]",,,"[""/biosamples/ENCBS838AWQ/""]",,"""1,2""","""Illumina HiSeq 2000""",257363948,76,"""paired-ended""","""nt""",,,,,26218223456,25004,"""c0e32e339d8b3abaf2091351c8d6ef…",2017-03-22 22:37:05.542217,"""Roadmap""","""Joseph Costello, UCSF""",


In [15]:
# Persist the cleaned table. The destination folder is created first (no-op
# when it is already there) so the write does not fail if it is missing.
clean_output_path = Path('../output_encode_matrix_by_accession/clean_encode_files.parquet')
clean_output_path.parent.mkdir(parents=True, exist_ok=True)
clean_encode_files.write_parquet(clean_output_path)