In [1]:
import polars as pl
from IPython.display import display
import json
import ast

In [None]:
# Read the raw parquet file (one directory above the notebook) into a Polars DataFrame.
encode_df = pl.read_parquet(source='../encode_files.parquet')

In [None]:
# Show the distinct values of every column, one column per output.
for column_name in encode_df.columns:
    display(encode_df[column_name].unique())

In [None]:
# Render the raw frame as loaded, before any cleaning step has been applied.
display(encode_df)

In [None]:
# Turn the placeholder string "NA" into a real null in every column.
# The same replacement mapping is applied to all columns in a single pass.

encode_df = encode_df.with_columns(pl.all().replace({"NA": None}))
display(encode_df.head(5))

In [None]:
# Remove the four trailing columns, which contain no information.

encode_df = encode_df.drop(
    [
        "annotation_type",
        "annotation_subtype",
        "biochemical_inputs",
        "encyclopedia_version",
    ]
)
display(encode_df.head(5))

In [None]:
# Cast the file-size and read-length columns to 64-bit floats.

encode_df = encode_df.with_columns(
    pl.col(
        [
            "file_size",
            "read_length",
            "mapped_read_length",
            "cropped_read_length",
            "cropped_read_length_tolerance",
        ]
    ).cast(pl.Float64)
)
display(encode_df)

In [None]:
# Convert the textual flag columns to real booleans.
#
# Mapping (case-insensitive): "true" -> True, "false" -> False, anything else
# (including null) -> null. This is expressed with native Polars expressions
# instead of a per-row Python lambda, so the work stays inside the query engine.
# One expression is built per column and aliased back to the column's own name,
# so each column is replaced in place.

encode_df = encode_df.with_columns(
    [
        pl.when(pl.col(flag_column).str.to_lowercase() == "true")
        .then(pl.lit(True))
        .when(pl.col(flag_column).str.to_lowercase() == "false")
        .then(pl.lit(False))
        .otherwise(pl.lit(None, dtype=pl.Boolean))
        .alias(flag_column)
        for flag_column in ("preferred_default", "restricted")
    ]
)
display(encode_df)

In [None]:
# Cast the date_created column to a Polars Datetime.

encode_df = encode_df.with_columns(
    date_created=pl.col("date_created").cast(pl.Datetime)
)
display(encode_df)


In [None]:
# Quick check: ast.literal_eval turns the text "[]" into an actual empty list.

example = "[]"
result = ast.literal_eval(example)
# Print the parsed value, then its type, each on its own line.
print(result, type(result), sep="\n")

In [None]:
# Parse the stringified list columns into real list values.
#
# Blindly swapping every single quote for a double quote before json.loads
# corrupts any element that itself contains an apostrophe (for example
# ["Peyer's patch"] becomes invalid JSON). ast.literal_eval parses the list
# text directly without touching the quotes inside elements. The original
# quote-swapping JSON parse is kept as a fallback for text that is not a
# Python literal (e.g. JSON-only tokens such as null/true/false).


def _parse_list_literal(text):
    """Return the Python value encoded by ``text``.

    Tries ``ast.literal_eval`` first; if the text is not a valid Python
    literal, falls back to ``json.loads`` after replacing single quotes with
    double quotes (the previous behaviour). Raises ``json.JSONDecodeError``
    if neither parser accepts the text.
    """
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        return json.loads(text.replace("'", '"'))


encode_df = encode_df.with_columns(
    pl.col(
        "technical_replicates",
        "biological_replicates",
        "organ_slims",
        "origin_batches",
        "derived_from",
    ).map_elements(_parse_list_literal)
)
display(encode_df)

In [None]:
# Inspect the raw values of the three nested-JSON columns, one list per output.

for column_name in ("quality_metrics", "step_run", "analysis_step_version"):
    display(encode_df[column_name].to_list())

In [None]:
# Drop the nested-JSON columns instead of converting them:
# their JSON did not play nicely with the parser, so they are removed.

encode_df = encode_df.drop(
    ["quality_metrics", "step_run", "analysis_step_version"]
)
display(encode_df)

In [None]:
# Done: keep the cleaned frame under its final name and persist it.
#
# The inspection cell below reads '../clean_encode_files.parquet'. Writing the
# file here means a fresh "Restart & Run All" inspects the output of this run,
# not a missing or stale file left over from an earlier session.

clean_encode = encode_df
clean_encode.write_parquet('../clean_encode_files.parquet')

In [3]:
# Reload the cleaned parquet file from disk and preview its first five rows.
inspect = pl.read_parquet(source='../clean_encode_files.parquet')
display(inspect.head(5))

title,accession,dataset,assembly,technical_replicates,biological_replicates,file_format,file_type,file_format_type,file_size,assay_term_name,term_name,organ_slims,simple_biosample_summary,origin_batches,label,download_url,derived_from,genome_annotation,paired_end,paired_with,preferred_default,run_type,read_length,mapped_read_length,cropped_read_length,cropped_read_length_tolerance,mapped_run_type,read_length_units,output_category,output_type,index_of,lab_title,project,date_created,restricted,submitter_comment,status
str,str,str,str,list[str],list[i64],str,str,str,f64,str,str,list[str],str,list[str],str,str,list[str],str,str,str,bool,str,f64,f64,f64,f64,str,str,str,str,str,str,str,datetime[μs],bool,str,str
"""ENCFF566NGG""","""ENCFF566NGG""","""/experiments/ENCSR369MDF/""","""GRCh38""","[""1_1""]",[1],"""bam""","""bam""",,3936100000.0,"""RNA-seq""","""K562""","[""blood"", ""bodily fluid""]","""treated with 5 μM JQ1 for 4 ho…","[""/biosamples/ENCBS291NHT/""]",,"""https://encodeproject.org/file…","[""/files/ENCFF598IDH/"", ""/files/ENCFF146IQN/"", ""/files/ENCFF967MFO/""]","""V29""",,,,,,150.0,,,"""paired-ended""","""nt""","""alignment""","""transcriptome alignments""",,"""ENCODE Processing Pipeline""","""ENCODE""",2021-12-31 08:51:07.433754,,,"""released"""
"""ENCFF642UFR""","""ENCFF642UFR""","""/experiments/ENCSR250SCW/""","""mm10""","[""2_1""]",[2],"""tar""","""tar""",,4117922.0,"""single-cell RNA sequencing ass…","""left cerebral cortex""","[""brain""]","""male adult (2 months) strain B…",,,"""https://encodeproject.org/file…","[""/files/ENCFF723SJO/"", ""/files/ENCFF192SYW/""]","""M21""",,,True,,,,,,,,"""quantification""","""unfiltered sparse gene count m…",,"""Barbara Wold, Caltech""","""ENCODE""",2022-01-30 04:31:33.614985,,,"""released"""
"""ENCFF094MID""","""ENCFF094MID""","""/experiments/ENCSR414JJE/""",,"[""1_2""]",[1],"""fastq""","""fastq""",,263567745.0,"""Mint-ChIP-seq""","""naive B cell""","[""blood"", ""bodily fluid""]","""female adult (39 years)""","[""/biosamples/ENCBS517ZRS/""]","""H3K27me3""","""https://encodeproject.org/file…",,,"""2""","""/files/ENCFF481AOK/""",,"""paired-ended""",58.0,,,,,"""nt""","""raw data""","""reads""",,"""Bradley Bernstein, Broad""","""ENCODE""",2022-02-11 14:58:17.561304,,,"""released"""
"""ENCFF506TJN""","""ENCFF506TJN""","""/experiments/ENCSR369MDF/""","""GRCh38""","[""2_1""]",[2],"""bigWig""","""bigWig""",,120998658.0,"""RNA-seq""","""K562""","[""blood"", ""bodily fluid""]","""treated with 5 μM JQ1 for 4 ho…","[""/biosamples/ENCBS798JQV/""]",,"""https://encodeproject.org/file…","[""/files/ENCFF124OPS/"", ""/files/GRCh38_EBV.chrom.sizes/""]","""V29""",,,,,,,,,,,"""signal""","""signal of all reads""",,"""ENCODE Processing Pipeline""","""ENCODE""",2021-12-31 08:51:14.572055,,,"""released"""
"""ENCFF174IGS""","""ENCFF174IGS""","""/experiments/ENCSR040TXN/""",,"[""1_2""]",[1],"""fastq""","""fastq""",,45801960.0,"""Mint-ChIP-seq""","""central memory CD8-positive, a…",[],"""male adult (36 years)""","[""/biosamples/ENCBS181QGR/""]","""H3K4me3""","""https://encodeproject.org/file…",,,"""2""","""/files/ENCFF066EBL/""",,"""paired-ended""",58.0,,,,,"""nt""","""raw data""","""reads""",,"""Bradley Bernstein, Broad""","""ENCODE""",2022-02-11 15:54:31.696914,,,"""released"""
