In [1]:
from pysam import FastaFile
# Map the plain chromosome numbers used in this notebook to the sequence
# names stored in the FASTA file.
chr_to_references = {
    "1": "CHR1",
    "2": "CHR2",
    "3": "CHR3",
    "4": "CHR4",
    "5": "CHR5",
}

# Open the reference genome and list every sequence name it contains.
dna = FastaFile("../genome/AT_reference/GCF_000001735.4_TAIR10.1_genomic.fna")
print(dna.references)

# Fetch the first 1000 bases of chromosome 1.
a = dna.fetch(chr_to_references["1"], 0, 1000)

# Print "<chromosome number> <sequence length>" for every mapped chromosome.
for chrsm_number, reference_name in chr_to_references.items():
    print(int(chrsm_number), dna.get_reference_length(reference_name))


# Integer code assigned to each sequence symbol. Upper- and lower-case
# A/C/G/T share a code; the remaining letters are IUPAC ambiguity codes.
encode_dict = {
    "[mask]": 0,
    "A": 1,
    "C": 2,
    "G": 3,
    "T": 4,
    "Y": 5, # C or T
    "R": 6, # A or G
    "W": 7, # A or T
    "S": 8, # C or G
    "M": 9, # A or C
    "K": 10, # G or T
    "B": 11, # C or G or T
    "D": 12, # A or G or T
    "H": 13, # A or C or T
    "V": 14, # A or C or G
    "a": 1,
    "c": 2,
    "g": 3,
    "t": 4,
    "N": -1,
}

# Case-insensitive view of encode_dict used by encode_bases: every symbol is
# also reachable through its lower-case spelling, so lower-case "n" or
# ambiguity codes encode like their upper-case form instead of raising.
# encode_dict itself is left untouched; its entries win on any key clash.
_ENCODE_LOOKUP = {
    **{symbol.lower(): code for symbol, code in encode_dict.items()},
    **encode_dict,
}


def encode_bases(bases):
    """Translate a sequence of base symbols into their integer codes.

    Args:
        bases: Iterable of single-character base symbols, e.g. a string.

    Returns:
        A list holding one integer code per symbol, in input order. The
        codes are the values of ``encode_dict``; lower-case symbols map to
        the same code as their upper-case form.

    Raises:
        KeyError: If a symbol has no entry in the encoding table.
    """
    return [_ENCODE_LOOKUP[base] for base in bases]



['CHR1', 'CHR2', 'CHR3', 'CHR4', 'CHR5', 'NC_037304.1', 'NC_000932.1']
1 30427671
2 19698289
3 23459830
4 18585056
5 26975502


In [2]:
import polars as pl
import numpy as np

def meth_rates_to_labels(dna, meth_rates: pl.DataFrame):
    """Build a per-base label table for every chromosome in ``meth_rates``.

    For each distinct ``chrsm`` value the full chromosome sequence is fetched
    from ``dna``, encoded with ``encode_bases``, and paired with three
    float32 tracks (``alpha``, ``beta``, ``std_st``). Each track starts at
    zero, is filled from the intervals in ``meth_rates``, and is then reset
    to zero wherever the encoded base is not 2.

    Args:
        dna: Object exposing ``fetch(reference_name)`` that returns the
            sequence of a chromosome (a ``pysam.FastaFile`` in this notebook).
        meth_rates: Frame with the columns ``chrsm``, ``start``, ``end``,
            ``alpha``, ``beta`` and ``std_st``.

    Returns:
        A dict mapping each ``chrsm`` value to a ``pl.DataFrame`` with the
        columns ``sequence``, ``alpha``, ``beta`` and ``std_st``, one row per
        base of that chromosome.

    Raises:
        KeyError: If a ``chrsm`` value has no entry in ``chr_to_references``
            or the sequence contains a symbol ``encode_bases`` cannot encode.
    """
    label_columns = ("alpha", "beta", "std_st")
    # 2 is the code encode_dict assigns to both "C" and "c".
    is_cytosine = pl.col("sequence") == 2

    labels_by_chrsm = {}
    for (chrsm, rates) in meth_rates.partition_by("chrsm", as_dict=True).items():
        # Newer polars releases key this dict by tuples even for a single
        # partition column; unwrap so lookups and result keys stay scalar.
        if isinstance(chrsm, tuple):
            chrsm = chrsm[0]
        print(chrsm, rates.height)

        sequence = pl.DataFrame({"sequence": encode_bases(dna.fetch(chr_to_references[f"{chrsm}"]))})

        # One zero-initialised float32 track per label column.
        tracks = {name: np.zeros(sequence.height, dtype=np.float32) for name in label_columns}
        for row in rates.iter_rows(named=True):
            # `start` is treated as 1-based, hence the shift to a 0-based index.
            # NOTE(review): the slice stops before index end - 1, so the base
            # at 1-based position `end` is never labelled. If `end` is meant
            # to be inclusive this is an off-by-one; confirm against the
            # producer of the table before changing it.
            region = slice(row["start"] - 1, row["end"] - 1)
            for name in label_columns:
                tracks[name][region] = row[name]

        labels = sequence.with_columns(
            *[pl.Series(name, tracks[name]) for name in label_columns]
        ).with_columns(
            # Keep label values only on rows whose encoded base is 2.
            *[
                pl.when(is_cytosine).then(pl.col(name)).otherwise(pl.lit(0.)).alias(name)
                for name in label_columns
            ]
        )

        labels_by_chrsm[chrsm] = labels

    return labels_by_chrsm




In [3]:
# Load the interval table and show it.
meth_rates = pl.read_parquet("alphabeta_512.parquet")
display(meth_rates)

# Column means, with the interval length (end - start) appended as "diff".
display(
    meth_rates
    .with_columns((pl.col("end") - pl.col("start")).alias("diff"))
    .mean()
)

# Expand the intervals into per-base label frames, then preview rows
# 100-129 of the frame stored under key 1.
labels_by_chrms = meth_rates_to_labels(dna, meth_rates)
display(labels_by_chrms[1].slice(100, 30))



chrsm,slice,start,end,alpha,beta,std_st
i64,i64,i64,i64,f64,f64,f64
1,0,109,13847,0.000358,0.000777,0.315301
1,1,14028,25710,-0.000013,-0.000015,0.462841
1,2,25773,37623,0.000143,0.000895,0.137958
1,3,37633,49293,0.000266,0.00173,0.13333
1,4,49298,58021,-0.000002,-0.000005,0.233663
1,5,58023,67408,0.000375,0.002987,0.111565
1,6,67422,76161,0.000296,0.002415,0.109145
1,7,76167,84725,0.000422,0.004273,0.089841
1,8,84741,95899,0.000234,0.004251,0.052193
1,9,95905,105721,0.000044,0.005389,0.008037


chrsm,slice,start,end,alpha,beta,std_st,diff
f64,f64,f64,f64,f64,f64,f64,f64
2.945394,1121.503953,12259000.0,12270000.0,0.000897,0.004569,0.230539,10910.490991


1 2725
2 1788
3 2184
4 1718
5 2463


sequence,alpha,beta,std_st
i64,f32,f32,f32
2,0.0,0.0,0.0
2,0.0,0.0,0.0
4,0.0,0.0,0.0
1,0.0,0.0,0.0
1,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
2,0.0,0.0,0.0
2,0.000358,0.000777,0.315301
3,0.0,0.0,0.0


In [9]:
import os

# Create the output directory up front; both writers below fail if it is
# missing.
os.makedirs("./labels", exist_ok=True)

for (chrsm, labels) in labels_by_chrms.items():
    # Parquet copy: keeps column names and dtypes.
    labels.write_parquet(f"./labels/{chrsm}.parquet")
    # Raw dump: tofile() writes the array bytes only, with no shape or dtype
    # header, so a reader must know both.
    # NOTE(review): the frame mixes an i64 column with f32 columns, so
    # to_numpy() presumably upcasts everything to float64 - confirm before
    # reading the .bin files back.
    labels.to_numpy().tofile(f"./labels/{chrsm}.bin")
