In [3]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from pathlib import Path

In [4]:
# Convert each strand's .wig dump into a flat two-column TSV (id, count).
#
# The input is treated as a series of sections: a line starting with "ref|"
# names the current reference sequence, and every other non-blank line must
# have exactly three tab-separated fields, of which only the middle one (the
# count) is kept.
for label in ["minus", "plus"]:
    table = ["id\tcount\n"]
    # Reset for every file so that a data line appearing before the first
    # header is reported instead of silently inheriting a stale value.
    # Named `seq_id` (not `id`) so the builtin id() stays usable in later cells.
    seq_id = None
    with open(f"../output/RTS_counts/{label}.wig") as f:
        # Stream the file line by line instead of loading it all up front.
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank or whitespace-only line: nothing to record.
                continue
            if line.startswith("ref|"):
                seq_id = line
                continue
            if seq_id is None:
                raise ValueError(
                    f"{label}.wig line {line_no}: data line found before any 'ref|' header"
                )
            # Unpacking into exactly three names enforces the 3-field layout.
            # `_` is avoided because rebinding it disables IPython's
            # last-output variable.
            _left, count, _right = line.split("\t")
            table.append(f"{seq_id}\t{count}\n")
    print(table[:5])
    with open(f"../output/RTS_counts/{label}.tsv", "w") as f:
        f.writelines(table)

['id\tcount\n', 'ref|NC_001133|\t0\n', 'ref|NC_001133|\t0\n', 'ref|NC_001133|\t0\n', 'ref|NC_001133|\t0\n']
['id\tcount\n', 'ref|NC_001133|\t0\n', 'ref|NC_001133|\t0\n', 'ref|NC_001133|\t0\n', 'ref|NC_001133|\t0\n']


In [5]:
# Load the per-position count tables written by the conversion cell above.
# pd.read_table is the tab-separated counterpart of pd.read_csv.
minus = pd.read_table("../output/RTS_counts/minus.tsv")
plus = pd.read_table("../output/RTS_counts/plus.tsv")

In [6]:
# Display the (rows, columns) shape of both tables side by side; the
# per-chromosome subtraction later in the notebook needs matching lengths.
plus.shape, minus.shape

((12156676, 2), (12156676, 2))

In [7]:
# Peek at the first rows of the `plus` table to confirm the id/count layout.
plus.head()

Unnamed: 0,id,count
0,ref|NC_001133|,0
1,ref|NC_001133|,0
2,ref|NC_001133|,0
3,ref|NC_001133|,0
4,ref|NC_001133|,0


In [8]:
# Peek at the first rows of the `minus` table to confirm the id/count layout.
minus.head()

Unnamed: 0,id,count
0,ref|NC_001133|,0
1,ref|NC_001133|,0
2,ref|NC_001133|,0
3,ref|NC_001133|,0
4,ref|NC_001133|,0


In [11]:
# Read every FASTA record of the reference genome into a list so it can be
# iterated more than once.
with open(
    "../data/S288C_reference_genome_R62-1-1_20090218/"
    "S288C_reference_sequence_R62-1-1_20090218.fsa"
) as f:
    ref = [record for record in SeqIO.parse(f, "fasta")]

In [12]:
# Build one normalised signal track per reference record and save it to disk.

# The output directory is the same for every record, so create it once.
Path("../output/DMS_signal/").mkdir(parents=True, exist_ok=True)

for r in ref:
    # Derive the chromosome name from the "chromosome=<name>]" tag in the FASTA
    # description; records without that tag are labelled chrMito.
    # NOTE(review): presumably only the mitochondrial record lacks the tag —
    # any other untagged record would overwrite chrMito.bin; confirm.
    # Named `chrom` (not `chr`) so the builtin chr() stays usable in later cells.
    if "chromosome=" in r.description:
        chrom = "chr" + r.description.split("chromosome=")[1].split("]")[0]
    else:
        chrom = "chrMito"

    # Per-position counts for this record, in file order.
    signal_plus = plus.loc[plus["id"] == r.id, "count"].to_numpy()
    signal_minus = minus.loc[minus["id"] == r.id, "count"].to_numpy()

    # Fail early with a readable message if the tracks do not line up with the
    # sequence (e.g. the record id is missing from one of the tables).
    if not (len(signal_plus) == len(signal_minus) == len(r.seq)):
        raise ValueError(
            f"{chrom} ({r.id}): signal lengths plus={len(signal_plus)}, "
            f"minus={len(signal_minus)} do not match sequence length {len(r.seq)}"
        )

    # ReLU of difference between plus and minus signals. The float literal
    # keeps the result float64, as the original np.zeros comparison did.
    count = np.maximum(signal_plus - signal_minus, 0.0)

    # 2-8% normalisation: take the top 10% of values, discard the highest
    # fifth of them (the top 2% overall), and divide the whole track by the
    # mean of the remaining 8%.
    # NOTE: if that band is all zeros (or empty for very short tracks) the
    # scale is 0/NaN and the saved track will contain inf/NaN.
    descending = count[(-count).argsort()]
    top10 = descending[:int(0.1 * len(count))]
    scale = np.mean(top10[int(0.2 * len(top10)):])
    count = count / scale

    # np.save writes NumPy's .npy format (read it back with np.load). An open
    # handle is passed because np.save appends ".npy" to bare filenames, which
    # would change the ".bin" name.
    with open(f"../output/DMS_signal/{chrom}.bin", "wb") as f:
        np.save(f, count)