In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from Bio import SeqIO

In [2]:
# Flatten each `<label>.tab` dump into a 3-column TSV (`id`, `base`, `count`).
#
# The input interleaves header lines starting with "chr" with tab-separated
# data lines of exactly three fields; the third field is ignored. Every data
# line is tagged with the most recent header seen in the same file.
for label in ["minus", "plus"]:
    table = ["id\tbase\tcount\n"]
    # Reset per file so a missing header can never inherit the previous
    # file's last id (and never falls back to the `id` builtin).
    seq_id = None
    with open(f"../output/RTS_counts/{label}.tab") as f:
        # Stream the file instead of materialising it twice in memory.
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                # Skip blank and whitespace-only lines.
                continue
            if line.startswith("chr"):
                seq_id = line
                continue
            if seq_id is None:
                raise ValueError(
                    f"{label}.tab line {line_no}: data line found before any 'chr' header"
                )
            base, count, _ = line.split("\t")
            table.append(f"{seq_id}\t{base}\t{count}\n")
    # Quick visual check of the header and the first few rows.
    print(table[:5])
    with open(f"../output/RTS_counts/{label}.tsv", "w") as f:
        f.writelines(table)

['id\tbase\tcount\n', 'chrI\tC\t0\n', 'chrI\tC\t0\n', 'chrI\tA\t0\n', 'chrI\tC\t0\n']
['id\tbase\tcount\n', 'chrI\tC\t0\n', 'chrI\tC\t0\n', 'chrI\tA\t0\n', 'chrI\tC\t0\n']


In [3]:
# Load the flattened count tables (columns: id, base, count) for both labels.
minus = pd.read_csv("../output/RTS_counts/minus.tsv", delimiter="\t")
plus = pd.read_csv("../output/RTS_counts/plus.tsv", delimiter="\t")

In [4]:
# Show the (rows, columns) shape of both tables side by side for comparison.
plus.shape, minus.shape

((12156676, 3), (12156676, 3))

In [5]:
# Preview the first rows of the `plus` table.
plus.head()

Unnamed: 0,id,base,count
0,chrI,C,0
1,chrI,C,0
2,chrI,A,0
3,chrI,C,0
4,chrI,A,0


In [6]:
# Preview the first rows of the `minus` table.
minus.head()

Unnamed: 0,id,base,count
0,chrI,C,0
1,chrI,C,0
2,chrI,A,0
3,chrI,C,0
4,chrI,A,0


In [7]:
# Read every record of the reference FASTA into a list of SeqRecord objects.
fasta_path = "../data/S288C_reference_genome_R62-1-1_20090218/S288C_reference_sequence_R62-1-1_20090218_adj.fasta"
with open(fasta_path) as fasta_handle:
    ref = [record for record in SeqIO.parse(fasta_handle, "fasta")]

In [15]:
def _chromosome_name(record):
    """Return the output file stem for a FASTA record, e.g. ``chrIV``.

    The name is taken from the ``chromosome=<name>]`` tag in the record
    description. Records whose description has no such tag are named
    ``chrMito``.
    """
    marker = "chromosome="
    if marker not in record.description:
        return "chrMito"
    return "chr" + record.description.split(marker)[1].split("]")[0]


def _normalize_2_8(count, label=""):
    """Apply 2-8% normalization to a 1-D float array of counts.

    The values are ranked in descending order; the top 2% are skipped and the
    mean of the following 8% is used as the scale factor.

    Parameters
    ----------
    count : np.ndarray
        Non-negative per-position counts.
    label : str, optional
        Name used in the warning message when the scale factor is unusable.

    Returns
    -------
    np.ndarray
        ``count`` divided by the scale factor. If the factor is zero or
        undefined (window empty or all zeros) the result contains inf/NaN,
        and a ``RuntimeWarning`` is emitted so this is not silent.
    """
    # NOTE(review): the ranking includes every position, also the T/G bases
    # that the caller zeroes afterwards - confirm this is intended.
    top10 = count[(-count).argsort()][:int(0.1 * len(count))]
    window = top10[int(0.2 * len(top10)):]
    scale = np.mean(window) if window.size else np.nan
    if not np.isfinite(scale) or scale == 0:
        warnings.warn(
            f"2-8% normalization factor for {label or 'sequence'} is {scale}; "
            "the normalized signal will contain inf/NaN",
            RuntimeWarning,
        )
    # Keep the plain division result but silence numpy's generic warnings,
    # since the explicit warning above already reports the problem.
    with np.errstate(divide="ignore", invalid="ignore"):
        return count / scale


# Loop-invariant sanity check: both tables must list the same base per row.
assert (plus["base"] == minus["base"]).all(), "plus and minus tables list different bases"

out_dir = Path("../output/DMS_signal/")
out_dir.mkdir(parents=True, exist_ok=True)

for r in ref:
    # Get the chromosome name used for the output file
    # (named `chrom` so the builtin `chr` is not shadowed).
    chrom = _chromosome_name(r)

    # Select this record's rows once and reuse them for counts and bases.
    plus_rows = plus[plus["id"] == r.id]
    count_plus = plus_rows["count"].to_numpy()
    count_minus = minus.loc[minus["id"] == r.id, "count"].to_numpy()
    if not (len(count_plus) == len(count_minus) == len(r.seq)):
        raise ValueError(
            f"{r.id}: got {len(count_plus)} plus / {len(count_minus)} minus counts "
            f"for a sequence of length {len(r.seq)}"
        )

    # ReLU of difference between plus and minus signals
    count = np.maximum((count_plus - count_minus).astype(float), 0.0)

    # 2-8% Normalization
    signal = _normalize_2_8(count, label=chrom)
    df = pd.DataFrame(
        {
            "base": plus_rows["base"].to_numpy(),
            "signal": signal,
        }
    )
    # Signal at T and G positions is forced to zero.
    df.loc[df["base"].isin(["T", "G"]), "signal"] = 0

    # save to tsv
    df.to_csv(out_dir / f"{chrom}.tsv", sep="\t", index=False)