In [68]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [69]:
# Convert each raw variableStep WIG file into a flat TSV table with the
# columns chr / pos / tsc (one row per data line of the WIG file).
for label in ["minus", "plus"]:
    table = ["chr\tpos\ttsc\n"]
    # Reset per file so a name left over from the previous file (or from an
    # earlier kernel run) can never silently label this file's rows.
    chrom = None
    with open(f"../data/dms_signal/GSM1297510_Yeast_10c_{label}.wig") as wig_file:
        # The first line is always skipped (presumably the WIG `track` header).
        next(wig_file, None)
        for line_no, raw_line in enumerate(wig_file, start=2):
            # Split on any run of whitespace so space-, tab- and mixed-separated
            # lines all parse, instead of deleting tabs and splitting on " ".
            fields = raw_line.split()
            if not fields:
                continue  # tolerate blank lines
            if fields[0].startswith("variableStep"):
                # Look the chromosome up by its `chrom=` key rather than by
                # position, so extra attributes on the line cannot be mistaken
                # for it.
                chrom = next(
                    (tok.split("=", 1)[1] for tok in fields[1:] if tok.startswith("chrom=")),
                    None,
                )
                if chrom is None:
                    raise ValueError(
                        f"{label}.wig line {line_no}: no chrom= in {raw_line!r}"
                    )
                continue
            if chrom is None or len(fields) != 2:
                raise ValueError(
                    f"{label}.wig line {line_no}: unexpected line {raw_line!r}"
                )
            # pos and tsc are copied verbatim (as text) from the WIG file.
            pos, tsc = fields
            table.append(f"{chrom}\t{pos}\t{tsc}\n")
    # Quick sanity check: header plus the first four converted rows.
    print(table[:5])
    with open(f"../data/dms_signal/{label}.tsv", "w") as tsv_file:
        tsv_file.writelines(table)

['chr\tpos\ttsc\n', 'chrI\t2953\t1.0\n', 'chrI\t3637\t1.0\n', 'chrI\t3639\t1.0\n', 'chrI\t3793\t1.0\n']
['chr\tpos\ttsc\n', 'chrI\t794\t1.0\n', 'chrI\t5881\t1.0\n', 'chrI\t6753\t1.0\n', 'chrI\t6984\t1.0\n']


In [70]:
# Load the converted tab-separated tables (columns: chr, pos, tsc).
minus = pd.read_table("../data/dms_signal/minus.tsv")
plus = pd.read_table("../data/dms_signal/plus.tsv")

In [71]:
# (rows, columns) of the plus table and of the minus table.
plus.shape, minus.shape

((1236653, 3), (1217741, 3))

In [72]:
# Preview the first rows of the plus table.
plus.head()

Unnamed: 0,chr,pos,tsc
0,chrI,794,1.0
1,chrI,5881,1.0
2,chrI,6753,1.0
3,chrI,6984,1.0
4,chrI,6992,1.0


In [73]:
# Preview the first rows of the minus table.
minus.head()

Unnamed: 0,chr,pos,tsc
0,chrI,2953,1.0
1,chrI,3637,1.0
2,chrI,3639,1.0
3,chrI,3793,1.0
4,chrI,3947,1.0


In [74]:
# Distinct chromosome labels present in the minus table; the cell that builds
# the per-chromosome arrays matches reference records against these labels.
minus["chr"].unique()

array(['chrI', 'chrII', 'chrIII', 'chrIV', 'chrIX', 'chrMito', 'chrV',
       'chrVI', 'chrVII', 'chrVIII', 'chrX', 'chrXI', 'chrXII', 'chrXIII',
       'chrXIV', 'chrXV', 'chrXVI'], dtype=object)

In [75]:
# Read every record of the reference FASTA into a list so it can be iterated
# more than once.
with open("../data/R62/S288C_reference_sequence_R62-1-1_20090218.fsa") as fasta_handle:
    ref = [record for record in SeqIO.parse(fasta_handle, "fasta")]

In [76]:
# For every reference sequence: build a per-base signal array from the plus and
# minus tables, normalise it, and save it as ../data/dms_signal/processed/<chr>.bin.
for r in ref:
    # Chromosome label in the same form as the tables' "chr" column. Records
    # whose description has no "chromosome=" tag fall back to "chrMito".
    # (`chrom`, not `chr`, so the Python builtin is not shadowed.)
    try:
        chrom = "chr" + r.description.split("chromosome=")[1].split("]")[0]
    except IndexError:
        chrom = "chrMito"

    # Dense per-base arrays, zero wherever the tables have no row.
    n_bases = len(r.seq)
    signal_plus = np.zeros(n_bases)
    signal_minus = np.zeros(n_bases)

    # One vectorised indexed assignment per table instead of a Python-level
    # iterrows() loop; with repeated positions the last row still wins.
    # NOTE(review): "pos" is used directly as a 0-based index. The WIG
    # variableStep spec defines positions as 1-based -- confirm whether these
    # files follow it, in which case the index should be pos - 1.
    plus_rows = plus[plus["chr"] == chrom]
    minus_rows = minus[minus["chr"] == chrom]
    signal_plus[plus_rows["pos"].to_numpy()] = plus_rows["tsc"].to_numpy()
    signal_minus[minus_rows["pos"].to_numpy()] = minus_rows["tsc"].to_numpy()

    # ReLU of difference between plus and minus signals
    # NOTE(review): both input files carry the same sample id (GSM1297510);
    # confirm they are treated/control tracks rather than strand tracks before
    # relying on this subtraction.
    signal = np.maximum(signal_plus - signal_minus, 0.0)

    # 2-8% Normalization: take the top 10% of values, drop the highest fifth of
    # those (the top 2% overall) and divide by the mean of the remaining 8%.
    # NOTE(review): if that window is empty or all zeros the division yields
    # NaN/inf -- consider guarding sparse or very short sequences.
    descending = -np.sort(-signal)
    top10 = descending[:int(0.1 * n_bases)]
    signal = signal / np.mean(top10[int(0.2 * len(top10)):])

    # save to binary (np.save writes the .npy format despite the .bin suffix)
    with open(f"../data/dms_signal/processed/{chrom}.bin", "wb") as out_file:
        np.save(out_file, signal)