In [1]:
from pathlib import Path
import random

from Bio import SeqIO
import numpy as np

In [2]:
# Signal cutoff handed to make_save_constraints in the processing loop:
# positions whose signal is strictly greater than this value are written
# as "x", every other position as ".".
THRESHOLD = 0.2

In [3]:
def parse_gene_infos(records, filter=None):
    """Extract gene metadata from FASTA records (R62 genome header layout).

    Each record description is split on single spaces: field 1 is used as
    the gene name, field 4 as the chromosome label (prefixed with "chr")
    and field 6 as the "first-second" coordinate pair.

    Args:
        records: Iterable of objects exposing ``id`` and ``description``.
        filter: Optional substring; when given, only records whose
            description contains it are considered.

    Returns:
        A list of ``[gene_id, name, chromosome, start, end]`` lists, where
        ``start`` is the first coordinate minus one and ``end`` is the second
        coordinate. Records whose coordinate field holds more than one "-",
        or whose ``start`` ends up greater than ``end``, are left out.
    """
    parsed = []
    for record in records:
        description = record.description
        if filter is not None and filter not in description:
            continue

        gene_id = record.id
        fields = description.split(" ")
        gene_name = fields[1]
        chromosome = "chr" + fields[4]
        raw_coords = fields[6]

        # More than one dash means several ranges in one field; skip those.
        if raw_coords.count("-") > 1:
            continue

        bounds = raw_coords.replace(",", "").split("-")
        start = int(bounds[0]) - 1
        end = int(bounds[1])

        # Keep only entries whose range runs in increasing order.
        if start <= end:
            parsed.append([gene_id, gene_name, chromosome, start, end])
    return parsed

def find_save_rec(transcripts, gene_name, transcript_id):
    """Find a transcript by id, write it to disk and return it.

    The first record in ``transcripts`` whose ``id`` equals ``transcript_id``
    is written as a two-line FASTA entry to
    ``../output/{gene_name}/seq.fasta`` (the directory is created if needed).

    Args:
        transcripts: Iterable of records exposing ``id`` and ``seq``.
        gene_name: Name of the per-gene output directory.
        transcript_id: Record id to look for.

    Returns:
        The matching record, or ``None`` (nothing is written) when no record
        has the requested id.
    """
    # Stop consuming the iterable at the first match.
    match = next(
        (candidate for candidate in transcripts if candidate.id == transcript_id),
        None,
    )
    if match is None:
        return None

    Path(f"../output/{gene_name}").mkdir(parents=True, exist_ok=True)
    with open(f"../output/{gene_name}/seq.fasta", "w") as fasta_out:
        fasta_out.write(f">{match.id}\n{match.seq}\n")
    return match


def get_signals(chr, start, end):
    """Return the ``[start:end]`` slice of the signal array stored for ``chr``.

    The array is read with ``np.load`` from
    ``../data/dms_signal/processed/{chr}.bin`` on every call.

    Args:
        chr: Chromosome label used as the file stem (e.g. "chrI").
        start: Slice start index.
        end: Slice end index (exclusive).

    Returns:
        The sliced portion of the loaded array.
    """
    signal_path = f"../data/dms_signal/processed/{chr}.bin"
    with open(signal_path, "rb") as handle:
        chromosome_signals = np.load(handle)
    window = slice(start, end)
    return chromosome_signals[window]


def make_save_constraints(signals, rec, gene_name, threshold=0.2):
    """Write a record together with a per-position constraint string.

    Every signal value strictly greater than ``threshold`` becomes "x" and
    every other value becomes "."; the resulting string is written as the
    third line of ``../output/{gene_name}/constrained.fasta``, below the
    ``>{rec.id}`` header and the sequence.

    The output directory is created here if it does not exist yet, so the
    function no longer depends on an earlier call having created it.

    Args:
        signals: Sequence of numeric signal values, one per position.
        rec: Record exposing ``id`` and ``seq``.
        gene_name: Name of the per-gene output directory.
        threshold: Cutoff above which a position is marked "x".

    Returns:
        None. The result is written to disk.
    """
    constraints = "".join(
        "x" if value > threshold else "." for value in signals
    )

    # Same directory handling as find_save_rec, so either function can run first.
    out_dir = Path(f"../output/{gene_name}")
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "constrained.fasta", "w") as f:
        f.write(f">{rec.id}\n{rec.seq}\n{constraints}\n")

In [4]:
# Read every transcript record of the R62 FASTA into memory, derive the
# per-gene metadata from the headers, and preview the first five entries.
with Path("../data/R62/rna.fna").open() as fasta_handle:
    records = [record for record in SeqIO.parse(fasta_handle, "fasta")]
infos = parse_gene_infos(records)
infos[:5]

[['YAL002W', 'VPS8', 'chrI', 143708, 147533],
 ['YAL004W', 'YAL004W', 'chrI', 140761, 141409],
 ['YAL008W', 'FUN14', 'chrI', 136915, 137512],
 ['YAL009W', 'SPO7', 'chrI', 135855, 136635],
 ['YAL011W', 'SWC3', 'chrI', 132201, 134079]]

In [5]:
# Draw the genes from a dedicated, seeded generator so that
# Restart Kernel -> Run All selects the same genes on every run.
SAMPLE_SEED = 0
N_GENES = 10
sample_rng = random.Random(SAMPLE_SEED)

# Cap the sample size: sample() raises ValueError when k exceeds the population.
genes_to_process = sample_rng.sample(infos, k=min(N_GENES, len(infos)))

# To process specific genes instead, assign genes_to_process a list of
# [gene_id, name, chromosome, start, end] entries here.

[gene_id for gene_id, _, _, _, _ in genes_to_process]

['YJL034W',
 'YKR041W',
 'YOL130W',
 'YDR418W',
 'YDR206W',
 'YJR116W',
 'YLR296W',
 'YNR030W',
 'YBR142W',
 'YMR035W']

In [6]:
# The loop variables are deliberately not called `id` / `chr`: a top-level
# loop leaves its variables in the notebook namespace, where those names
# would shadow the builtins for every later cell.
for gene_id, gene_name, chrom, start, end in genes_to_process:
    # Search the records already held in memory instead of re-parsing the
    # FASTA file once per gene.
    rec = find_save_rec(records, gene_name, gene_id)
    if rec is None:
        # Fail with the actual cause rather than an unrelated error later on.
        raise LookupError(f"transcript {gene_id!r} not found in the loaded records")

    # Coerce both bounds so manually entered entries may carry them as strings.
    signals = get_signals(chrom, int(start), int(end))
    make_save_constraints(signals, rec, gene_name, THRESHOLD)