In [7]:
from pathlib import Path
import random

from Bio import SeqIO, Entrez
import pandas as pd

In [8]:
# Contact address that Biopython attaches to NCBI Entrez requests.
# NOTE(review): no Entrez call appears in the cells visible here, and the
# address is committed in plain text -- consider loading it from the
# environment, or dropping the line if Entrez is unused.
Entrez.email = "hajdylaf@gmail.com"

In [9]:
def parse_gene_infos(records, filter=None):
    """Extract [id, name, chromosome, start] rows from FASTA records (R62 genome).

    Each record's description is split on single spaces. The second field is
    taken as the gene name, the fifth as the chromosome label (prefixed with
    "chr") and the seventh as a "start-end" coordinate field.

    Args:
        records: iterable of objects exposing ``id`` and ``description``.
        filter: optional substring; records whose description does not
            contain it are skipped. ``None`` keeps every record.

    Returns:
        A list of ``[gene_id, name, chromosome, start]`` lists with ``start``
        as an int. A record is dropped when its coordinate field contains more
        than one "-" or when its start is greater than its end.
    """
    rows = []
    for record in records:
        if filter is not None and filter not in record.description:
            continue

        gene_id = record.id
        fields = record.description.split(" ")
        gene_name = fields[1]
        chromosome = "chr" + fields[4]
        span = fields[6]

        # More than one dash: presumably a multi-segment location -- skipped.
        if span.count("-") > 1:
            continue

        bounds = span.replace(",", "").split("-")
        start, end = int(bounds[0]), int(bounds[1])

        # start > end: presumably a reverse-strand feature -- skipped.
        if start > end:
            continue

        rows.append([gene_id, gene_name, chromosome, start])
    return rows

def find_save_rec(transcripts, gene_name, transcript_id):
    """Find a transcript by id, save it as FASTA and return it.

    Args:
        transcripts: iterable of records exposing ``id`` and ``seq``; it is
            consumed only up to the first match.
        gene_name: name of the sub-directory created under ``../output``.
        transcript_id: ``id`` of the record to look for.

    Returns:
        The first record whose ``id`` equals ``transcript_id``, after writing
        it to ``../output/<gene_name>/seq.fasta``. ``None`` when no record
        matches, in which case nothing is written.
    """
    rec = next(
        (candidate for candidate in transcripts if candidate.id == transcript_id),
        None,
    )
    if rec is None:
        return None

    Path(f"../output/{gene_name}").mkdir(parents=True, exist_ok=True)
    with open(f"../output/{gene_name}/seq.fasta", "w") as fasta_out:
        fasta_out.write(f">{rec.id}\n{rec.seq}\n")
    return rec


def get_signals(sel_chr, rec):
    """Build a per-nucleotide, max-normalised signal table for one transcript.

    Args:
        sel_chr: DataFrame with a ``pos`` column holding 0-based offsets into
            ``rec.seq`` and a ``signal`` column with the value measured there.
            When a position occurs more than once, its first row is used.
        rec: record whose ``seq`` yields one nucleotide character at a time.

    Returns:
        DataFrame with columns ``nt`` and ``signal``, one row per position of
        ``rec.seq``. A position gets its measured value only when it is listed
        in ``sel_chr`` and its nucleotide is A or T; every other position gets
        0. Values are divided by the column maximum; when that maximum is 0
        the column is left as 0.0 instead of becoming NaN through 0/0.
    """
    # Index the measurements once so each nucleotide costs a dict lookup
    # rather than a scan of the whole frame.
    signal_by_pos = {}
    for pos, value in zip(sel_chr["pos"].tolist(), sel_chr["signal"].tolist()):
        # setdefault keeps the first row for a repeated position.
        signal_by_pos.setdefault(pos, value)

    # NOTE(review): only A/T positions receive a measured value -- confirm this
    # base set is the intended one for the signal data.
    rows = [
        [nt, signal_by_pos[idx] if idx in signal_by_pos and nt in "AT" else 0]
        for idx, nt in enumerate(rec.seq)
    ]
    signals = pd.DataFrame(rows, columns=["nt", "signal"])

    peak = signals["signal"].max()
    if peak == 0:
        # Nothing to scale by; keep zeros (as floats, like the normalised case).
        signals["signal"] = signals["signal"].astype(float)
    else:
        signals["signal"] = signals["signal"] / peak
    return signals


def make_save_constraints(signals, rec, gene_name, threshold=0.2):
    """Write a sequence together with a per-position constraint line.

    The file ``../output/<gene_name>/constrained.fasta`` receives three lines:
    ``>`` plus the record id, the sequence, and a constraint string with one
    character per row of ``signals``.

    Args:
        signals: DataFrame with a ``signal`` column, one row per position.
        rec: record exposing ``id`` and ``seq``.
        gene_name: name of the sub-directory under ``../output``; it is
            created when missing.
        threshold: rows whose signal is strictly greater than this value are
            marked "x"; all other rows (NaN included) are marked ".".

    Returns:
        None.
    """
    # One vectorised comparison instead of a positional lookup per row.
    above_threshold = signals["signal"] > threshold
    constraint_line = "".join("x" if flag else "." for flag in above_threshold)

    # Create the target directory here as well, so the function does not
    # depend on find_save_rec having been called first for this gene.
    out_dir = Path(f"../output/{gene_name}")
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "constrained.fasta", "w") as f:
        f.write(f">{rec.id}\n{rec.seq}\n{constraint_line}\n")

In [10]:
# Read every record of the R62 FASTA into memory, reduce each one to
# [id, name, chromosome, start], and show the first five rows as a check.
records = list(SeqIO.parse("../data/R62/rna.fna", "fasta"))
infos = parse_gene_infos(records)
infos[:5]

[['YAL002W', 'VPS8', 'chrI', 143709],
 ['YAL004W', 'YAL004W', 'chrI', 140762],
 ['YAL008W', 'FUN14', 'chrI', 136916],
 ['YAL009W', 'SPO7', 'chrI', 135856],
 ['YAL011W', 'SWC3', 'chrI', 132202]]

In [11]:
# Cutoff passed to make_save_constraints further down (instead of its 0.2 default).
THRESHOLD = 0.1

# Tab-separated signal table; later cells use its "chr", "pos" and "signal" columns.
df = pd.read_table("../data/dms_signal/combined.tsv")
df["chr"].unique()

array(['chrI', 'chrII', 'chrIII', 'chrIV', 'chrIX', 'chrMito', 'chrV',
       'chrVI', 'chrVII', 'chrVIII', 'chrX', 'chrXI', 'chrXII', 'chrXIII',
       'chrXIV', 'chrXV', 'chrXVI'], dtype=object)

In [12]:
# How many transcripts to process, and the seed that makes the draw repeatable
# across kernel restarts.
N_TRANSCRIPTS = 10
SEED = 42

# Draw without replacement so no transcript is processed and written twice.
selected_infos = random.Random(SEED).sample(infos, k=min(N_TRANSCRIPTS, len(infos)))

for TRANSCRIPT_ID, GENE_NAME, CHR, TRANSCRIPT_START in selected_infos:
    TRANSCRIPT_START = int(TRANSCRIPT_START)
    # Work on an explicit copy: assigning into a boolean-filtered slice of df
    # triggers SettingWithCopyWarning and may not behave as intended.
    sel_chr = df[df["chr"] == CHR].copy()
    # Re-base positions so that 0 is the first nucleotide of the transcript,
    # then drop everything upstream of it.
    sel_chr["pos"] = sel_chr["pos"] - TRANSCRIPT_START
    sel_chr = sel_chr[sel_chr["pos"] >= 0]
    # `records` already holds the parsed ../data/R62/rna.fna, so search it in
    # memory instead of re-parsing the file for every transcript.
    rec = find_save_rec(records, GENE_NAME, TRANSCRIPT_ID)
    transcript_signals = get_signals(sel_chr, rec)
    make_save_constraints(transcript_signals, rec, GENE_NAME, THRESHOLD)