In [1]:
from pathlib import Path
import random
import re

from Bio import SeqIO
import numpy as np

In [2]:
DMS_THRESHOLD = 0.2
CONSTRAINED_THRESHOLD = 0.2

In [3]:
def parse_gene_infos(records):
    """parse gene info from fasta file (for R62 genome)"""
    infos = []
    for rec in records:
        gene_id = rec.id
        name = rec.description.split(" ")[1]
        chr = "chr" + rec.description.split(" ")[4]
        coords = rec.description.split(" ")[6]
        starts = []
        ends = []
        for c in coords[:-1].split(","):
            start = int(c.split("-")[0]) - 1
            end = int(c.split("-")[1])
            if start > end:
                start, end = end, start
            starts.append(start)
            ends.append(end)
        infos.append([gene_id, name, chr, starts, ends])
    return infos

def filter(records, query):
    regex = re.compile(query)
    for rec in records:
        if bool(regex.search(rec.description)):
            yield rec

def get_rec(transcripts, transcript_id):
    for rec in transcripts:
        if rec.id == transcript_id:
            return rec


def get_signals(chr, starts, ends):
    signals = []
    with open(f"../output/DMS_signal/{chr}.bin", "rb") as f:
        s = np.load(f)
    for start, end in zip(starts, ends):
        signals += list(s[start:end])
    return signals


def make_constraints(signals, threshold):
    return "".join([
        "x" if signals[i] > threshold
        else "." for i in range(len(signals))
    ])


def save(output_dir, rec, constraints, translate=False):
    Path(f"{output_dir}/{rec.id}").mkdir(parents=True, exist_ok=True)
    if translate:
        with open(f"{output_dir}/{rec.id}/protein.fasta", "w") as f:
            f.write(f">{rec.description} translated\n{rec.seq.translate()}\n")
    with open(f"{output_dir}/{rec.id}/nucleotide.fasta", "w") as f:
        f.write(f">{rec.description}\n{rec.seq}\n")
    with open(f"{output_dir}/{rec.id}/constrained.fasta", "w") as f:
        f.write(f">{rec.description}\n{rec.seq}\n{''.join(constraints)}\n")

### Przypisanie sygnałów dla 10 losowych mRNA

In [4]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/orf_coding_all_R62-1-1_20090220.fasta" # mRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['YKL173W', 'SNU114', 'chrXI', [122521], [125548]],
 ['YML088W', 'UFO1', 'chrXIII', [92234], [94241]],
 ['YHR073W-A', 'YHR073W-A', 'chrVIII', [242869], [243046]],
 ['YLR034C', 'SMF3', 'chrXII', [210513], [211933]],
 ['YCR099C', 'YCR099C', 'chrIII', [300829], [301295]]]

In [6]:
processed = 0
for id, _, chr, starts, ends in infos:
    if processed == 20:
        break
    rec = get_rec(transcripts, id)
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/mRNA/", rec, constraints, translate=True)
    processed += 1

### Przypisanie sygnałów dla 10 losowych tRNA

In [None]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
transcripts = filter(transcripts, "tRNA")
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['tW(CCA)G1', 'tW(CCA)G1', 'chrVII', [287426, 287356], [287460, 287390]],
 ['tK(CUU)J', 'tK(CUU)J', 'chrX', [414958], [415031]],
 ['tK(UUU)D', 'tK(UUU)D', 'chrIV', [359576, 359636], [359613, 359672]],
 ['tF(GAA)P1', 'tF(GAA)P1', 'chrXVI', [560250, 560195], [560285, 560229]],
 ['tQ(UUG)H', 'tQ(UUG)H', 'chrVIII', [134313], [134385]]]

In [None]:
processed = 0
for id, name, chr, starts, ends in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, id)
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/tRNA/", name, rec, constraints, translate=True)
    processed += 1

### Przypisanie sygnałów dla 10 losowych rRNA

In [None]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
transcripts = filter(transcripts, "rRNA")
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['snR9', 'SNR9', 'chrXV', [407948], [408133]],
 ['snR161', 'SNR161', 'chrII', [307185], [307344]],
 ['snR56', 'SNR56', 'chrII', [88187], [88275]],
 ['snR49', 'SNR49', 'chrXIV', [716121], [716286]],
 ['snR190', 'SNR190', 'chrX', [139757], [139945]]]

In [None]:
processed = 0
for id, name, chr, starts, ends in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, id)
    if rec is None:
        continue
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/rRNA/", name, rec, constraints, translate=True)
    processed += 1