In [1]:
from pathlib import Path
import random
import re

from Bio import SeqIO
import numpy as np

In [2]:
DMS_THRESHOLD = 0.2
CONSTRAINED_THRESHOLD = 0.2

In [3]:
def parse_gene_infos(records):
    """parse gene info from fasta file (for R62 genome)"""
    infos = []
    for rec in records:
        gene_id = rec.id
        name = rec.description.split(" ")[1]
        chr = "chr" + rec.description.split(" ")[4]
        coords = rec.description.split(" ")[6]
        starts = []
        ends = []
        for c in coords[:-1].split(","):
            start = int(c.split("-")[0]) - 1
            end = int(c.split("-")[1])
            if start > end:
                start, end = end, start
            starts.append(start)
            ends.append(end)
        infos.append([gene_id, name, chr, starts, ends])
    return infos

def filter(rec, query):
    regex = re.compile(query)
    return regex.search(rec.description)

def get_rec(transcripts, transcript_id):
    for rec in transcripts:
        if rec.id == transcript_id:
            return rec


def get_signals(chr, starts, ends):
    signals = []
    with open(f"../output/DMS_signal/{chr}.bin", "rb") as f:
        s = np.load(f)
    for start, end in zip(starts, ends):
        signals += list(s[start:end])
    return signals


def make_constraints(signals, threshold):
    return "".join([
        "x" if signals[i] > threshold
        else "." for i in range(len(signals))
    ])


def save(output_dir, rec, constraints, translate=False):
    Path(f"{output_dir}/{rec.id}").mkdir(parents=True, exist_ok=True)
    if translate:
        with open(f"{output_dir}/{rec.id}/protein.fasta", "w") as f:
            f.write(f">{rec.description} translated\n{rec.seq.translate()}\n")
    with open(f"{output_dir}/{rec.id}/nucleotide.fasta", "w") as f:
        f.write(f">{rec.description}\n{rec.seq}\n")
    with open(f"{output_dir}/{rec.id}/constrained.fasta", "w") as f:
        f.write(f">{rec.description}\n{rec.seq}\n{''.join(constraints)}\n")

### Przypisanie sygnałów dla 10 losowych mRNA

In [4]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/orf_coding_all_R62-1-1_20090220.fasta" # mRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['YAL049C', 'YAL049C', 'chrI', [51856], [52595]],
 ['YOL005C', 'RPB11', 'chrXV', [315814], [316175]],
 ['YMR217W', 'GUA1', 'chrXIII', [701788], [703366]],
 ['YNL326C', 'PFA3', 'chrXIV', [27336], [28345]],
 ['YOR186W', 'YOR186W', 'chrXV', [683111], [683546]]]

In [5]:
processed = 0
for id, _, chr, starts, ends in infos:
    if chr == "chrplasmid":
        continue
    if processed == 20:
        break
    rec = get_rec(transcripts, id)
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/mRNA/", rec, constraints, translate=True)
    processed += 1

### Przypisanie sygnałów dla 10 losowych tRNA

In [6]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
transcripts = [rec for rec in transcripts if filter(rec, "tRNA")]
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['tT(UGU)G1', 'tT(UGU)G1', 'chrVII', [661752], [661824]],
 ['tT(CGU)K', 'TRT2', 'chrXI', [46736], [46806]],
 ['tV(AAC)J', 'tV(AAC)J', 'chrX', [378353], [378427]],
 ['tG(GCC)E', 'tG(GCC)E', 'chrV', [61890], [61959]],
 ['tT(AGU)N2', 'tT(AGU)N2', 'chrXIV', [560694], [560765]]]

In [7]:
processed = 0
for id, name, chr, starts, ends in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, id)
    if rec is None:
        continue
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/tRNA/", rec, constraints)
    processed += 1

### Przypisanie sygnałów dla 10 losowych rRNA

In [8]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
transcripts = [rec for rec in transcripts if filter(rec, "rRNA")]
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['snR61', 'SNR61', 'chrXII', [794486], [794574]],
 ['RDN25-2', 'RDN25-2', 'chrXII', [460924], [464318]],
 ['snR81', 'SNR81', 'chrXV', [234344], [234545]],
 ['RDN58-1', 'RDN58-1', 'chrXII', [455415], [455571]],
 ['snR82', 'SNR82', 'chrVII', [316790], [317058]]]

In [9]:
processed = 0
for id, name, chr, starts, ends in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, id)
    if rec is None:
        continue
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/rRNA/", rec, constraints)
    processed += 1