In [1]:
from pathlib import Path
import random
import re

from Bio import SeqIO
import numpy as np

In [2]:
DMS_THRESHOLD = 0.2
CONSTRAINED_THRESHOLD = 0.2

In [3]:
def parse_gene_infos(records):
    """parse gene info from fasta file (for R62 genome)"""
    infos = []
    for rec in records:
        gene_id = rec.id
        name = rec.description.split(" ")[1]
        chr = "chr" + rec.description.split(" ")[4]
        coords = rec.description.split(" ")[6]
        starts = []
        ends = []
        for c in coords[:-1].split(","):
            start = int(c.split("-")[0]) - 1
            end = int(c.split("-")[1])
            if start > end:
                start, end = end, start
            starts.append(start)
            ends.append(end)
        infos.append([gene_id, name, chr, starts, ends])
    return infos

def filter(records, query):
    regex = re.compile(query)
    for rec in records:
        if bool(regex.search(rec.description)):
            yield rec

def get_rec(transcripts, transcript_id):
    for rec in transcripts:
        if rec.id == transcript_id:
            return rec


def get_signals(chr, starts, ends):
    signals = []
    with open(f"../output/DMS_signal/{chr}.bin", "rb") as f:
        s = np.load(f)
    for start, end in zip(starts, ends):
        signals += list(s[start:end])
    return signals


def make_constraints(signals, threshold):
    return "".join([
        "x" if signals[i] > threshold
        else "." for i in range(len(signals))
    ])


def save(output_dir, rec, constraints, translate=False):
    Path(f"{output_dir}/{rec.id}").mkdir(parents=True, exist_ok=True)
    if translate:
        with open(f"{output_dir}/{rec.id}/protein.fasta", "w") as f:
            f.write(f">{rec.description} translated\n{rec.seq.translate()}\n")
    with open(f"{output_dir}/{rec.id}/nucleotide.fasta", "w") as f:
        f.write(f">{rec.description}\n{rec.seq}\n")
    with open(f"{output_dir}/{rec.id}/constrained.fasta", "w") as f:
        f.write(f">{rec.description}\n{rec.seq}\n{''.join(constraints)}\n")

### Przypisanie sygnałów dla 10 losowych mRNA

In [4]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/orf_coding_all_R62-1-1_20090220.fasta" # mRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['YDL006W', 'PTC1', 'chrIV', [439906], [440752]],
 ['YLR080W', 'EMP46', 'chrXII', [287916], [289251]],
 ['YBR276C', 'PPS1', 'chrII', [757616], [760038]],
 ['YGL068W', 'MNP1', 'chrVII', [375089], [375674]],
 ['YNL111C', 'CYB5', 'chrXIV', [416942], [417303]]]

In [5]:
processed = 0
for id, _, chr, starts, ends in infos:
    if chr == "chrplasmid":
        continue
    if processed == 20:
        break
    rec = get_rec(transcripts, id)
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/mRNA/", rec, constraints, translate=True)
    processed += 1

### Przypisanie sygnałów dla 10 losowych tRNA

In [6]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
transcripts = filter(transcripts, "tRNA")
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['tR(UCU)M1', 'tR(UCU)M1', 'chrXIII', [747891], [747961]],
 ['tR(ACG)D', 'tR(ACG)D', 'chrIV', [619967], [620038]],
 ['tS(AGA)M', 'tS(AGA)M', 'chrXIII', [259158], [259238]],
 ['tV(AAC)O', 'tV(AAC)O', 'chrXV', [663813], [663885]],
 ['tG(GCC)D1', 'tG(GCC)D1', 'chrIV', [83548], [83619]]]

In [7]:
processed = 0
for id, name, chr, starts, ends in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, id)
    if rec is None:
        continue
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/tRNA/", rec, constraints, translate=True)
    processed += 1

### Przypisanie sygnałów dla 10 losowych rRNA

In [8]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
transcripts = filter(transcripts, "rRNA")
infos = parse_gene_infos(transcripts)
random.shuffle(infos)
infos[:5]

[['snR44', 'SNR44', 'chrXII', [856708], [856919]],
 ['snR85', 'SNR85', 'chrXIII', [67768], [67937]],
 ['snR66', 'SNR66', 'chrXIV', [586090], [586176]],
 ['snR65', 'SNR65', 'chrIII', [177178], [177278]],
 ['RDN37-1',
  'RDN37-1',
  'chrXII',
  [455934, 455415, 451787],
  [457732, 455571, 455181]]]

In [9]:
processed = 0
for id, name, chr, starts, ends in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, id)
    if rec is None:
        continue
    signals = get_signals(chr, starts, ends)
    constraints = make_constraints(signals, DMS_THRESHOLD)
    if constraints.count("x") / len(constraints) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/rRNA/", name, rec, constraints, translate=True)
    processed += 1