In [1]:
from pathlib import Path
import random
import re

from Bio import SeqIO
import numpy as np

In [2]:
THRESHOLD = 0.2

In [3]:
def parse_gene_infos(records):
    """parse gene info from fasta file (for R62 genome)"""
    infos = []
    for rec in records:
        gene_id = rec.id
        name = rec.description.split(" ")[1]
        chr = "chr" + rec.description.split(" ")[4]
        coords = rec.description.split(" ")[6]
        starts = []
        ends = []
        for c in coords[:-1].split(","):
            start = int(c.split("-")[0]) - 1
            end = int(c.split("-")[1])
            if start > end:
                start, end = end, start
            starts.append(start)
            ends.append(end)
        infos.append([gene_id, name, chr, starts, ends])
    return infos

def filter(records, query):
    regex = re.compile(query)
    for rec in records:
        if bool(regex.search(rec.description)):
            yield rec

def find_save_rec(transcripts, gene_name, transcript_id, translate, output_dir):
    for rec in transcripts:
        if rec.id == transcript_id:
            Path(f"{output_dir}/{gene_name}").mkdir(parents=True, exist_ok=True)
            with open(f"{output_dir}/{gene_name}/nucleotide.fasta", "w") as f:
                f.write(f">{rec.description}\n{rec.seq}\n")
            if translate:
                with open(f"{output_dir}/{gene_name}/protein.fasta", "w") as f:
                    f.write(f">{rec.description} translated\n{rec.seq.translate()}\n")
            return rec


def get_signals(chr, starts, ends):
    signals = []
    with open(f"../output/DMS_signal/{chr}.bin", "rb") as f:
        s = np.load(f)
    for start, end in zip(starts, ends):
        signals += list(s[start:end])
    return signals


def make_save_constraints(signals, rec, gene_name, output_dir, threshold):
    constraints = [
        "x" if signals[i] > threshold
        else "." for i in range(len(signals))
    ]
    with open(f"{output_dir}/{gene_name}/constrained.fasta", "w") as f:
        f.write(f">{rec.description}\n{rec.seq}\n{''.join(constraints)}\n")

### Przypisanie sygnałów dla 10 losowych mRNA

In [4]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/orf_coding_all_R62-1-1_20090220.fasta" # mRNA

with open(transcripts_path) as f:
    records = list(SeqIO.parse(f, "fasta"))
infos = parse_gene_infos(records)
infos[:5]

[['YAL001C', 'TFC3', 'chrI', [151099, 147596], [151167, 151007]],
 ['YAL002W', 'VPS8', 'chrI', [143708], [147533]],
 ['YAL003W', 'EFB1', 'chrI', [142175, 142621], [142255, 143162]],
 ['YAL004W', 'YAL004W', 'chrI', [140761], [141409]],
 ['YAL005C', 'SSA1', 'chrI', [139505], [141432]]]

In [5]:
genes_to_process = random.sample(infos, k=10)
[id for id, _, _, _, _ in genes_to_process]

['YGR064W',
 'YDL240W',
 'YDR492W',
 'YLR265C',
 'YIL096C',
 'YPL137C',
 'YLR255C',
 'YER171W',
 'YLR227W-B',
 'YOL040C']

In [6]:
for id, name, chr, starts, ends in genes_to_process:
    transcripts = SeqIO.parse(transcripts_path, "fasta")
    rec = find_save_rec(transcripts, name, id, True, "../output/structures/mRNA")
    signals = get_signals(chr, starts, ends)
    make_save_constraints(signals, rec, name, "../output/structures/mRNA", THRESHOLD)

### Przypisanie sygnałów dla 10 losowych tRNA

In [7]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    records = list(SeqIO.parse(f, "fasta"))
records = filter(records, "tRNA")
infos = parse_gene_infos(records)
infos[:5]

[['tA(UGC)A', 'tA(UGC)A', 'chrI', [166267], [166340]],
 ['tL(CAA)A', 'SUP56', 'chrI', [181134, 181204], [181172, 181248]],
 ['tP(UGG)A', 'TRN1', 'chrI', [139153, 139220], [139189, 139256]],
 ['tS(AGA)A', 'tS(AGA)A', 'chrI', [182516], [182596]],
 ['tC(GCA)B', 'tC(GCA)B', 'chrII', [643002], [643072]]]

In [8]:
genes_to_process = random.sample(infos, k=10)
[id for id, _, _, _, _ in genes_to_process]

['tC(GCA)P1',
 'tL(UAA)B2',
 'tA(UGC)G',
 'tK(UUU)P',
 'tE(UUC)E2',
 'tM(CAU)M',
 'tA(AGC)L',
 'tR(ACG)L',
 'tG(GCC)O2',
 'tW(CCA)M']

In [9]:
for id, name, chr, starts, ends in genes_to_process:
    transcripts = SeqIO.parse(transcripts_path, "fasta")
    transcripts = filter(transcripts, "tRNA")
    rec = find_save_rec(transcripts, name, id, False, "../output/structures/tRNA")
    signals = get_signals(chr, starts, ends)
    make_save_constraints(signals, rec, name, "../output/structures/tRNA", THRESHOLD)

### Przypisanie sygnałów dla 10 losowych rRNA

In [10]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    records = list(SeqIO.parse(f, "fasta"))
records = filter(records, "rRNA")
infos = parse_gene_infos(records)
infos[:5]

[['HRA1', 'HRA1', 'chrI', [99305], [99869]],
 ['snR18', 'SNR18', 'chrI', [142368], [142470]],
 ['snR161', 'SNR161', 'chrII', [307185], [307344]],
 ['snR56', 'SNR56', 'chrII', [88187], [88275]],
 ['snR189', 'SNR189', 'chrIII', [178606], [178793]]]

In [11]:
genes_to_process = random.sample(infos, k=10)
[id for id, _, _, _, _ in genes_to_process]

['RDN37-1',
 'snR74',
 'snR35',
 'snR34',
 'snR78',
 'snR31',
 'RDN25-2',
 'snR86',
 'snR47',
 'RDN18-1']

In [12]:
for id, name, chr, starts, ends in genes_to_process:
    transcripts = SeqIO.parse(transcripts_path, "fasta")
    transcripts = filter(transcripts, "rRNA")
    rec = find_save_rec(transcripts, name, id, False, "../output/structures/rRNA")
    signals = get_signals(chr, starts, ends)
    make_save_constraints(signals, rec, name, "../output/structures/rRNA", THRESHOLD)