In [1]:
from pathlib import Path
import random
import re

from Bio import SeqIO
import pandas as pd

In [2]:
# Start from a clean slate: delete the previous contents of
# ../output/structures/ (the directory the save() calls below write into).
!rm -rf ../output/structures/*

In [3]:
# Signal strictly above this value marks a position as "x" in make_constraint.
UNPAIRED_THRESHOLD = 0.2
# Signal strictly below this value marks an A or C position as "|" in make_constraint.
PAIRED_THRESHOLD = 0.01
# Minimum fraction of "x" positions a transcript needs in order to be saved;
# with 0 the check in the processing loops never rejects anything.
CONSTRAINED_THRESHOLD = 0

In [4]:
def parse_gene_infos(records):
    """Parse gene info from FASTA records (for the R62 genome).

    The record description is split on single spaces and read positionally:
    field 1 is used as the gene name, field 4 as the chromosome label
    (returned with a "chr" prefix) and field 6 as a comma-separated list of
    "a-b" coordinate pairs.

    Every pair is treated as a 1-based inclusive range and converted to a
    0-based half-open ``[start, end)`` interval. The two numbers may appear
    in either order; they are ordered first and only then shifted, so
    "151006-147594" yields ``start=147593, end=151006``. Strand is not
    recorded in the result.

    Args:
        records: iterable of objects exposing ``id``, ``description`` and
            ``seq`` (e.g. the records yielded by ``SeqIO.parse``).

    Returns:
        A list with one ``[id, name, chromosome, starts, ends, seq]`` entry
        per record, where ``starts`` and ``ends`` are parallel lists of ints.

    Raises:
        IndexError: if a description has fewer than seven space-separated
            fields or a coordinate pair has no "-".
        ValueError: if a coordinate is not an integer.
    """
    infos = []
    for rec in records:
        fields = rec.description.split(" ")
        name = fields[1]
        chromosome = "chr" + fields[4]
        starts = []
        ends = []
        # Drop the trailing comma that sticks to the coordinate field after
        # splitting the header on spaces; rstrip leaves the field intact
        # when no comma is present.
        for segment in fields[6].rstrip(",").split(","):
            bounds = segment.split("-")
            first = int(bounds[0])
            second = int(bounds[1])
            # Order the 1-based inclusive bounds BEFORE converting, so the
            # -1 shift always lands on the lower bound.
            low, high = (first, second) if first <= second else (second, first)
            starts.append(low - 1)
            ends.append(high)
        infos.append([rec.id, name, chromosome, starts, ends, rec.seq])
    return infos

def filter(rec, query):
    """Search a record's description for the regular expression ``query``.

    Returns the ``re.Match`` object of the first hit, or ``None`` when the
    pattern does not occur anywhere in ``rec.description``.

    Note: this name shadows the builtin ``filter`` for the rest of the
    notebook.
    """
    return re.search(query, rec.description)

def get_rec(transcripts, transcript_id):
    """Return the first record in ``transcripts`` whose ``id`` equals ``transcript_id``.

    Performs a linear scan; yields ``None`` when no record matches.
    """
    matching = (candidate for candidate in transcripts if candidate.id == transcript_id)
    return next(matching, None)


def get_signals(chr, seq, starts, ends):
    """Collect per-position signal values for a transcript.

    Reads ``../output/DMS_signal/{chr}.tsv`` (tab-separated, with at least
    the columns "signal" and "base"), slices rows ``start:end`` for every
    ``(start, end)`` pair and concatenates the slices in the given order.

    The concatenated "base" column is compared with ``str(seq)``; the
    signals are returned only when the two are identical.

    Args:
        chr: chromosome label used to build the TSV file name.
        seq: expected transcript sequence (anything ``str()`` turns into the
            plain sequence string).
        starts: 0-based row offsets where each segment begins.
        ends: row offsets where each segment ends (exclusive).

    Returns:
        A list of signal values, one per position, or ``None`` when the
        bases in the table do not match ``seq``.

    Raises:
        FileNotFoundError: if the TSV for ``chr`` does not exist.
    """
    df = pd.read_csv(f"../output/DMS_signal/{chr}.tsv", sep="\t")
    signal_column = df["signal"].values
    base_column = df["base"].values

    signals = []
    reference_parts = []
    for start, end in zip(starts, ends):
        signals.extend(signal_column[start:end])
        reference_parts.append("".join(base_column[start:end]))

    # Explicit comparison instead of assert/except: asserts are removed under
    # ``python -O``, which would silently disable this sanity check.
    if str(seq) != "".join(reference_parts):
        return None
    return signals


def make_constraint(signals, seq, unpaired_threshold, paired_threshold):
    """Translate per-position signals into a constraint string.

    ``signals`` and ``seq`` are walked in parallel (stopping at the shorter
    one) and each position becomes one character:

    * "x" when the signal is strictly above ``unpaired_threshold``;
    * "|" when it is strictly below ``paired_threshold`` and the nucleotide
      is A or C;
    * "." in every other case.

    Returns the characters joined into a single string.
    """
    def symbol_for(value, base):
        # The high-signal test takes precedence over the low-signal one.
        if value > unpaired_threshold:
            return "x"
        if value < paired_threshold and base in "AC":
            return "|"
        return "."

    return "".join(symbol_for(value, base) for value, base in zip(signals, seq))


def save(output_dir, rec, constraints, translate=False):
    """Write the FASTA files for one record into ``{output_dir}/{rec.id}/``.

    The directory is created if needed (including parents). Files written:

    * ``protein.fasta`` - only when ``translate`` is true; the description
      followed by " translated" and ``rec.seq.translate()``.
    * ``nucleotide.fasta`` - description and sequence.
    * ``constrained.fasta`` - description, sequence and the constraint
      string (``constraints`` joined without a separator).

    Existing files with the same names are overwritten.
    """
    record_dir = Path(f"{output_dir}/{rec.id}")
    record_dir.mkdir(parents=True, exist_ok=True)

    if translate:
        with open(record_dir / "protein.fasta", "w") as handle:
            handle.write(f">{rec.description} translated\n{rec.seq.translate()}\n")

    with open(record_dir / "nucleotide.fasta", "w") as handle:
        handle.write(f">{rec.description}\n{rec.seq}\n")

    with open(record_dir / "constrained.fasta", "w") as handle:
        handle.write(f">{rec.description}\n{rec.seq}\n{''.join(constraints)}\n")

### Przypisanie sygnałów dla 10 losowych mRNA

In [5]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/orf_coding_all_R62-1-1_20090220.fasta" # mRNA

# Load every record, then derive the per-gene coordinate info from the headers.
with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
infos = parse_gene_infos(transcripts)
# Shuffle with a dedicated, seeded generator so that "Restart & Run All"
# selects the same random transcripts on every run.
random.Random(42).shuffle(infos)

In [6]:
# Save constraints for the first 10 shuffled mRNAs that pass every check.
# Loop variables are named transcript_id / chrom so the builtins id() and
# chr() stay usable in later cells.
processed = 0
for transcript_id, _, chrom, starts, ends, seq in infos:
    if processed == 10:
        break
    if chrom == "chrplasmid":
        continue
    rec = get_rec(transcripts, transcript_id)
    if rec is None:
        # Same guard as the tRNA / rRNA loops: never hand None to save().
        continue
    signals = get_signals(chrom, seq, starts, ends)
    if signals is None:
        # Table bases did not match the transcript sequence.
        continue
    constraint = make_constraint(signals, seq, UNPAIRED_THRESHOLD, PAIRED_THRESHOLD)
    # An empty constraint would otherwise raise ZeroDivisionError below.
    if not constraint:
        continue
    if constraint.count("x") / len(constraint) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/mRNA/", rec, constraint, translate=True)
    processed += 1

### Przypisanie sygnałów dla 10 losowych tRNA

In [7]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
# Keep only records whose description matches "tRNA".
transcripts = [rec for rec in transcripts if filter(rec, "tRNA")]
infos = parse_gene_infos(transcripts)
# Shuffle with a dedicated, seeded generator so that "Restart & Run All"
# selects the same random transcripts on every run.
random.Random(42).shuffle(infos)
infos[:5]

[['tS(GCU)L',
  'tS(GCU)L',
  'chrXII',
  [784353, 784409],
  [784389, 784453],
  Seq('GTCCCAGTGGCCGAGTGGTTAAGGCGATGCCTGCTAGGCATTGGGTTTTACCTG...ACG')],
 ['tR(ACG)D',
  'tR(ACG)D',
  'chrIV',
  [619967],
  [620038],
  Seq('TTCCTCGTGGCCCAATGGTCACGGCGTCTGGCTACGAACCAGAAGATTCCAGGT...AAG')],
 ['tD(GUC)J3',
  'tD(GUC)J3',
  'chrX',
  [374418],
  [374488],
  Seq('TCCGTGATAGTTTAATGGTCAGAATGGGCGCTTGTCGCGTGCCAGATCGGGGTT...GAG')],
 ['tH(GUG)H',
  'tH(GUG)H',
  'chrVIII',
  [62752],
  [62824],
  Seq('GCCATCTTAGTATAGTGGTTAGTACACATCGTTGTGGCCGATGAAACCCTGGTT...GCA')],
 ['tK(UUU)K',
  'tK(UUU)K',
  'chrXI',
  [578606, 578666],
  [578643, 578702],
  Seq('TCCTTGTTAGCTCAGTTGGTAGAGCGTTCGGCTTTTAACCGAAATGTCAGGGGT...GAG')]]

In [8]:
# Save constraints for the first 10 shuffled tRNAs that pass every check.
# Loop variables are named transcript_id / chrom so the builtins id() and
# chr() stay usable in later cells; the gene name is not needed here.
processed = 0
for transcript_id, _, chrom, starts, ends, seq in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, transcript_id)
    if rec is None:
        continue
    signals = get_signals(chrom, seq, starts, ends)
    if signals is None:
        # Table bases did not match the transcript sequence.
        continue
    constraint = make_constraint(signals, seq, UNPAIRED_THRESHOLD, PAIRED_THRESHOLD)
    # An empty constraint would otherwise raise ZeroDivisionError below.
    if not constraint:
        continue
    if constraint.count("x") / len(constraint) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/tRNA/", rec, constraint)
    processed += 1

### Przypisanie sygnałów dla 10 losowych rRNA

In [9]:
transcripts_path = "../data/S288C_reference_genome_R62-1-1_20090218/rna_coding_R62-1-1_20090220.fasta" # tRNA / rRNA

with open(transcripts_path) as f:
    transcripts = list(SeqIO.parse(f, "fasta"))
# Keep only records whose description matches "rRNA".
# NOTE(review): the pattern is searched in the whole description, and the
# preview below has listed snR* records - presumably snoRNAs whose free-text
# description merely mentions rRNA. Confirm whether a stricter match on the
# feature type is intended.
transcripts = [rec for rec in transcripts if filter(rec, "rRNA")]
infos = parse_gene_infos(transcripts)
# Shuffle with a dedicated, seeded generator so that "Restart & Run All"
# selects the same random transcripts on every run.
random.Random(42).shuffle(infos)
infos[:5]

[['snR56',
  'SNR56',
  'chrII',
  [88187],
  [88275],
  Seq('TTAACATGATGAAAAAATATATTAACACAGACCTGTACTGAACTTTTCGAAGTT...ACT')],
 ['snR66',
  'SNR66',
  'chrXIV',
  [586090],
  [586176],
  Seq('ATCAAATGATGAAATACCAATGCAACAGAGTCAAGCTCTGAGTTTCAAAAAGAA...ACC')],
 ['snR87',
  'SNR87',
  'chrXI',
  [430673],
  [430780],
  Seq('GTAACTGAATGATGATATAATTTGCGATCTAGGGCTAATCACTTGGAACACCGC...TTC')],
 ['snR53',
  'SNR53',
  'chrV',
  [61698],
  [61789],
  Seq('TTTGATGATGATTACACTCCATGCTAATCATGAACGTGTTCGATGTAAATTTGA...AAA')],
 ['snR82',
  'SNR82',
  'chrVII',
  [316790],
  [317058],
  Seq('ATGGCTCTTCAACACATTTCAACATGTTCAAGTAATTTGTGTTAGTGGATGACC...TTT')]]

In [10]:
# Save constraints for the first 10 shuffled records of the "rRNA" selection
# that pass every check.
# Loop variables are named transcript_id / chrom so the builtins id() and
# chr() stay usable in later cells; the gene name is not needed here.
processed = 0
for transcript_id, _, chrom, starts, ends, seq in infos:
    if processed == 10:
        break
    rec = get_rec(transcripts, transcript_id)
    if rec is None:
        continue
    signals = get_signals(chrom, seq, starts, ends)
    if signals is None:
        # Table bases did not match the transcript sequence.
        continue
    constraint = make_constraint(signals, seq, UNPAIRED_THRESHOLD, PAIRED_THRESHOLD)
    # An empty constraint would otherwise raise ZeroDivisionError below.
    if not constraint:
        continue
    if constraint.count("x") / len(constraint) < CONSTRAINED_THRESHOLD:
        continue
    save("../output/structures/rRNA/", rec, constraint)
    processed += 1