In [15]:
from pathlib import Path
from time import sleep
import xml.etree.ElementTree as ET

from Bio import SeqIO, Entrez
import pandas as pd

In [16]:
# Contact address attached to the Entrez requests made by get_gene_info.
# NOTE(review): this is a hard-coded personal address; consider loading it
# from configuration or the environment before sharing the notebook.
Entrez.email = "hajdylaf@gmail.com"

In [17]:
def get_gene_info(gene_id):
    """Look up a gene in the NCBI ``gene`` database and summarise its location.

    Runs an ``esearch`` for ``gene_id``, fetches the document summary of the
    first hit and reads the ``Name``, ``Chromosome`` and ``ChrStart`` elements
    from the returned XML.

    Args:
        gene_id: Search term handed to ``Entrez.esearch``.

    Returns:
        ``[gene_id, name, chromosome, start]`` where ``chromosome`` is the
        ``Chromosome`` text prefixed with ``"chr"`` and ``start`` is the
        ``ChrStart`` text (still a string).

    Raises:
        IndexError: The search returned no ids for ``gene_id``.
        AttributeError: The document summary lacks one of the expected
            elements.
    """
    handle = Entrez.esearch(db="gene", term=gene_id)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    id_list = record["IdList"]
    if not id_list:
        # Same exception type as indexing the empty list, with a usable message.
        raise IndexError(f"no NCBI gene entry found for {gene_id!r}")

    handle = Entrez.efetch(db="gene", id=id_list[0], rettype="docsum")
    try:
        tree = ET.parse(handle)
    finally:
        handle.close()

    name = tree.find(".//Name").text
    # Named `chromosome` rather than `chr` so the builtin is not shadowed.
    chromosome = "chr" + tree.find(".//Chromosome").text
    start = tree.find(".//ChrStart").text
    return [gene_id, name, chromosome, start]


def find_save_rec(transcripts, gene_name, transcript_id):
    """Find a transcript by id and write it to ``../output/<gene_name>/seq.fasta``.

    Args:
        transcripts: Iterable of records exposing ``.id`` and ``.seq``. It is
            consumed only up to and including the first match.
        gene_name: Name of the output sub-directory (created if missing).
        transcript_id: Record id to look for.

    Returns:
        The first record whose ``id`` equals ``transcript_id``, or ``None``
        when there is no such record (nothing is written in that case).
    """
    match = next(
        (candidate for candidate in transcripts if candidate.id == transcript_id),
        None,
    )
    if match is None:
        return None

    Path(f"../output/{gene_name}").mkdir(parents=True, exist_ok=True)
    with open(f"../output/{gene_name}/seq.fasta", "w") as fasta:
        fasta.write(f">{match.id}\n{match.seq}\n")
    return match


def get_signals(sel_chr, rec):
    """Build a per-nucleotide signal table for one transcript.

    Each position ``idx`` of ``rec.seq`` receives the ``signal`` of the first
    row of ``sel_chr`` whose ``pos`` equals ``idx``, but only when the
    nucleotide is ``A`` or ``T``; every other position gets 0. The column is
    then scaled by its maximum.

    Args:
        sel_chr: DataFrame with ``pos`` and ``signal`` columns; ``pos`` is
            compared against 0-based indices into ``rec.seq``.
        rec: Record exposing ``.seq``, iterated one nucleotide at a time.

    Returns:
        DataFrame with columns ``nt`` and ``signal`` (float), one row per
        nucleotide. When the maximum signal is 0 the column is left at 0.0
        rather than being turned into NaN by a 0/0 division.
    """
    # One pass over sel_chr instead of a full array scan per nucleotide.
    # setdefault keeps the first row for a duplicated position.
    signal_by_pos = {}
    for pos, value in zip(sel_chr["pos"].tolist(), sel_chr["signal"].tolist()):
        signal_by_pos.setdefault(pos, value)

    rows = [
        [nt, signal_by_pos.get(idx, 0) if nt in "AT" else 0]
        for idx, nt in enumerate(rec.seq)
    ]
    signals = pd.DataFrame(rows, columns=["nt", "signal"])
    signals["signal"] = signals["signal"].astype(float)

    peak = signals["signal"].max()
    if peak != 0:
        signals["signal"] = signals["signal"] / peak
    return signals


def make_save_constraints(signals, rec, gene_name, threshold=0.2):
    """Write the sequence plus a constraint line to ``constrained.fasta``.

    The constraint line has one character per row of ``signals``: ``x`` where
    ``signal`` is strictly greater than ``threshold`` and ``.`` everywhere
    else (including NaN).
    NOTE(review): presumably a folding-constraint string in which ``x`` marks
    an unpaired base; confirm against the downstream tool.

    Args:
        signals: DataFrame with a ``signal`` column, one row per nucleotide.
        rec: Record exposing ``.id`` and ``.seq``.
        gene_name: Name of the sub-directory under ``../output``.
        threshold: Signal cut-off; values above it are marked ``x``.

    Returns:
        None. The file ``../output/<gene_name>/constrained.fasta`` is
        (over)written.
    """
    constraints = "".join(
        "x" if above else "." for above in signals["signal"] > threshold
    )

    # Create the directory here so the function does not depend on
    # find_save_rec having been called first.
    out_dir = Path(f"../output/{gene_name}")
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "constrained.fasta", "w") as f:
        f.write(f">{rec.id}\n{rec.seq}\n{constraints}\n")

In [18]:
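# One-off step, kept for provenance: builds ../data/derived/tRNAinfo.tsv, which is loaded further down.
# with open("../data/derived/tRNA.txt") as f: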
#     trnas = [line.strip() for line in f.readlines()]

# lines = []
# transcripts = SeqIO.parse("../data/R64/rna.fna", "fasta")
# for rec in transcripts:
#     if rec.id not in trnas: # only tRNA
#         continue
#     info = get_gene_info(rec.id)
#     line = "\t".join(info) + "\n"
#     lines.append(line)
#     sleep(0.1)

# with open("../data/derived/tRNAinfo.tsv", "w") as f:
#     f.writelines(lines)

In [19]:
# Load the cached gene table: one tab-separated row per transcript, split into fields.
with open("../data/derived/tRNAinfo.tsv") as f:
    infos = [row.strip().split("\t") for row in f]
infos[:5]

[['NM_001178212.1', 'SEN34', 'chrI', '158965'],
 ['NM_001178253.1', 'FMT1', 'chrII', '202055'],
 ['NM_001178264.1', 'NCL1', 'chrII', '172533'],
 ['NM_001178297.2', 'PTH2', 'chrII', '113426'],
 ['NM_001178316.1', 'ILS1', 'chrII', '84260']]

In [20]:
# Signal cut-off handed to make_save_constraints in the processing loop.
THRESHOLD = 0.2

# Tab-separated signal table; its "chr", "pos" and "signal" columns are used
# by the processing loop and get_signals.
df = pd.read_csv("../data/dms_signal/combined.tsv", sep="\t")
# Display the chromosome labels present in the table.
df["chr"].unique()

array(['chrI', 'chrII', 'chrIII', 'chrIV', 'chrIX', 'chrMito', 'chrV',
       'chrVI', 'chrVII', 'chrVIII', 'chrX', 'chrXI', 'chrXII', 'chrXIII',
       'chrXIV', 'chrXV', 'chrXVI'], dtype=object)

In [21]:
# Parse the transcript FASTA once; it used to be re-read from disk for every gene.
transcripts = list(SeqIO.parse("../data/R64/rna.fna", "fasta"))

# For every gene: select its chromosome's signal rows, re-base positions onto
# the transcript start, then write the sequence and its constraint file.
for TRANSCRIPT_ID, GENE_NAME, CHR, TRANSCRIPT_START in infos:
    TRANSCRIPT_START = int(TRANSCRIPT_START)

    sel_chr = df[df["chr"] == CHR]
    # .assign builds a new frame; assigning through .loc on the filtered slice
    # triggers SettingWithCopyWarning.
    sel_chr = sel_chr.assign(pos=sel_chr["pos"] - TRANSCRIPT_START)
    # Rows before the transcript start cannot map onto the sequence.
    sel_chr = sel_chr[sel_chr["pos"] >= 0]

    rec = find_save_rec(transcripts, GENE_NAME, TRANSCRIPT_ID)
    if rec is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise LookupError(
            f"transcript {TRANSCRIPT_ID!r} ({GENE_NAME}) not found in ../data/R64/rna.fna"
        )

    transcript_signals = get_signals(sel_chr, rec)
    make_save_constraints(transcript_signals, rec, GENE_NAME, THRESHOLD)