In [1]:
from Bio import Entrez, SeqIO
from io import StringIO

In [2]:
Entrez.email = "hajdylaf@gmail.com"

In [3]:
with open("../output_yeast_pre_threshold/structures/rRNA/blastn-hits.csv") as f:
    lines = f.readlines()
lines = [line.strip().split(",") for line in lines if line.strip()]

In [4]:
homologs = {}
for line in lines:
    query_id = line[0]
    subject_id = line[1]
    query_start = int(line[8]) - 1
    query_stop = int(line[9])
    print("Fetching", subject_id)
    with Entrez.efetch(db="nucleotide", id=subject_id, rettype="fasta", retmode="text") as handle:
        rec = list(SeqIO.parse(handle, "fasta"))[0]
    rec.seq = rec.seq[query_start:query_stop]
    if not rec.seq:
        print("No sequence found for", subject_id)
        continue
    if query_id not in homologs:
        homologs[query_id] = [rec]
    else:
        homologs[query_id] += [rec]

Fetching CP006315.2
No sequence found for CP006315.2
Fetching CP006305.2
No sequence found for CP006305.2
Fetching CP006295.2
No sequence found for CP006295.2
Fetching CP006285.2
No sequence found for CP006285.2
Fetching CP006275.2
No sequence found for CP006275.2
Fetching CP006355.2
No sequence found for CP006355.2
Fetching CP006335.2
No sequence found for CP006335.2
Fetching CP129570.1
No sequence found for CP129570.1
Fetching CP125416.1
No sequence found for CP125416.1
Fetching CP125399.1
No sequence found for CP125399.1
Fetching AP027353.1
No sequence found for AP027353.1
Fetching AP026841.1
No sequence found for AP026841.1
Fetching CP089109.1
No sequence found for CP089109.1
Fetching CP059531.2
No sequence found for CP059531.2
Fetching BK006943.2
No sequence found for BK006943.2
Fetching X85021.1
No sequence found for X85021.1
Fetching Z49379.1
No sequence found for Z49379.1
Fetching Z49378.1
No sequence found for Z49378.1
Fetching NR_132195.1
Fetching CP092958.1
Fetching CP005375

In [6]:
for query_id in homologs:
    with open(f"../output_yeast_pre_threshold/structures/rRNA/{query_id.upper()}/seqdump_nn.txt", "w") as f:
        SeqIO.write(homologs[query_id], f, "fasta")

In [6]:
with open("../output/structures/mRNA/blastp-hits.csv") as f:
    lines = f.readlines()
lines = [line.strip().split(",") for line in lines if line.strip()]

In [7]:
homologs = {}
guides = {}
for line in lines:
    query_id = line[0]
    subject_id = line[1]
    query_start = int(line[8]) - 1
    query_stop = int(line[9])

    print("Fetching", subject_id)
    with Entrez.efetch(
        db="protein", id=subject_id, rettype="gb", retmode="text"
    ) as handle:
        gb = handle.read()
    if not gb:
        print()
        continue
    if "/coded_by=" not in gb:
        print("No CDS region found for", subject_id)
        continue

    aa_rec = SeqIO.read(StringIO(gb), "genbank").format("fasta")
    aa_rec = SeqIO.read(StringIO(aa_rec), "fasta")
    if not aa_rec.seq:
        print("No sequence found for", subject_id)
        continue

    cds_coords = gb.split("/coded_by=")[1].split("\n")[0]
    if "(" in cds_coords:
        cds_coords = cds_coords.split("(")[-1].split(")")[0]
    seqid, coords = cds_coords.split(":")[0], cds_coords.split(":")[1]
    start, stop = coords.split("..")
    start = int("".join(filter(type(start).isdigit, start)))
    stop = int("".join(filter(type(stop).isdigit, stop)))
    seqid = "".join(c for c in seqid if c.isalnum() or c in "_.")
    if start > stop:
        start, stop = stop, start
    start -= 1
    print("Fetching CDS:", seqid, "for", subject_id)
    with Entrez.efetch(
        db="nucleotide", id=seqid, rettype="fasta", retmode="text"
    ) as handle:
        nn_rec = list(SeqIO.parse(handle, "fasta"))[0]
    nn_rec.seq = nn_rec.seq[start:stop]
    if not nn_rec.seq:
        print("No sequence found for", seqid)
        continue

    if query_id not in homologs:
        homologs[query_id] = [aa_rec]
    else:
        homologs[query_id] += [aa_rec]

    if query_id not in guides:
        guides[query_id] = [nn_rec]
    else:
        guides[query_id] += [nn_rec]

Fetching O13514.1
No CDS region found for O13514.1
Fetching KZV13421.1
Fetching CDS: LBMA01000001.1 for KZV13421.1
Fetching NP_009654.3
Fetching CDS: NM_001178444.3 for NP_009654.3
Fetching CAI4272395.1
Fetching CDS: CANCRS010000002.1 for CAI4272395.1
Fetching AJQ01256.1
Fetching CDS: CP004619.1 for AJQ01256.1
Fetching AJP87446.1
Fetching CDS: CP004582.2 for AJP87446.1
Fetching AJQ13484.1
Fetching CDS: CP004651.2 for AJQ13484.1
Fetching AJQ15779.1
Fetching CDS: CP004657.2 for AJQ15779.1
Fetching CAI7146735.1
Fetching CDS: CASBOU010000002.1 for CAI7146735.1
Fetching AJP98938.1
Fetching CDS: CP004613.2 for AJP98938.1
Fetching CAI4258547.1
Fetching CDS: CANBTS010000002.1 for CAI4258547.1
Fetching CAI4272761.1
Fetching CDS: CANCBQ010000002.1 for CAI4272761.1
Fetching NP_009846.1
Fetching CDS: NM_001178635.1 for NP_009846.1
Fetching AAT92609.1
Fetching CDS: AY692590.1 for AAT92609.1
Fetching AJP37359.1
Fetching CDS: CP004100.2 for AJP37359.1
Fetching CAI4293572.1
Fetching CDS: CANCGP0100000

In [8]:
for query_id in homologs:
    try:
        with open(f"../output/structures/mRNA/{query_id}/seqdump_aa.txt", "w") as f:
            SeqIO.write(homologs[query_id], f, "fasta")
    except FileNotFoundError:
        continue
for query_id in guides:
    try:
        with open(f"../output/structures/mRNA/{query_id}/pal2nal_guides.txt", "w") as f:
            SeqIO.write(guides[query_id], f, "fasta")
    except FileNotFoundError:
        continue