In [1]:
import os

import pandas as pd

from Bio import SeqIO, Align

In [2]:
def get_seqs(filepath: str) -> pd.DataFrame:

    seqs = []

    with open(filepath, mode="r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            seqs.append(
                pd.Series({
                    "seq_id": record.id,
                    "seq": "".join(record.seq)
                }).to_frame().T
            )

    return pd.concat(seqs)

In [3]:
DATA_DIR = "../data/"

In [4]:
mediadive_df = pd.read_csv(
    os.path.join(
        DATA_DIR,
        "media2ec-final.csv"
    )
)

mediadive_df = mediadive_df\
    .drop("Unnamed: 0", axis=1)\
    .dropna(subset="taxon_id")
mediadive_df = mediadive_df[~mediadive_df["taxon_id"].str.startswith("[{")]

print(
    "[+] Number of taxon IDs in MediaDive:",
    mediadive_df["taxon_id"].nunique()
)

[+] Number of taxon IDs in MediaDive: 7717


In [5]:
silva_taxmap = pd.read_table(
    os.path.join(
        DATA_DIR,
        "silva",
        "taxmap_embl-ebi_ena_ssu_ref_138.2.txt"
    )
)

# Discard eukaryotes
silva_taxmap = silva_taxmap[
    ~silva_taxmap["submitted_path"].str.startswith("Eukaryota;")
]

# Create sequence ID for matching with FASTA file
silva_taxmap["seq_id"] = \
    silva_taxmap["primaryAccession"] + "." + \
    silva_taxmap["start"].astype(str) + "." + \
    silva_taxmap["stop"].astype(str)

silva_taxmap

Unnamed: 0,primaryAccession,start,stop,submitted_path,submitted_name,ncbi_taxonid,seq_id
7,AB000106,1,1343,Bacteria;Proteobacteria;Alphaproteobacteria;Sp...,Sphingomonas sp.,28214,AB000106.1.1343
8,AB000278,1,1410,Bacteria;Proteobacteria;Gammaproteobacteria;Vi...,Photobacterium iliopiscarium,56192,AB000278.1.1410
9,AB000389,1,1508,Bacteria;Proteobacteria;Gammaproteobacteria;Al...,Pseudoalteromonas elyakovii,81037,AB000389.1.1508
13,AB000390,1,1428,Bacteria;Proteobacteria;Gammaproteobacteria;Vi...,Vibrio halioticoli,71388,AB000390.1.1428
14,BH771024,532,2058,Bacteria;Firmicutes;Bacilli;Lactobacillales;St...,Lactococcus lactis subsp. cremoris MG1363,416870,BH771024.532.2058
...,...,...,...,...,...,...,...
2224685,CRSC01000143,4139,5672,Bacteria;Firmicutes;Bacilli;Lactobacillales;St...,Streptococcus pneumoniae,1313,CRSC01000143.4139.5672
2224686,CRVD01000010,52257,53790,Bacteria;Firmicutes;Bacilli;Lactobacillales;St...,Streptococcus pneumoniae,1313,CRVD01000010.52257.53790
2224687,CRLF01000008,52871,54404,Bacteria;Firmicutes;Bacilli;Lactobacillales;St...,Streptococcus pneumoniae,1313,CRLF01000008.52871.54404
2224688,LT558790,1,1250,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,Candidatus Phytoplasma balanitae,1130089,LT558790.1.1250


In [6]:
silva_seqs = get_seqs(
    os.path.join(
        DATA_DIR,
        "silva",
        "SILVA_138.2_SSURef_NR99_tax_silva.fasta"
    )
)
silva_seqs

Unnamed: 0,seq_id,seq
0,AY846379.1.1791,AACCUGGUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUA...
0,AY846382.1.1778,GUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUAAGCCAU...
0,AB000393.1.1510,UGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGA...
0,AY909590.1.2352,GACUAAGCCAUGCAUGUCUAAGUAUAAACGCGUUUAUACAUGUGAA...
0,AB000480.1.1326,AGUUUGAUCCUGGCUCAGAACAACGCUGGCGGCAGGCCUAACACAU...
...,...,...
0,CXWL01005852.55988.57516,AGAGUUUGAUCCUGGCUCAGAACGAACGCUGGCGGCGUGGCUAAGG...
0,CXWL01067720.5231.6709,AGAGUUUGAUCCUAGCUCAGGAUGAACGCUAGCGGUAUGCUUAACA...
0,MVDZ01000001.237895.239457,UAUUUUAAAGAGAGUUUGAUCCUGGCUCAGGACGAACGCUGGCGGC...
0,JQDZ01000023.192.1724,AGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCUUAACA...


## Merge all datasets

In [7]:
silva_taxmap = silva_taxmap.rename(columns={"ncbi_taxonid": "taxon_id"})
silva_taxmap["taxon_id"] = silva_taxmap["taxon_id"].astype(str)
mediadive_df["taxon_id"] = mediadive_df["taxon_id"].astype(str)

merged_df = pd.merge(
    left=mediadive_df,
    right=silva_taxmap,
    on="taxon_id",
    how="left"
)

len(merged_df), len(mediadive_df["taxon_id"].unique())

(3296827, 7717)

In [8]:
merged_df = pd.merge(
    left=merged_df,
    right=silva_seqs,
    on="seq_id",
    how="left"
)

# Drop IDs without sequence
merged_df = merged_df.dropna(subset=["seq"])
merged_df

Unnamed: 0,media_id,taxon_id,component_id,KEGG cpd,Enzyme,primaryAccession,start,stop,submitted_path,submitted_name,seq_id,seq
2,1a,303,4.0,C00001,1.1.1.1 1.1.1.22 1.1.1.23 ...,KU229983,1.0,1456.0,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,KU229983.1.1456,GAGAGUUUGAAUCCUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAA...
3,1a,303,4.0,C00001,1.1.1.1 1.1.1.22 1.1.1.23 ...,JN197274,1.0,1466.0,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,JN197274.1.1466,CCGNNNCCGGCGGGNCUACACAUGNAGUCGAGCGGAUGACGGGAGC...
4,1a,303,4.0,C00001,1.1.1.1 1.1.1.22 1.1.1.23 ...,KY643484,1.0,1264.0,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,KY643484.1.1264,CGUCCUCGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGA...
5,1a,303,4.0,C00001,1.1.1.1 1.1.1.22 1.1.1.23 ...,JN222977,1.0,1454.0,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,JN222977.1.1454,GGCGAAAGGGGGCAGCUACCAUGCAGUCGAGCGGAUGACGGGAGCU...
22,1a,303,4.0,C00001,1.1.1.1 1.1.1.22 1.1.1.23 ...,AJ007910,1.0,1437.0,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,AJ007910.1.1437,UUGACGCUGGCGGCAGGCUAACACUGCAAGUCGAGCGGAUGAGAAG...
...,...,...,...,...,...,...,...,...,...,...,...,...
3296754,,487,,,,JRET01000004,1608177.0,1609697.0,Bacteria;Proteobacteria;Betaproteobacteria;Nei...,Neisseria meningitidis,JRET01000004.1608177.1609697,CAUGCUUUACACAUGCAAGUCGGACGGCAGCACAGAGAAGCUUGCU...
3296758,,487,,,,JRET01000009,293218.0,294744.0,Bacteria;Proteobacteria;Betaproteobacteria;Nei...,Neisseria meningitidis,JRET01000009.293218.294744,CAUGCUUUACACAUGCAAGUCGGACGGCAGCACAGAGAAGCUUGCU...
3296760,,487,,,,JRFG01000002,152764.0,154301.0,Bacteria;Proteobacteria;Betaproteobacteria;Nei...,Neisseria meningitidis,JRFG01000002.152764.154301,AGAGAUUGAACAUAAGAGUUUGAUCCUGGCUCAGAUUGAACGCUGC...
3296765,,487,,,,JRFG01000011,1052166.0,1053721.0,Bacteria;Proteobacteria;Betaproteobacteria;Nei...,Neisseria meningitidis,JRFG01000011.1052166.1053721,AGAGAUUGAACAUAAGAGUUUGAUCCUGGCUCAGAUUGAACGCUGG...


## Sequence similarity

In [9]:
aligner = Align.PairwiseAligner(
    mode="global",
    scoring="blastn",
    match_score=2,
    mismatch_score=-1,
    # open_gap_score=-7,
    # extend_gap_score=-2
)
print(aligner)

Pairwise sequence aligner with parameters
  wildcard: None
  match_score: 2.000000
  mismatch_score: -1.000000
  target_internal_open_gap_score: -7.000000
  target_internal_extend_gap_score: -2.000000
  target_left_open_gap_score: -7.000000
  target_left_extend_gap_score: -2.000000
  target_right_open_gap_score: -7.000000
  target_right_extend_gap_score: -2.000000
  query_internal_open_gap_score: -7.000000
  query_internal_extend_gap_score: -2.000000
  query_left_open_gap_score: -7.000000
  query_left_extend_gap_score: -2.000000
  query_right_open_gap_score: -7.000000
  query_right_extend_gap_score: -2.000000
  mode: global



In [10]:
query = merged_df["seq"].iloc[0]
query

'GAGAGUUUGAAUCCUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGACAGGAGCUUGCUUCUUGAGUGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGACAACGUUUCGAAAGGAACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGCCUAGGUCGGAUUAGCUAGUUGGUGGGGUAAUGGCUCACCAAGGCGACGAUCCGUAACUGGUCUGAGAGGAUGAUCAGUCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGAAAGCCUGAUCCAGCCAUGCCGCGUGUGUGAAGAAGGUCUUCGGAUUGUAAAGCACUUUAAGUUGGGAGGAAGGGCAGUUAGUUAAUACCUGAUUGUUUUGACGUUACCGACAGAAUAAGCACCGGCUAACUCUGUGCCAGCAGCCGCGGUAAUACAGAGGGUGCAAGCGUUAAUCGGAAUUACUGGGCGUAAAGCGCGCGUAGGUGGUUUGUUAAGUUGGAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCCAAAACUGGCAAGCUAGAGUACGGUAGAGGGUGGUGGAAUUUCCUGUGUAGCGGUGAAAUGCGUAGAUAUAGGAAGGAACACCAGUGGCGAAGGCGACCACCUGGACUGAUACUGACACUGAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCCGUAAACGAUGUCAACUAGCCGUUGGGAUCCUUGAGAUUUUAGUGGCGCAGCUAACGCAUUAAGUUGACCGCCUGGGGAGUACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAAGCAACGCGAAGAACCUUACCAGGCCUUGACAUGCAGAGAACUUUC

In [23]:
query_df = []

for _, row in merged_df.iterrows():
    query_df.append(
        pd.Series({
            "query_id": "query_id",
            "target_id": row["seq_id"],
            "score": aligner.score(query, row["seq"])
        }).to_frame().T
    )

query_df = pd.concat(query_df).sort_values("score", ascending=False)

query_df = pd.merge(
    left=query_df,
    right=merged_df.drop_duplicates(subset=["seq_id", "taxon_id", "media_id"]),
    left_on="target_id",
    right_on="seq_id",
    how="inner"
)
query_df = query_df[[
    "query_id",
    "target_id",
    "score",
    "taxon_id",
    "media_id"
]]

query_df

Unnamed: 0,query_id,target_id,score,taxon_id,media_id
0,query_id,KU229983.1.1456,2912.0,303,1a
1,query_id,KU229983.1.1456,2912.0,303,J12
2,query_id,KU229983.1.1456,2912.0,303,J75
3,query_id,KU229983.1.1456,2912.0,303,
4,query_id,KU229983.1.1456,2912.0,303,1a
...,...,...,...,...,...
3995,query_id,MG675631.1.1234,1056.0,303,
3996,query_id,MG675631.1.1234,1056.0,303,1a
3997,query_id,MG675631.1.1234,1056.0,303,J12
3998,query_id,MG675631.1.1234,1056.0,303,J75


In [40]:
# TODO: to fasten the loop, just store the best score
# TODO: to fasten the loop, check how to remove duplicates

silva_taxmap[silva_taxmap["taxon_id"] == "303"]

Unnamed: 0,primaryAccession,start,stop,submitted_path,submitted_name,taxon_id,seq_id
1246,JN048648,1,1375,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,JN048648.1.1375
3535,KU229978,1,1451,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,KU229978.1.1451
3610,KU229983,1,1456,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,KU229983.1.1456
10924,JN197274,1,1466,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,JN197274.1.1466
10983,KY643484,1,1264,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,KY643484.1.1264
...,...,...,...,...,...,...,...
2206571,FMHQ01000006,1,1401,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,FMHQ01000006.1.1401
2206573,FMHQ01000008,653,2169,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,FMHQ01000008.653.2169
2216538,LKKS01000092,42,1428,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,LKKS01000092.42.1428
2216540,LKGZ01000002,720,2557,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,Pseudomonas putida,303,LKGZ01000002.720.2557
