In [110]:
from bioservices import EUtils
import pandas as pd 
import urllib
import time 
import numpy as np 

s = EUtils(email="finn.heydemann@th-koeln.de")

url = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneCatalog.txt"
df = pd.read_csv(urllib.request.urlopen(url), delimiter="\t")

In [111]:
def fetch_refseq_nucleotid(refseq_nucleotid_id: str) -> tuple[str, str, str]: 
    """
    Based on a ncbi refseq nucleotid ID the encoded protein, gene and bacterial strain is returned 
    """
    data = s.EFetch("nucleotide", refseq_nucleotid_id, retmode="dict", rettype="summary")["GBSet"]["GBSeq"]
    time.sleep(1) # wait to ensure to not be blocked
    gene, protein = data["GBSeq_definition"].split("gene for")
    protein = protein.replace(", complete CDS", "")
    strain = data["GBSeq_organism"]
    return protein.strip(), gene.strip(), strain.strip()


In [112]:

def fetch_refseq_protein(refseq_protein_id: str) -> tuple[str]: 
    """
    Based on the ncbi refseq protein ID the parent taxon, the protein name is returend
    If needed the same parent taxon is returned based on another entry
    """
    data = s.EFetch("protein", refseq_protein_id, retmode="dict", rettype="summary")["GBSet"]["GBSeq"]
    time.sleep(1) # wait to ensure to not be blocked
    parent_taxon = data["GBSeq_organism"].strip()
    definition = data["GBSeq_definition"]
    start_index, stop_index = (n:=definition.find("[")), definition.find("]", n)
    same_parent_taxon = definition[start_index+1: stop_index].strip()
    protein = (definition[:start_index] + definition[stop_index+1:])
    protein = protein.replace("MULTISPECIES: ", "").strip()
    return parent_taxon, protein, same_parent_taxon
    

In [113]:
fetch_refseq_protein(df.loc[1].refseq_protein_accession)

('Pseudomonadota',
 "kasugamycin N-acetyltransferase AAC(2')-IIa",
 'Pseudomonadota')

In [114]:
fetch_refseq_nucleotid(df.loc[1].refseq_nucleotide_accession)

("kasugamycin N-acetyltransferase AAC(2')-IIa",
 "Burkholderia glumae 5091 aac(2')-IIa",
 'Burkholderia glumae')

In [115]:

def get_protein_and_parent(df_row: pd.Series) -> pd.Series:
    """
    To a pandas dataframe row this function return a pd Series with the bacteria parent taxon and the protein it encodes
    """
    parent_taxon, protein, *_ = fetch_refseq_protein(df_row.refseq_protein_accession)
    return pd.Series([parent_taxon, protein])


sub_df = df[["product_name", "refseq_protein_accession", "refseq_nucleotide_accession"]].loc[:10]
sub_df[["_parent_taxon", "_protein"]] = sub_df.apply(get_protein_and_parent, axis=1)
sub_df

Unnamed: 0,product_name,refseq_protein_accession,refseq_nucleotide_accession,_parent_taxon,_protein
0,aminoglycoside N-acetyltransferase AAC(2')-I(A...,WP_025297907.1,NG_242157.1,Pseudomonas aeruginosa,aminoglycoside N-acetyltransferase AAC(2')-I(A...
1,kasugamycin N-acetyltransferase AAC(2')-IIa,WP_063839881.1,NG_047225.1,Pseudomonadota,kasugamycin N-acetyltransferase AAC(2')-IIa
2,kasugamycin N-acetyltransferase AAC(2')-IIb,WP_071224044.1,NG_055672.1,Paenibacillus sp. LC231,kasugamycin N-acetyltransferase AAC(2')-IIb
3,aminoglycoside N-acetyltransferase AAC(2')-Ia,WP_004918308.1,NG_047226.1,Providencia,aminoglycoside N-acetyltransferase AAC(2')-Ia
4,aminoglycoside N-acetyltransferase AAC(2')-Ib,WP_003881640.1,NG_047227.1,Mycolicibacterium fortuitum,aminoglycoside N-acetyltransferase AAC(2')-Ib
5,aminoglycoside N-acetyltransferase AAC(2')-Ic,WP_003899880.1,NG_047229.1,Mycobacterium,aminoglycoside N-acetyltransferase AAC(2')-Ic
6,aminoglycoside N-acetyltransferase AAC(2')-Id,WP_011726942.1,NG_047230.1,Mycolicibacterium smegmatis,aminoglycoside N-acetyltransferase AAC(2')-Id
7,aminoglycoside N-acetyltransferase AAC(2')-Ie,WP_010908954.1,NG_050581.1,Mycobacterium leprae,aminoglycoside N-acetyltransferase AAC(2')-Ie
8,aminoglycoside N-acetyltransferase AAC(3)-C1264,WP_034000523.1,NG_242158.1,Pseudomonas aeruginosa,aminoglycoside N-acetyltransferase AAC(3)-C1264
9,aminoglycoside N-acetyltransferase AAC(3)-C322,WP_044061792.1,NG_242159.1,Pseudomonadota,aminoglycoside N-acetyltransferase AAC(3)-C322


In [116]:
def get_strain_and_gene(df_row: pd.Series) -> pd.Series: 
    _, gene, strain = fetch_refseq_nucleotid(df_row.refseq_nucleotide_accession)
    return pd.Series([gene, strain])


sub_df[["_gene", "_strain"]] = sub_df.apply(get_strain_and_gene, axis=1)

In [117]:
sub_df.drop(["refseq_protein_accession", "refseq_nucleotide_accession"], axis=1)

Unnamed: 0,product_name,_parent_taxon,_protein,_gene,_strain
0,aminoglycoside N-acetyltransferase AAC(2')-I(A...,Pseudomonas aeruginosa,aminoglycoside N-acetyltransferase AAC(2')-I(A...,Pseudomonas aeruginosa PA38182 aac(2')-I(A267),Pseudomonas aeruginosa PA38182
1,kasugamycin N-acetyltransferase AAC(2')-IIa,Pseudomonadota,kasugamycin N-acetyltransferase AAC(2')-IIa,Burkholderia glumae 5091 aac(2')-IIa,Burkholderia glumae
2,kasugamycin N-acetyltransferase AAC(2')-IIb,Paenibacillus sp. LC231,kasugamycin N-acetyltransferase AAC(2')-IIb,Paenibacillus sp. LC231 aac(2')-IIb,Paenibacillus sp. LC231
3,aminoglycoside N-acetyltransferase AAC(2')-Ia,Providencia,aminoglycoside N-acetyltransferase AAC(2')-Ia,Providencia stuartii aac(2')-Ia,Providencia stuartii
4,aminoglycoside N-acetyltransferase AAC(2')-Ib,Mycolicibacterium fortuitum,aminoglycoside N-acetyltransferase AAC(2')-Ib,Mycobacterium fortuitum FC1K aac(2')-Ib,Mycolicibacterium fortuitum
5,aminoglycoside N-acetyltransferase AAC(2')-Ic,Mycobacterium,aminoglycoside N-acetyltransferase AAC(2')-Ic,Mycobacterium tuberculosis H37Rv aac(2')-Ic,Mycobacterium tuberculosis H37Rv
6,aminoglycoside N-acetyltransferase AAC(2')-Id,Mycolicibacterium smegmatis,aminoglycoside N-acetyltransferase AAC(2')-Id,Mycobacterium smegmatis str. MC2 155 mc2155 aa...,Mycolicibacterium smegmatis MC2 155
7,aminoglycoside N-acetyltransferase AAC(2')-Ie,Mycobacterium leprae,aminoglycoside N-acetyltransferase AAC(2')-Ie,Mycobacterium leprae Br4923 aac(2')-Ie,Mycobacterium leprae Br4923
8,aminoglycoside N-acetyltransferase AAC(3)-C1264,Pseudomonas aeruginosa,aminoglycoside N-acetyltransferase AAC(3)-C1264,Pseudomonas aeruginosa aac(3)-C1264,Pseudomonas aeruginosa
9,aminoglycoside N-acetyltransferase AAC(3)-C322,Pseudomonadota,aminoglycoside N-acetyltransferase AAC(3)-C322,Salmonella enterica subsp. enterica serovar Ty...,Salmonella enterica subsp. enterica serovar Ty...


In [118]:
def fetch_genbank_protein(genbank_protein: str) -> tuple[str, str]: 
    data = s.EFetch("protein", genbank_protein, retmode="dict", rettype="summary")["GBSet"]["GBSeq"]
    time.sleep(1)
    strain, organism = "", ""
    for t in data["GBSeq_feature-table"]["GBFeature"]: 
        if t["GBFeature_key"] == "source": 
            for x in t["GBFeature_quals"]["GBQualifier"]: 
                if x["GBQualifier_name"] == "organism": 
                    organism = x["GBQualifier_value"]
                if x["GBQualifier_name"] == "strain": 
                    strain = x["GBQualifier_value"]
    return organism, strain
    

fetch_genbank_protein("AAB17563.1")

('Mycobacterium tuberculosis H37Rv', 'H37Rv')

In [119]:
def get_organism_strain(df_row: pd.Series) -> pd.Series: 
    return pd.Series(fetch_genbank_protein(df_row.genbank_protein_accession))

sub_df = df[["product_name", "genbank_protein_accession"]].loc[:10]
sub_df[["_organism", "_strain"]] = sub_df.apply(get_organism_strain, axis=1)
sub_df

Unnamed: 0,product_name,genbank_protein_accession,_organism,_strain
0,aminoglycoside N-acetyltransferase AAC(2')-I(A...,CDI94966.1,Pseudomonas aeruginosa PA38182,PA38182
1,kasugamycin N-acetyltransferase AAC(2')-IIa,BAM16262.1,Burkholderia glumae,5091
2,kasugamycin N-acetyltransferase AAC(2')-IIb,APB03221.1,Paenibacillus sp. LC231,LC231
3,aminoglycoside N-acetyltransferase AAC(2')-Ia,AAA03550.1,Providencia stuartii,
4,aminoglycoside N-acetyltransferase AAC(2')-Ib,AAC44793.1,Mycolicibacterium fortuitum,FC1K
5,aminoglycoside N-acetyltransferase AAC(2')-Ic,AAB17563.1,Mycobacterium tuberculosis H37Rv,H37Rv
6,aminoglycoside N-acetyltransferase AAC(2')-Id,AAB41701.1,Mycolicibacterium smegmatis MC2 155,MC2 155
7,aminoglycoside N-acetyltransferase AAC(2')-Ie,CAR72650.1,Mycobacterium leprae Br4923,
8,aminoglycoside N-acetyltransferase AAC(3)-C1264,EKW6221576.1,Pseudomonas aeruginosa,
9,aminoglycoside N-acetyltransferase AAC(3)-C322,ECI2662265.1,Salmonella enterica subsp. enterica serovar Ty...,PNUSAS018916


In [123]:
# drop rows if _strain is empty and _organism has no numeric value in it -- expert knowledge required -- database shit 
dummy = dummy[np.logical_or(dummy["_strain"].astype(bool), dummy["_organism"].apply(lambda x: any(char.isdigit() for char in x)))]
dummy

Unnamed: 0,product_name,genbank_protein_accession,_organism,_strain,strain
0,aminoglycoside N-acetyltransferase AAC(2')-I(A...,CDI94966.1,Pseudomonas aeruginosa PA38182,PA38182,Pseudomonas aeruginosa PA38182
1,kasugamycin N-acetyltransferase AAC(2')-IIa,BAM16262.1,Burkholderia glumae,5091,Burkholderia glumae 5091
2,kasugamycin N-acetyltransferase AAC(2')-IIb,APB03221.1,Paenibacillus sp. LC231,LC231,Paenibacillus sp. LC231
4,aminoglycoside N-acetyltransferase AAC(2')-Ib,AAC44793.1,Mycolicibacterium fortuitum,FC1K,Mycolicibacterium fortuitum FC1K
5,aminoglycoside N-acetyltransferase AAC(2')-Ic,AAB17563.1,Mycobacterium tuberculosis H37Rv,H37Rv,Mycobacterium tuberculosis H37Rv
6,aminoglycoside N-acetyltransferase AAC(2')-Id,AAB41701.1,Mycolicibacterium smegmatis MC2 155,MC2 155,Mycolicibacterium smegmatis MC2 155
7,aminoglycoside N-acetyltransferase AAC(2')-Ie,CAR72650.1,Mycobacterium leprae Br4923,,Mycobacterium leprae Br4923
9,aminoglycoside N-acetyltransferase AAC(3)-C322,ECI2662265.1,Salmonella enterica subsp. enterica serovar Ty...,PNUSAS018916,Salmonella enterica subsp. enterica serovar Ty...


In [125]:
# combine _organism and _strain to strain -- this is probably the same name as the chromosome/genome -- expert knowledge required
dummy["strain"] = [f"{x[1]} {x[0]}" if not x[0] in x[1] else x[1] for x in zip(dummy["_strain"], dummy["_organism"])]
dummy.drop(["_strain", "_organism"], axis=1, inplace=True)
dummy

Unnamed: 0,product_name,genbank_protein_accession,strain
0,aminoglycoside N-acetyltransferase AAC(2')-I(A...,CDI94966.1,Pseudomonas aeruginosa PA38182
1,kasugamycin N-acetyltransferase AAC(2')-IIa,BAM16262.1,Burkholderia glumae 5091
2,kasugamycin N-acetyltransferase AAC(2')-IIb,APB03221.1,Paenibacillus sp. LC231
4,aminoglycoside N-acetyltransferase AAC(2')-Ib,AAC44793.1,Mycolicibacterium fortuitum FC1K
5,aminoglycoside N-acetyltransferase AAC(2')-Ic,AAB17563.1,Mycobacterium tuberculosis H37Rv
6,aminoglycoside N-acetyltransferase AAC(2')-Id,AAB41701.1,Mycolicibacterium smegmatis MC2 155
7,aminoglycoside N-acetyltransferase AAC(2')-Ie,CAR72650.1,Mycobacterium leprae Br4923
9,aminoglycoside N-acetyltransferase AAC(3)-C322,ECI2662265.1,Salmonella enterica subsp. enterica serovar Ty...


In [None]:
"""
SELECT ?item ?itemLabel ?itemDescription
WHERE {
  ?item rdfs:label ?label;
        schema:description "species of bacterium"@en.
  
  FILTER(LANG(?label) = "en" && CONTAINS(LCASE(?label), "paenibacillus sp"))
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 10
"""


In [130]:
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

In [133]:
# From https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
sparql.setQuery("""
SELECT ?item ?itemLabel ?itemDescription
WHERE {
  ?item rdfs:label ?label;
        schema:description "species of bacterium"@en.
  
  FILTER(LANG(?label) = "en" && CONTAINS(LCASE(?label), "paenibacillus sp"))
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 10
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

In [134]:
results

{'head': {'vars': ['item', 'itemLabel', 'itemDescription']},
 'results': {'bindings': [{'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q26270468'},
    'itemLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Paenibacillus sputi'},
    'itemDescription': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'species of bacterium'}}]}}