In [1]:
import os
import re
import time
import urllib
import urllib.request

import numpy as np
import pandas as pd

import bioservice_fetcher as biof

## Fetch data from NCBI database 

The loaded dataframe contains the keys needed to access each protein and nucleotide record, either via the Reference Sequence (RefSeq) database or via GenBank. Let's load the first two columns of the dataframe and read some values. The first attempt is to get the data from the RefSeq database.

In [2]:
# Reads everything that could possibly be interesting for this project

def fetch_data(read_from_web: bool = False, sample_size: "int | None" = 10) -> "pd.DataFrame | None":
    """
    Load the AMRFinderPlus reference gene catalog, either from NCBI or from the local cache.

    Parameters
    ----------
    read_from_web : bool, default False
        If True, download ``ReferenceGeneCatalog.txt`` from the NCBI FTP server, run the
        ``bioservice_fetcher`` lookups on every row and write the frame to ``resistance_df.csv``.
        Caution: this takes a very long time.
        If False, read the ``resistance_df.csv`` written by an earlier run -- this should be
        used in most cases.
    sample_size : int or None, default 10
        Number of randomly sampled catalog rows that are processed when reading from the web.
        Pass None to process the complete catalog. Ignored when reading from the cache.

    Returns
    -------
    pd.DataFrame or None
        The catalog frame, or None (after printing a hint) when the cache file is requested
        but does not exist.
    """
    cache_file = "resistance_df.csv"
    if read_from_web:
        url = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneCatalog.txt"
        # Close the HTTP response as soon as the table has been parsed.
        with urllib.request.urlopen(url) as response:
            df = pd.read_csv(response, delimiter="\t")
        if sample_size is not None:
            df = df.sample(sample_size)
        # NOTE(review): the return values of these row-wise lookups are discarded here;
        # presumably the helpers store their findings some other way -- confirm in bioservice_fetcher.
        for lookup in (
            biof.get_protein_and_parent,
            biof.get_strain_and_gene,
            biof.get_organism_strain_via_prot,
            biof.get_organism_strain_via_nuc,
        ):
            df.apply(lookup, axis=1)
        df.to_csv(cache_file, index=False)
    else:
        if not os.path.exists(cache_file):
            print("Cannot read from hard drive because file does not exist -- set read from web switch to True")
            return None
        df = pd.read_csv(cache_file)
    return df


# Read the cached catalog from disk instead of querying NCBI again.
df = fetch_data(read_from_web=False)
# Fixed seed so the preview shows the same five rows on every run.
df.sample(n=5, random_state=19)


Unnamed: 0,allele,gene_family,whitelisted_taxa,product_name,scope,type,subtype,class,subclass,refseq_protein_accession,...,refseq_protein,refseq_genome,refseq_organism,refseq_parent,rrefseq_protein,refseq_same_parent,genbank_organsim_nuc,genbank_strain_nuc,genbank_organsim_prot,genbank_strain_prot
4666,blaPLA-4,blaPLA,,class A beta-lactamase PLA-4,core,AMR,AMR,BETA-LACTAM,BETA-LACTAM,WP_032687534.1,...,class A beta-lactamase PLA-4,,Raoultella planticola,Raoultella planticola,class A beta-lactamase PLA-4,Raoultella planticola,Raoultella planticola,42-1,Raoultella planticola,42-1
5397,catB6,catB,,type B-3 chloramphenicol O-acetyltransferase C...,core,AMR,AMR,PHENICOL,CHLORAMPHENICOL,WP_063843229.1,...,type B-3 chloramphenicol O-acetyltransferase C...,,Pseudomonas aeruginosa,Gammaproteobacteria,type B-3 chloramphenicol O-acetyltransferase C...,Gammaproteobacteria,Pseudomonas aeruginosa,,Pseudomonas aeruginosa,
8260,fusA_G452S,fusA,Staphylococcus_aureus,elongation factor G,core,AMR,POINT,FUSIDIC ACID,FUSIDIC ACID,WP_000090315.1,...,,,,Staphylococcus,elongation factor G,Staphylococcus,Staphylococcus aureus subsp. aureus Mu50,Mu50,Staphylococcus aureus subsp. aureus Mu50,Mu50
1990,blaFLC-1,blaFRI,,FRI family carbapenem-hydrolyzing class A beta...,core,AMR,AMR,BETA-LACTAM,CARBAPENEM,WP_123061077.1,...,FRI family carbapenem-hydrolyzing class A beta...,,Enterobacter cloacae,Enterobacter cloacae,FRI family carbapenem-hydrolyzing class A beta...,Enterobacter cloacae,Enterobacter cloacae,FRI-3442,Enterobacter cloacae,FRI-3442
7825,,vanXY-N,,D-Ala-D-Ala dipeptidase/D-Ala-D-Ala carboxypep...,core,AMR,AMR,GLYCOPEPTIDE,VANCOMYCIN,WP_063856820.1,...,D-Ala-D-Ala dipeptidase/D-Ala-D-Ala carboxypep...,,Enterococcus faecium,Enterococcus,D-Ala-D-Ala dipeptidase/D-Ala-D-Ala carboxypep...,Enterococcus,Enterococcus faecium,UCN71,Enterococcus faecium,UCN71


## STRAIN

Because there is quite a bit of data present, my goal is to extract the correct name of the bacterial strain. Therefore I am going to find the best data source for the organism name and combine it with the best source for the exact strain name.

In [9]:
# List every column that is available in the loaded catalog.
df.columns

Index(['allele', 'gene_family', 'whitelisted_taxa', 'product_name', 'scope',
       'type', 'subtype', 'class', 'subclass', 'refseq_protein_accession',
       'refseq_nucleotide_accession', 'curated_refseq_start',
       'genbank_protein_accession', 'genbank_nucleotide_accession',
       'genbank_strand', 'genbank_start', 'genbank_stop', 'refseq_strand',
       'refseq_start', 'refseq_stop', 'pubmed_reference', 'blacklisted_taxa',
       'synonyms', 'hierarchy_node', 'db_version', 'refseq_gene',
       'refseq_protein', 'refseq_genome', 'refseq_organism', 'refseq_parent',
       'rrefseq_protein', 'refseq_same_parent', 'genbank_organsim_nuc',
       'genbank_strain_nuc', 'genbank_organsim_prot', 'genbank_strain_prot'],
      dtype='object')

In [16]:
# Judging from this small random sample, "refseq_organism", "genbank_organsim_nuc" and "genbank_organsim_prot"
# yield very similar results, although "refseq_organism" carries less information.
# "refseq_parent" and "refseq_same_parent" sometimes hold the same organism name, but sometimes a higher taxon.
# To extract the right organism name I am going to look closer at "refseq_organism", "genbank_organsim_nuc"
# and "genbank_organsim_prot".
df.loc[
    :,
    ["refseq_organism", "refseq_parent", "refseq_same_parent", "genbank_organsim_nuc", "genbank_organsim_prot"],
].sample(n=10, random_state=10)

Unnamed: 0,refseq_organism,refseq_parent,refseq_same_parent,genbank_organsim_nuc,genbank_organsim_prot
389,Simplicispira metamorpha,Simplicispira metamorpha,Simplicispira metamorpha,Simplicispira metamorpha,Simplicispira metamorpha
2293,Klebsiella pneumoniae subsp. pneumoniae,Klebsiella pneumoniae,Klebsiella pneumoniae,Klebsiella pneumoniae subsp. pneumoniae,Klebsiella pneumoniae subsp. pneumoniae
6959,Escherichia coli,Gammaproteobacteria,Gammaproteobacteria,Escherichia coli,Escherichia coli
8684,,Klebsiella,Klebsiella,Klebsiella pneumoniae,Klebsiella pneumoniae
4144,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Pseudomonas aeruginosa,Pseudomonas aeruginosa
1563,Klebsiella pneumoniae,Enterobacteriaceae,Enterobacteriaceae,Klebsiella pneumoniae,Klebsiella pneumoniae
6553,Acinetobacter baumannii,Bacteria,Bacteria,Acinetobacter baumannii,Acinetobacter baumannii
3522,Campylobacter jejuni,Campylobacter jejuni,Campylobacter jejuni,Campylobacter jejuni,Campylobacter jejuni
7193,,,,Escherichia phage 933W,Escherichia phage 933W
4930,Escherichia coli,Escherichia coli,Escherichia coli,Escherichia coli,Escherichia coli


In [26]:
# Side-by-side view of the three candidate organism sources.
df.loc[:, ["genbank_organsim_prot", "genbank_organsim_nuc", "refseq_organism"]]

Unnamed: 0,genbank_organsim_prot,genbank_organsim_nuc,refseq_organism
0,Pseudomonas aeruginosa PA38182,Pseudomonas aeruginosa PA38182,Pseudomonas aeruginosa PA38182
1,Burkholderia glumae,Burkholderia glumae,Burkholderia glumae
2,Paenibacillus sp. LC231,Paenibacillus sp. LC231,Paenibacillus sp. LC231
3,Providencia stuartii,Providencia stuartii,Providencia stuartii
4,Mycolicibacterium fortuitum,Mycolicibacterium fortuitum,Mycolicibacterium fortuitum
...,...,...,...
9184,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252,
9185,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252,
9186,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252,
9187,Burkholderia mallei ATCC 23344,Burkholderia mallei ATCC 23344,


In [63]:
# 2651 cases where the organisms found in GenBank via protein, GenBank via nucleotide and RefSeq do not match.
# A missing value (NaN) on either side of a comparison counts as a mismatch here.

diff_organism_df = df.loc[
    df["genbank_organsim_nuc"].ne(df["genbank_organsim_prot"])
    | df["genbank_organsim_nuc"].ne(df["refseq_organism"]),
    ["genbank_organsim_prot", "genbank_organsim_nuc", "refseq_organism"],
]
diff_organism_df

Unnamed: 0,genbank_organsim_prot,genbank_organsim_nuc,refseq_organism
34,,Serratia marcescens,Serratia marcescens
36,,uncultured bacterium,uncultured bacterium
45,Paracoccus denitrificans PD1222,Paracoccus denitrificans PD1222,
52,,Escherichia coli APEC O1,Escherichia coli APEC O1
58,,Streptomyces xiaopingdaonensis,Streptomyces xiaopingdaonensis
...,...,...,...
9184,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252,
9185,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252,
9186,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252,
9187,Burkholderia mallei ATCC 23344,Burkholderia mallei ATCC 23344,


In [66]:
# GenBank via nucleotide has no NaN at all, so it looks like the best source -- next, look at the exact differences.
diff_organism_df.isna().sum(axis=0)

genbank_organsim_prot     363
genbank_organsim_nuc        0
refseq_organism          2396
dtype: int64

In [75]:
# As can be seen there are 35 cases where the organism found via GenBank (nucleotide) differs from the one
# found via GenBank (protein).
# What should be done here? Are these alternative names? -- Expert knowledge required
diff_organism_df.loc[
    diff_organism_df["genbank_organsim_nuc"].ne(diff_organism_df["genbank_organsim_prot"]),
    ["genbank_organsim_nuc", "genbank_organsim_prot"],
].dropna()

Unnamed: 0,genbank_organsim_nuc,genbank_organsim_prot
126,Salmonella virus Fels2,Salmonella enterica subsp. enterica serovar Ty...
599,Bacillus phage lambda Ba02,Bacillus anthracis str. Ames
5382,Escherichia coli,Escherichia coli K-12
5600,Peptoclostridium phage p630P2,Clostridioides difficile 630
5612,Bacillus phage phBC6A52,Bacillus cereus ATCC 14579
5833,Bacillus phage phBC6A52,Bacillus cereus ATCC 14579
6103,Bacillus phage phBC6A52,Bacillus cereus ATCC 14579
6106,Staphylococcus epidermidis RP62A phage SP-beta,Staphylococcus epidermidis RP62A
6109,Bacillus phage lambda Ba02,Bacillus phage lambda Ba03
6145,Salmonella virus Fels2,Salmonella enterica subsp. enterica serovar Ty...


In [71]:
# Furthermore there are 9 cases where GenBank via nucleotide and RefSeq disagree.
# Are these alternative names? -- Expert knowledge required
diff_organism_df.loc[
    diff_organism_df["genbank_organsim_nuc"].ne(diff_organism_df["refseq_organism"]),
    ["genbank_organsim_nuc", "refseq_organism"],
].dropna()

Unnamed: 0,genbank_organsim_nuc,refseq_organism
2020,Acinetobacter sp.,Acinetobacter baumannii
4017,Klebsiella michiganensis,Klebsiella michiganensis M5al
5010,Salmonella enterica,Salmonella enterica subsp. enterica serovar In...
5382,Escherichia coli,Escherichia coli K-12
6103,Bacillus phage phBC6A52,Bacillus cereus ATCC 14579
6106,Staphylococcus epidermidis RP62A phage SP-beta,Staphylococcus epidermidis RP62A
6109,Bacillus phage lambda Ba02,Bacillus anthracis str. Ames
6125,Cytobacillus massiliigabonensis,Bacillus massiliigabonensis
6991,Bacillus phage lambda Ba02,Bacillus anthracis str. Ames


I am going to choose the organism found in GenBank via the nucleotide accession as the base organism, because it has no NaN values and seems to be complete.

In [102]:
# "genbank_strain_nuc" seems to carry more information, but that information is sometimes already part of
# "genbank_organsim_nuc": "Serratia marcescens MC620" already includes the strain "MC620".
df.loc[
    df["genbank_strain_nuc"].ne(df["genbank_strain_prot"]),
    ["genbank_strain_nuc", "genbank_strain_prot", "genbank_organsim_nuc"],
].sample(n=10, random_state=1)

Unnamed: 0,genbank_strain_nuc,genbank_strain_prot,genbank_organsim_nuc
2492,,,Stenotrophomonas maltophilia
8003,MS4,,Staphylococcus aureus
7040,,,Micromonospora zionensis
300,M16,,Stenotrophomonas maltophilia
8008,SWU02,,Streptococcus pneumoniae
6030,,,Escherichia coli
3656,,,Pseudomonas aeruginosa
107,MC620,,Serratia marcescens MC620
460,BWH49,,Escherichia coli
7224,,,Pseudomonas aeruginosa


In [160]:
# "genbank_strain_prot" carries either redundant information or none at all, so the strain found via the
# nucleotide accession will be selected.
df.loc[
    df["genbank_strain_nuc"].ne(df["genbank_strain_prot"]) & df["genbank_strain_nuc"].notna(),
    ["genbank_strain_nuc", "genbank_strain_prot", "genbank_organsim_nuc"],
]

Unnamed: 0,genbank_strain_nuc,genbank_strain_prot,genbank_organsim_nuc
52,APEC O1,,Escherichia coli APEC O1
58,DUT 180,,Streptomyces xiaopingdaonensis
59,W2.3,,Serratia marcescens W2.3
62,DSM 12546,,Saccharospirillum impatiens DSM 12546
87,NRRL ISP-5461,,Streptomyces lydicus
...,...,...,...
8630,NG196,,Neisseria gonorrhoeae
8631,NG196,,Neisseria gonorrhoeae
8675,TUM15753,,Neisseria gonorrhoeae
8676,TUM15753,,Neisseria gonorrhoeae


To find the correct and full strain name I will combine the GenBank strain (via nucleotide) with the GenBank organism (via nucleotide), but only if the strain is not already included in the organism name. Otherwise the organism name itself is used.

In [214]:
def check_for_combination(df_row: pd.Series) -> bool:
    """
    Decide whether the strain designation has to be appended to the organism name.

    Parameters
    ----------
    df_row : pd.Series
        A row providing "genbank_strain_nuc" and "genbank_organsim_nuc".

    Returns
    -------
    bool
        True if the strain is a non-empty string that does not yet occur in the organism
        name as a whole token; False if the strain is missing/empty, the organism is
        missing, or the organism name already contains the strain.
    """
    strain = df_row["genbank_strain_nuc"]
    organism = df_row["genbank_organsim_nuc"]
    if not isinstance(strain, str) or not isinstance(organism, str):
        # NaN on either side: there is nothing meaningful to combine.
        return False
    strain = strain.strip()
    if not strain:
        return False
    # Match the strain only as a whole token. A plain substring test would treat a short
    # strain such as "A" as already contained in "Acinetobacter baumannii" and drop it.
    pattern = r"(?<!\w)" + re.escape(strain) + r"(?!\w)"
    return re.search(pattern, organism) is None
    
    
# Append the strain only where check_for_combination asks for it; every other row keeps
# the bare organism name.
df["strain"] = df["genbank_organsim_nuc"].mask(
    df.apply(check_for_combination, axis=1),
    df["genbank_organsim_nuc"] + " " + df["genbank_strain_nuc"],
)
df.loc[:, ["genbank_strain_nuc", "genbank_organsim_nuc", "strain"]]

Unnamed: 0,genbank_strain_nuc,genbank_organsim_nuc,strain
0,PA38182,Pseudomonas aeruginosa PA38182,Pseudomonas aeruginosa PA38182
1,5091,Burkholderia glumae,Burkholderia glumae 5091
2,LC231,Paenibacillus sp. LC231,Paenibacillus sp. LC231
3,,Providencia stuartii,Providencia stuartii
4,FC1K,Mycolicibacterium fortuitum,Mycolicibacterium fortuitum FC1K
...,...,...,...
9184,MRSA252,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252
9185,MRSA252,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252
9186,MRSA252,Staphylococcus aureus subsp. aureus MRSA252,Staphylococcus aureus subsp. aureus MRSA252
9187,ATCC 23344,Burkholderia mallei ATCC 23344,Burkholderia mallei ATCC 23344


## Gene

In [28]:
# Inspect the gene description obtained from RefSeq.
df.loc[:, "refseq_gene"]

0       Pseudomonas aeruginosa PA38182 aac(2')-I(A267)
1                 Burkholderia glumae 5091 aac(2')-IIa
2                  Paenibacillus sp. LC231 aac(2')-IIb
3                      Providencia stuartii aac(2')-Ia
4              Mycobacterium fortuitum FC1K aac(2')-Ib
                             ...                      
9184                                               NaN
9185                                               NaN
9186                                               NaN
9187                                               NaN
9188                                               NaN
Name: refseq_gene, Length: 9189, dtype: object

## Search for connection to Wikidata

In [150]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Shared client for the public Wikidata SPARQL endpoint; search_parent_taxon sends its queries through it.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

In [151]:


def search_parent_taxon(df_row: pd.Series) -> "str | list[str] | None":
    """
    Search Wikidata for a "species of bacterium" matching the first two words of ``_organism``.

    Words ending in "." are treated as abbreviations (e.g. "Achromobacter sp."): the dot is
    removed before searching. Nobody can tell whether that means Achromobacter spanius or
    Achromobacter spiritinus, so every candidate is reported in that case.

    Parameters
    ----------
    df_row : pd.Series
        A row providing the organism name under "_organism".

    Returns
    -------
    str, list of str or None
        * abbreviated name: the entity URI if exactly one item was found, a list of URIs
          if several were found;
        * full name: the URI of the first item whose lower-cased label is contained in the
          lower-cased two-word name;
        * None if the organism is missing/empty or nothing matched.
    """
    organism = df_row["_organism"]
    if not isinstance(organism, str):
        # NaN / missing organism: nothing to look up.
        return None
    words = organism.split()[:2]
    if not words:
        # An empty search term would match every label.
        return None
    abbreviation = any(word.endswith(".") for word in words)
    parent = " ".join(word[:-1] if word.endswith(".") else word for word in words).lower()
    # Escape backslashes and double quotes so the name cannot break out of the SPARQL string literal.
    literal = parent.replace("\\", "\\\\").replace('"', '\\"')
    query = f"""SELECT ?item ?itemLabel ?itemDescription
    WHERE {{
      ?item rdfs:label ?label;
            schema:description "species of bacterium"@en.

      FILTER(LANG(?label) = "en" && CONTAINS(LCASE(?label), "{literal}"))

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    LIMIT 10
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert().get("results").get("bindings")
    results = [(entry["item"]["value"], entry["itemLabel"]["value"]) for entry in bindings]
    if abbreviation:
        uris = [uri for uri, _label in results]
        if not uris:
            # No hit at all -- previously this raised an IndexError.
            return None
        return uris if len(uris) > 1 else uris[0]
    for uri, label in results:
        if label.lower() in parent:
            return uri
    return None

# NOTE(review): `dummy` is not created in any cell visible here -- confirm it is defined before this cell runs.
# The lookup is applied row by row, i.e. search_parent_taxon is called once per row.
dummy["parent taxon"] = dummy.apply(search_parent_taxon, axis=1)
dummy.loc[:, ["_organism", "_strain", "parent taxon"]]

Unnamed: 0,_organism,_strain,parent taxon
7235,Corynebacterium diphtheriae,NCTC13129,http://www.wikidata.org/entity/Q131649
1591,Escherichia coli,10158/10184,
8240,Staphylococcus aureus subsp. aureus Mu50,Mu50,http://www.wikidata.org/entity/Q188121
4597,Burkholderia multivorans,AU23919,http://www.wikidata.org/entity/Q4999018
7471,Streptococcus pneumoniae,,http://www.wikidata.org/entity/Q221179


In [152]:
# Show the untruncated entity URI of the third row.
dummy["parent taxon"].iloc[2]

'http://www.wikidata.org/entity/Q188121'

In [None]:
# Drop rows whose _strain is empty and whose _organism contains no digit -- expert knowledge required,
# the database entries are inconsistent here.
# A missing strain (NaN) is truthy under astype(bool), so missing values are tested explicitly;
# the organism is cast to str so that a NaN organism cannot raise while scanning for digits.

dummy = dummy[
    (dummy["_strain"].notna() & dummy["_strain"].astype(str).str.strip().ne(""))
    | dummy["_organism"].astype(str).str.contains(r"\d")
]
dummy