In [6]:
import pandas as pd
import numpy as np
import re

In [7]:
# Load the tab-separated file "card_prevalence.txt" into a DataFrame.
prevalence_path = "card_prevalence.txt"
prevalence = pd.read_csv(prevalence_path, sep="\t")

In [8]:
# Preview the first rows of the loaded table to check its columns.
prevalence.head()

Unnamed: 0,ARO Accession,Name,Model ID,Model Type,Pathogen,NCBI Plasmid,NCBI WGS,NCBI Chromosome,NCBI Genomic Island,Criteria,ARO Categories
0,ARO:3003390,Pseudomonas aeruginosa ompF with mutation,3,protein variant model,Escherichia coli,0.0,0.01,0.0,0.0,perfect_strict,carbapenem; cephalosporin; cephamycin; monobac...
1,ARO:3001109,SHV-52,4,protein homolog model,Escherichia coli,0.0,0.01,0.0,0.0,perfect_strict,antibiotic inactivation; carbapenem; cephalosp...
2,ARO:3001109,SHV-52,4,protein homolog model,Klebsiella pneumoniae,0.0,0.17,0.0,0.0,perfect,antibiotic inactivation; carbapenem; cephalosp...
3,ARO:3001109,SHV-52,4,protein homolog model,Staphylococcus aureus,0.0,0.01,0.0,0.0,perfect_strict,antibiotic inactivation; carbapenem; cephalosp...
4,ARO:3001109,SHV-52,4,protein homolog model,Enterobacter hormaechei,0.0,0.08,0.0,0.0,perfect_strict,antibiotic inactivation; carbapenem; cephalosp...


In [9]:
# Path of the file whose full text is loaded into `content`.
aro_file = "aro.obo"

# Read the whole file as text. The context manager guarantees the handle is
# closed as soon as the read finishes instead of being left to the garbage
# collector.
with open(aro_file, 'r') as aro_handle:
    content = aro_handle.read()


In [10]:
# One match per "[Term]" stanza: the header line followed by everything up to
# the next blank line. DOTALL lets "." cross line breaks inside a stanza.
# The trailing alternative also accepts end-of-input, so a final stanza that
# is not followed by a blank line is still captured rather than dropped.
rx_term = re.compile(r"(\[Term\]\n.+?(?:\n\n|\Z))", re.DOTALL)

# List of raw stanza strings (each still carries its trailing whitespace).
terms = rx_term.findall(content)


In [11]:
# Lookup of parsed stanzas, keyed by each stanza's "id" value.
term_dict = {}

# A field line has the form "<tag>: <value>".
rx_key = re.compile(r'(\w+): (.+)')

# Tags whose values are accumulated into a list; any other tag keeps only the
# last value seen for it within a stanza.
multi_valued = ('synonym', 'is_a', 'relationship')

for stanza in terms:
    record = {}

    for line in stanza.strip().split("\n"):
        hit = rx_key.match(line)
        if hit is None:
            # Lines that are not "<tag>: <value>" (e.g. the "[Term]" header)
            # carry no field and are skipped.
            continue

        tag, text = hit.groups()
        if tag in multi_valued:
            record.setdefault(tag, []).append(text)
        else:
            record[tag] = text

    # Every stanza is expected to define "id"; a stanza without one raises
    # KeyError here.
    term_dict[record["id"]] = record



In [14]:
# Inspect the parsed entry stored under "ARO:0000020" as a spot check.
term_dict["ARO:0000020"]

{'id': 'ARO:0000020',
 'name': 'carbapenem',
 'namespace': 'antibiotic_resistance',
 'def': '"Carbapenems are a class of beta-lactam antibiotics with a broad spectrum of antibacterial activity, and have a structure which renders them highly resistant to beta-lactamases. Carbapenem antibiotics are bactericidal, and act by inhibiting the synthesis of the peptidoglycan layer of bacterial cell walls. The peptidoglycan layer is important for cell wall structural integrity, especially in Gram-positive organisms." [PMID:11585791, PMID:15673804]',
 'is_a': ['ARO:3000007 ! beta-lactam antibiotic']}

In [24]:
# Inspect the parsed entry stored under "ARO:3001109" as a spot check.
term_dict["ARO:3001109"]

{'id': 'ARO:3001109',
 'name': 'SHV-52',
 'namespace': 'antibiotic_resistance',
 'def': '"SHV-52 is a beta-lactamase that has been found in clinical isolates." []',
 'is_a': ['ARO:3000015 ! SHV beta-lactamase']}

In [25]:
# Distinct values of the two columns used below.
pathogens = set(prevalence["Pathogen"])
resistances = set(prevalence["ARO Accession"])

# Tab-separated edge list with a header line and one "pathogen<TAB>accession"
# line per row of `prevalence`, in row order. Iterating the two columns
# directly and joining once avoids the per-row Series construction of
# iterrows() and repeated string concatenation.
pathogen_resistance = "pathogen\tresistance\n" + "".join(
    f"{name}\t{accession}\n"
    for name, accession in zip(prevalence["Pathogen"], prevalence["ARO Accession"])
)


In [26]:
def _term_row(aro):
    """Return one tab-separated line (aro, name, definition, is_a) for a term.

    `aro` must be a key of `term_dict`. 'def' and 'is_a' fall back to an empty
    string / empty list so that a term lacking either field does not raise.
    """
    entry = term_dict[aro]
    return f"{aro}\t{entry['name']}\t{entry.get('def', '')}\t{entry.get('is_a', [])}\n"


# Relationship values of interest start with this tag; the target identifier
# follows it, optionally trailed by "! <label>".
relation_prefix = "confers_resistance_to_drug_class"

# Edge list: term id -> identifier it confers resistance to.
resistance_drug = "from\tto\n"
# Distinct target identifiers collected from those relationships.
drugs = set()

for term_id, term in term_dict.items():
    for r in term.get("relationship", []):
        if r.startswith(relation_prefix):
            # Drop the tag, then keep only the identifier before the "!" label.
            aro_id = r.replace(relation_prefix, "").strip().split("!")[0].strip()
            resistance_drug += f"{term_id}\t{aro_id}\n"
            drugs.add(aro_id)

# Node tables. Each starts with a newline-terminated header; every row is
# built from the term it describes (resistance rows use the resistance term's
# own definition and is_a, not those of the last drug processed).
drug = "aro\tname\tdefinition\tis_a\n" + "".join(_term_row(aro_id) for aro_id in drugs)
resistance = "aro\tname\tdefinition\tis_a\n" + "".join(_term_row(res) for res in resistances)
pathogen = "name\n" + "".join(f"{p}\n" for p in pathogens)