In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the prevalence table; the input file is tab-separated.
prevalence = pd.read_csv(
    "data/card_prevalence.txt",
    delimiter="\t",
)

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
prevalence.head()

Unnamed: 0,ARO Accession,Name,Model ID,Model Type,Pathogen,NCBI Plasmid,NCBI WGS,NCBI Chromosome,NCBI Genomic Island,Criteria,ARO Categories
0,ARO:3002501,PDC-4,1,protein homolog model,Pseudomonas aeruginosa,0.0,0.04,0.0,0.0,perfect_strict,antibiotic inactivation; carbapenem; cephalosp...
1,ARO:3002999,CblA-1,2,protein homolog model,Phocaeicola dorei,0.0,1.92,0.0,0.0,perfect,antibiotic inactivation; cephalosporin
2,ARO:3002999,CblA-1,2,protein homolog model,Phocaeicola dorei,0.0,1.92,0.0,0.0,perfect_strict,antibiotic inactivation; cephalosporin
3,ARO:3001109,SHV-52,4,protein homolog model,Klebsiella pneumoniae,0.0,0.15,0.0,0.0,perfect,antibiotic inactivation; carbapenem; cephalosp...
4,ARO:3001109,SHV-52,4,protein homolog model,Klebsiella pneumoniae,0.0,0.16,0.0,0.0,perfect_strict,antibiotic inactivation; carbapenem; cephalosp...


In [12]:
# List the distinct values found in the "Model Type" column.
prevalence["Model Type"].unique()

array(['protein homolog model', 'protein variant model',
       'protein overexpression model', 'rRNA gene variant model'],
      dtype=object)

In [4]:
aro_file = "data/aro.obo"

# Read the whole .obo file into memory. The context manager closes the handle
# deterministically (also when the read raises) instead of leaving an open
# file object for the garbage collector.
with open(aro_file, 'r') as handle:
    content = handle.read()


In [5]:
# Capture every "[Term]" stanza of the file text. A stanza normally ends at the
# first blank line; the `\Z` alternative also terminates a final stanza that
# runs to end-of-file without a trailing blank line, which would otherwise be
# silently dropped. DOTALL lets `.` span the newlines inside a stanza.
rx_term = re.compile(r"(\[Term\]\n.+?(?:\n\n|\Z))", re.DOTALL)

# One string per stanza (the pattern has a single capturing group).
terms = rx_term.findall(content)


In [17]:
# Parse every stanza in `terms` into a {tag: value} record and index the
# records by their "id" tag.
term_dict = {}

# "tag: value" at the start of a line; the tag is a run of word characters.
rx_key = re.compile(r'(\w+): (.+)')

for stanza in terms:
    record = {}

    for line in stanza.strip().split("\n"):
        parsed = rx_key.match(line)
        if parsed is None:
            # Not a "tag: value" line (e.g. the "[Term]" header) -- skip it.
            continue

        tag, text = parsed.group(1), parsed.group(2)

        if tag in ('synonym', 'is_a', 'relationship'):
            # These tags are collected into a list, one entry per occurrence.
            record.setdefault(tag, []).append(text)
        elif tag == "def":
            # Definitions are stored with every double quote removed.
            record[tag] = text.replace('"', "")
        else:
            # Any other tag keeps a single value; a repeated tag overwrites.
            record[tag] = text

    # Raises KeyError if the stanza carried no "id" tag.
    term_dict[record["id"]] = record



In [18]:
# Spot-check one parsed term record by its ARO id.
term_dict["ARO:0000020"]

{'id': 'ARO:0000020',
 'name': 'carbapenem',
 'namespace': 'antibiotic_resistance',
 'def': 'Carbapenems are a class of beta-lactam antibiotics with a broad spectrum of antibacterial activity, and have a structure which renders them highly resistant to beta-lactamases. Carbapenem antibiotics are bactericidal, and act by inhibiting the synthesis of the peptidoglycan layer of bacterial cell walls. The peptidoglycan layer is important for cell wall structural integrity, especially in Gram-positive organisms. [PMID:11585791, PMID:15673804]',
 'is_a': ['ARO:3000007 ! beta-lactam antibiotic']}

In [19]:
# Spot-check a term whose accession also appears in the prevalence table preview.
term_dict["ARO:3001109"]

{'id': 'ARO:3001109',
 'name': 'SHV-52',
 'namespace': 'antibiotic_resistance',
 'def': 'SHV-52 is a beta-lactamase that has been found in clinical isolates. []',
 'is_a': ['ARO:3000015 ! SHV beta-lactamase']}

In [20]:
# Distinct pathogens and ARO accessions seen in the prevalence table; kept as
# notebook-level sets because the node files are built from them later.
resistances = set(prevalence["ARO Accession"])
pathogens = set(prevalence["Pathogen"])

# One quoted CSV line per prevalence row. Lines are gathered in a list and
# joined once: growing a string with `+=` inside the loop is quadratic in the
# number of rows.
pathogen_resistance_rows = ["pathogen,resistance,criteria,model_type\n"]

# Zipping the four needed columns avoids building a Series per row, which is
# what `iterrows()` does.
for pathogen, resistance, criteria, model_type in zip(
    prevalence["Pathogen"],
    prevalence["ARO Accession"],
    prevalence["Criteria"],
    prevalence["Model Type"],
):
    pathogen_resistance_rows.append(
        f'"{pathogen}","{resistance}","{criteria}","{model_type}"\n'
    )

pathogen_resistance = "".join(pathogen_resistance_rows)

# The context manager closes the file even if the write fails.
with open("data_for_neo4j/pathogen_resistance.csv", 'w') as output:
    output.write(pathogen_resistance)


In [22]:
def _term_csv_row(aro_id):
    """Return one quoted CSV line ``aro,name,definition,is_a`` for a term.

    The term is looked up in ``term_dict``. A term without a ``def`` or
    ``is_a`` tag is written with an empty definition / empty list rather than
    raising.

    Raises:
        KeyError: if ``aro_id`` is not in ``term_dict`` or the term has no name.
    """
    entry = term_dict[aro_id]
    name = entry['name']
    definition = entry.get('def', '')
    is_a = entry.get('is_a', [])
    return f'"{aro_id}","{name}","{definition}","{is_a}"\n'


TERM_HEADER = "aro,name,definition,is_a\n"
DRUG_CLASS_RELATION = "confers_resistance_to_drug_class"

# --- resistance -> drug class edges -----------------------------------------
drugs = set()
resistance_drug_rows = ["resistance,drug\n"]

for term_id, term in term_dict.items():
    for relation in term.get("relationship", []):
        if not relation.startswith(DRUG_CLASS_RELATION):
            continue
        # The value looks like "<relation> <ARO id> ! <label>"; keep only the id.
        drug_id = relation[len(DRUG_CLASS_RELATION):].split("!")[0].strip()
        resistance_drug_rows.append(f'"{term_id}","{drug_id}"\n')
        drugs.add(drug_id)

resistance_drug = "".join(resistance_drug_rows)
with open("data_for_neo4j/resistance_drug.csv", 'w') as output:
    output.write(resistance_drug)

# --- drug class nodes -------------------------------------------------------
# Sets have no stable iteration order; sorting keeps the files reproducible
# from run to run.
drug = TERM_HEADER + "".join(_term_csv_row(drug_id) for drug_id in sorted(drugs))
with open("data_for_neo4j/drug.csv", 'w') as output:
    output.write(drug)

# --- resistance nodes -------------------------------------------------------
# Each row is built from the resistance term's OWN record. Reading the
# definition and is_a from the leftover drug-loop variable would stamp every
# resistance with the last drug's values.
resistance = TERM_HEADER + "".join(
    _term_csv_row(res) for res in sorted(resistances, key=str)
)
with open("data_for_neo4j/resistance.csv", 'w') as output:
    output.write(resistance)

# --- pathogen nodes ---------------------------------------------------------
# key=str keeps the sort working if the column held a non-string value (e.g. NaN).
pathogen = "name\n" + "".join(f"{p}\n" for p in sorted(pathogens, key=str))
with open("data_for_neo4j/pathogen.csv", 'w') as output:
    output.write(pathogen)