The following script in Python unifies the different gene symbols returned by each software and database.       
        
Gene symbols were unified with a given list of BLEEs and Carbapenemases genes (BLEES and CPO, respectively).        
        
The gene symbols had previously been modified in Excel for visual simplicity. The modifications were:
 1) Removal of the string "bla" (beta-lactamase) at the beginning of the gene symbol, since these characters appear in some tools' and databases' outputs but not in others.
 2) Replacement of the hyphen "-" with an underscore "_", since the two characters were used inconsistently among the tools' and databases' outputs.
    

In [1]:
import os
import pandas as pd
import openpyxl
import json
import datetime


In [9]:
def trim_sample_name(data, column_to_trim, part_to_trim):
    """Strip a fixed piece of text from every sample name in a column.

    The sample names in the reports originally come with a file ending such
    as "_report.tsv" attached; this removes it.

    Args:
        data: DataFrame holding the column to clean. It is modified in place.
        column_to_trim: Name of the column containing the sample names.
        part_to_trim: Text to delete. It is matched literally (not as a
            regular expression) and every occurrence is removed.

    Returns:
        The same ``data`` object, with ``column_to_trim`` overwritten.
    """
    cleaned_names = data[column_to_trim].str.replace(part_to_trim, "", regex=False)
    data[column_to_trim] = cleaned_names
    return data

def apply_equivalences(df, dict_equivalences):
    """Add a "gene_symbol_modified" column holding the unified gene symbols.

    Every value of the "gene_symbol" column is looked up in
    ``dict_equivalences``. Symbols that have no entry, or whose entry is
    missing (NaN), keep their original value.

    Args:
        df: Report DataFrame with a "gene_symbol" column. The new column is
            written onto this object in place.
        dict_equivalences: Mapping from reported symbol to unified symbol.

    Returns:
        A DataFrame with the same columns as ``df`` and
        "gene_symbol_modified" placed third (or last when the frame has
        fewer than two other columns). ``df`` itself is returned when its
        columns are already in that order.

    Raises:
        KeyError: If ``df`` has no "gene_symbol" column.
    """
    new_column = "gene_symbol_modified"
    already_present = new_column in df.columns

    df[new_column] = df["gene_symbol"].map(dict_equivalences).fillna(df["gene_symbol"])
    if already_present:
        print("the column gene_symbol_modified has already been added for this dataframe")

    # Build the target order from the columns other than the new one, so a
    # frame that already carries the column (anywhere) never ends up with it
    # listed twice. list.insert tolerates an index past the end, so narrow
    # frames are handled as well.
    ordered = [name for name in df.columns if name != new_column]
    ordered.insert(2, new_column)
    if list(df.columns) == ordered:
        return df
    return df[ordered]

# The ABRicate-ResFinder output differs slightly from the ABRicate-card/ncbi
# output: the gene symbol is stored in the column "gene_name", so it needs its
# own function to apply the equivalences.
def apply_equivalences2(df, dict_equivalences):
    """Add a "gene_symbol_modified" column derived from the "gene_name" column.

    Every value of "gene_name" is looked up in ``dict_equivalences``. Symbols
    that have no entry, or whose entry is missing (NaN), keep their original
    value.

    Args:
        df: Report DataFrame with a "gene_name" column. The new column is
            written onto this object in place.
        dict_equivalences: Mapping from reported symbol to unified symbol.

    Returns:
        A DataFrame with the same columns as ``df`` and
        "gene_symbol_modified" placed third (or last when the frame has
        fewer than two other columns). ``df`` itself is returned when its
        columns are already in that order.

    Raises:
        KeyError: If ``df`` has no "gene_name" column.
    """
    new_column = "gene_symbol_modified"
    already_present = new_column in df.columns

    df[new_column] = df["gene_name"].map(dict_equivalences).fillna(df["gene_name"])
    if already_present:
        print("the column gene_symbol_modified has already been added for this dataframe")

    # Build the target order from the columns other than the new one, so a
    # frame that already carries the column (anywhere) never ends up with it
    # listed twice. list.insert tolerates an index past the end, so narrow
    # frames are handled as well.
    ordered = [name for name in df.columns if name != new_column]
    ordered.insert(2, new_column)
    if list(df.columns) == ordered:
        return df
    return df[ordered]

In [2]:

# Load the equivalence tables that were previously prepared in Excel.
file_path = "equivalencias.xlsx"
data_equivalences = pd.read_excel(file_path, sheet_name="equivalencias")

# Each tool and each reference list contributes a pair of columns: the
# "unmodified" column is used as keys and the "modified" column as values
# when the equivalence dictionaries are built.
keys_ariba, values_ariba = (
    data_equivalences["all_unmodified_ARIBA"],
    data_equivalences["all_modified_ARIBA"],
)
keys_abricate, values_abricate = (
    data_equivalences["all_unmodified_ABRICATE"],
    data_equivalences["all_modified_ABRICATE"],
)
keys_amrfinderplus, values_amrfinderplus = (
    data_equivalences["all_unmodified_AMRFINDERPLUS"],
    data_equivalences["all_modified_AMRFINDERPLUS"],
)
keys_rgi, values_rgi = (
    data_equivalences["all_unmodified_RGI"],
    data_equivalences["all_modified_RGI"],
)

# Reference lists of BLEE and CPO genes.
keys_BLEE, values_BLEE = (
    data_equivalences["BLEE_unmodified"],
    data_equivalences["BLEE_modified"],
)
keys_CPO, values_CPO = (
    data_equivalences["CPO_unmodified"],
    data_equivalences["CPO_modified"],
)



In [3]:

# Create the dictionaries of equivalences for every tool and reference list.
def _build_equivalences(keys, values):
    """Pair each key with its value, skipping rows whose key is missing.

    When the column pairs read from the same sheet differ in length, the
    shorter ones are padded with NaN. Without the filter those padding rows
    would add a NaN entry to the dictionary (and, for BLEE/CPO, to the
    reference lists later built from it).
    """
    return {key: value for key, value in zip(keys, values) if pd.notna(key)}


dict_eq_ariba = _build_equivalences(keys_ariba, values_ariba)
dict_eq_abricate = _build_equivalences(keys_abricate, values_abricate)
dict_eq_amrfinderplus = _build_equivalences(keys_amrfinderplus, values_amrfinderplus)
dict_eq_rgi = _build_equivalences(keys_rgi, values_rgi)

dict_eq_BLEE = _build_equivalences(keys_BLEE, values_BLEE)
dict_eq_CPO = _build_equivalences(keys_CPO, values_CPO)

In [5]:
# Read every hAMRonization report (tab-separated) into its own DataFrame.
path_hamronization_data = "../20240701hAMRonization"

# ARIBA: one report per database.
ariba_CARD = pd.read_csv(f"{path_hamronization_data}/ariba/hamronized_ariba_CARD.tsv", sep="\t")
ariba_NCBI = pd.read_csv(f"{path_hamronization_data}/ariba/hamronized_ariba_NCBI.tsv", sep="\t")
ariba_RESFINDER = pd.read_csv(f"{path_hamronization_data}/ariba/hamronized_ariba_RESFINDER.tsv", sep="\t")

# ABRicate: one combined report per database.
abricate_CARD = pd.read_csv(
    f"{path_hamronization_data}/abricate/ABRICATE_hamronization_combined_report_card.tsv", sep="\t"
)
abricate_NCBI = pd.read_csv(
    f"{path_hamronization_data}/abricate/ABRICATE_hamronization_combined_report_ncbi.tsv", sep="\t"
)
abricate_RESFINDER = pd.read_csv(
    f"{path_hamronization_data}/abricate/ABRICATE_hamronization_combined_report_resfinder.tsv", sep="\t"
)

# AMRFinderPlus (NCBI) and RGI (CARD): a single combined report each.
amrfinderplus_NCBI = pd.read_csv(
    f"{path_hamronization_data}/amrfinderplus/amrfinderplus_hamronization_combined_report_ncbi.tsv", sep="\t"
)

rgi_CARD = pd.read_csv(
    f"{path_hamronization_data}/rgi/RGI_hamronization_combined_report_card.tsv", sep="\t"
)


In [8]:
# Group the reports that belong to the same tool. These dictionaries are only
# used to store the variable names and to iterate over them; the modifications
# themselves are applied to the global variables (ariba_CARD, ariba_NCBI, etc.).
reports_ariba = dict(
    ariba_CARD=ariba_CARD,
    ariba_NCBI=ariba_NCBI,
    ariba_RESFINDER=ariba_RESFINDER,
)

reports_abricate = dict(
    abricate_CARD=abricate_CARD,
    abricate_NCBI=abricate_NCBI,
    abricate_RESFINDER=abricate_RESFINDER,
)

# Every report, in the order ARIBA, ABRicate, AMRFinderPlus, RGI.
all_reports = {
    **reports_ariba,
    **reports_abricate,
    "amrfinderplus_NCBI": amrfinderplus_NCBI,
    "rgi_CARD": rgi_CARD,
}


In [11]:
# For every report:
#   1. Trim the trailing "_report.tsv", ".txt.rgi", etc. that accompanies the
#      sample name in the "input_file_name" column. The text differs between
#      tools, so the part to trim is given per tool.
#   2. Add the column with the modified gene symbols, using the equivalence
#      dictionary of the corresponding tool.
# The result is bound to the global variable named after each dictionary key.

for key, value in reports_ariba.items():
    globals()[key] = apply_equivalences(
        trim_sample_name(value, "input_file_name", "_report.tsv"),
        dict_eq_ariba,
    )

for key, value in reports_abricate.items():
    if key == "abricate_RESFINDER":
        # The ResFinder report goes through the "gene_name"-based variant.
        globals()[key] = apply_equivalences2(
            trim_sample_name(value, "input_file_name", "_report.tsv"),
            dict_eq_abricate,
        )
    else:
        globals()[key] = apply_equivalences(
            trim_sample_name(value, "input_file_name", "_report.tsv"),
            dict_eq_abricate,
        )

rgi_CARD = apply_equivalences(
    trim_sample_name(rgi_CARD, "input_file_name", ".txt.rgi"),
    dict_eq_rgi,
)
amrfinderplus_NCBI = apply_equivalences(
    trim_sample_name(amrfinderplus_NCBI, "input_file_name", ".tsv.amrfinderplus"),
    dict_eq_amrfinderplus,
)

In [20]:
# Keep a single row per sample ("input_file_name") and unified gene symbol
# ("gene_symbol_modified") in every report.
# The frames are taken from the global variables, not from the values stored in
# all_reports: the unification step rebinds each global to the frame with the
# reordered columns, while the dictionary still points at the frame objects it
# was built with. Reading the dictionary values here would overwrite the
# globals with frames that lack that column reordering.
for key in all_reports:
    all_reports[key] = globals()[key].drop_duplicates(subset=["input_file_name", "gene_symbol_modified"])
    globals()[key] = all_reports[key]

In [13]:
# Turn the BLEE and CPO equivalence dictionaries into two-column tables:
# one row per (unmodified, modified) pair.
blee_pairs = list(dict_eq_BLEE.items())
cpo_pairs = list(dict_eq_CPO.items())

df_eq_BLEE = pd.DataFrame(blee_pairs, columns=["blee_unmodified", "blee_modified"])
df_eq_CPO = pd.DataFrame(cpo_pairs, columns=["cpo_unmodified", "cpo_modified"])

In [21]:
# Directory that receives the reports with the modified gene column, and the
# date (YYYYMMDD) used as prefix of every output file name.
path_modified_reports = "reports_modified"
currentdate = datetime.datetime.now().strftime("%Y%m%d")

# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking for the directory and creating it.
os.makedirs(path_modified_reports, exist_ok=True)

# Save every report as "<date><report name>_modified.tsv". The frames are read
# from the global variables because that is where the modified versions live;
# the dictionaries only provide the names.
report_names = list(reports_ariba) + list(reports_abricate) + ["rgi_CARD", "amrfinderplus_NCBI"]
for key in report_names:
    globals()[key].to_csv(
        os.path.join(path_modified_reports, currentdate + key + "_modified.tsv"),
        index=False,
        sep="\t",
    )


In [23]:
# Save the BLEE and CPO reference lists next to the modified reports, using the
# same date prefix. The directory and the file name are passed to os.path.join
# as separate components so the join actually builds the path.
df_eq_BLEE.to_csv(os.path.join(path_modified_reports, currentdate + "eq_BLEE.tsv"), index=False, sep="\t")
df_eq_CPO.to_csv(os.path.join(path_modified_reports, currentdate + "eq_CPO.tsv"), index=False, sep="\t")