Build a dataframe of all CPOs/BLEEs detected (seqID >= 99%) by at least one of the methods.

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import datetime
from plotnine import *
import matplotlib.pyplot as plt

In [3]:
# Load the reference tables and extract the gene names used for classification.
# BLEE reference list (tab-separated file, names in the 'blee_modified' column).
data_BLEE = pd.read_csv("../reports_modified/20240805eq_BLEE.tsv", sep="\t")
list_BLEE = data_BLEE['blee_modified'].tolist()

# CPO reference list (tab-separated file, names in the 'cpo_modified' column).
data_CPO = pd.read_csv("../reports_modified/20240805eq_CPO.tsv", sep="\t")
list_CPO = data_CPO['cpo_modified'].tolist()

In [4]:
# Define function to determine the gene type (BLEE, CPO, Other).
def determine_gene_type(gene):
    """Classify a gene name against the reference lists.

    Parameters
    ----------
    gene
        Gene name to look up in the module-level ``list_BLEE`` and
        ``list_CPO`` reference lists.

    Returns
    -------
    str
        ``"BLEE"`` if the gene is in ``list_BLEE``, ``"CPO"`` if it is in
        ``list_CPO``, otherwise ``"Other"``. The BLEE list is checked first,
        so a gene present in both lists is reported as ``"BLEE"``.
    """
    if gene in list_BLEE:
        return "BLEE"
    if gene in list_CPO:
        return "CPO"
    return "Other"

2. Open the reports filtered for sequence identity >= 99% and with the BLEE/CPO column added (opened as a dictionary; then align global variables with the dictionary's items).

In [5]:
# Load every filtered report into a dictionary keyed by "<tool>_<database>".
filtered_reports = {}
list_of_reports = [
    "ariba_CARD",
    "ariba_NCBI",
    "ariba_RESFINDER",
    "abricate_CARD",
    "abricate_NCBI",
    "abricate_RESFINDER",
    "amrfinderplus_NCBI",
    "rgi_CARD",
]
print("opening files:")
for f in list_of_reports:
    # Pass the directory and the file name as separate components so that
    # os.path.join actually assembles the path; with a single pre-concatenated
    # string the call does nothing. The file name is the date prefix
    # immediately followed by the report key, e.g. "20240803ariba_CARD.tsv".
    filtered_reports[f] = pd.read_csv(
        os.path.join("../reports_filtered99", "20240803" + f + ".tsv"),
        sep="\t",
    )
    # Printed after the read, so a name only appears once its file has loaded.
    print(f)


opening files:
ariba_CARD
ariba_NCBI
ariba_RESFINDER
abricate_CARD
abricate_NCBI
abricate_RESFINDER
amrfinderplus_NCBI
rgi_CARD


In [6]:
# Annotate each report in place: classify every row's gene and record which
# tool/database combination the report came from.
for tool_name in filtered_reports:
    report = filtered_reports[tool_name]
    report['gene_type'] = report['gene_symbol_modified'].apply(determine_gene_type)
    report['Tool'] = tool_name

In [7]:
# Collect the per-tool dataframes (in dictionary insertion order) into a list
# and preview the first one.
list_of_reports = list(filtered_reports.values())
list_of_reports[0].head()


Unnamed: 0,input_file_name,gene_symbol,gene_name,reference_database_name,reference_database_version,reference_accession,analysis_software_name,analysis_software_version,genetic_variation_type,antimicrobial_agent,...,reference_gene_stop,reference_protein_length,reference_protein_start,reference_protein_stop,resistance_mechanism,strand_orientation,sequence_identity,gene_symbol_modified,gene_type,Tool
0,A0101KPN,AAC_3__IIe,AAC_3__IIe.3004621.EU022315.1.0_861.5340,card,2.14.6,AAC_3__IIe.3004621.EU022315.1.0_861.5340,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.54,AAC_3__IIe,Other,ariba_CARD
1,A0101KPN,AAC_6___Ib_cr7,AAC_6___Ib_cr7.3005117.JABGAB010000032.1.11074...,card,2.14.6,AAC_6___Ib_cr7.3005117.JABGAB010000032.1.11074...,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.82,AAC_6___Ib_cr7,Other,ariba_CARD
2,A0101KPN,APH_3____Ib,APH_3____Ib.3002639.AF313472.2.15593_16397.8275,card,2.14.6,APH_3____Ib.3002639.AF313472.2.15593_16397.8275,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.75,APH_3____Ib,Other,ariba_CARD
3,A0101KPN,APH_6__Id,APH_6__Id.3002660.AF024602.1.3155_3992.467,card,2.14.6,APH_6__Id.3002660.AF024602.1.3155_3992.467,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.88,APH_6__Id,Other,ariba_CARD
4,A0101KPN,ArnT,ArnT.3005053.FO834906.1.304977_306633.6090,card,2.14.6,ArnT.3005053.FO834906.1.304977_306633.6090,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.28,ArnT,Other,ariba_CARD


In [9]:
# Stack all per-tool reports into one dataframe with a fresh 0..n-1 index.
all_reports = pd.concat(list_of_reports, ignore_index=True)
all_reports.head()

Unnamed: 0,input_file_name,gene_symbol,gene_name,reference_database_name,reference_database_version,reference_accession,analysis_software_name,analysis_software_version,genetic_variation_type,antimicrobial_agent,...,reference_gene_stop,reference_protein_length,reference_protein_start,reference_protein_stop,resistance_mechanism,strand_orientation,sequence_identity,gene_symbol_modified,gene_type,Tool
0,A0101KPN,AAC_3__IIe,AAC_3__IIe.3004621.EU022315.1.0_861.5340,card,2.14.6,AAC_3__IIe.3004621.EU022315.1.0_861.5340,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.54,AAC_3__IIe,Other,ariba_CARD
1,A0101KPN,AAC_6___Ib_cr7,AAC_6___Ib_cr7.3005117.JABGAB010000032.1.11074...,card,2.14.6,AAC_6___Ib_cr7.3005117.JABGAB010000032.1.11074...,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.82,AAC_6___Ib_cr7,Other,ariba_CARD
2,A0101KPN,APH_3____Ib,APH_3____Ib.3002639.AF313472.2.15593_16397.8275,card,2.14.6,APH_3____Ib.3002639.AF313472.2.15593_16397.8275,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.75,APH_3____Ib,Other,ariba_CARD
3,A0101KPN,APH_6__Id,APH_6__Id.3002660.AF024602.1.3155_3992.467,card,2.14.6,APH_6__Id.3002660.AF024602.1.3155_3992.467,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.88,APH_6__Id,Other,ariba_CARD
4,A0101KPN,ArnT,ArnT.3005053.FO834906.1.304977_306633.6090,card,2.14.6,ArnT.3005053.FO834906.1.304977_306633.6090,ariba,1.1.4,gene_presence_detected,,...,,,,,,,99.28,ArnT,Other,ariba_CARD


In [10]:
# Keep only rows classified as BLEE or CPO (drop everything labelled "Other")
# and renumber the remaining rows from zero.
all_reports_blee_cpo = (
    all_reports
    .loc[all_reports['gene_type'].ne("Other")]
    .reset_index(drop=True)
)
print(f"total number of genes: {len(all_reports)}, number of BLEEs and CPOs: {len(all_reports_blee_cpo)}")

total number of genes: 9347, number of BLEEs and CPOs: 1972


In [18]:
# Write the merged BLEE/CPO table as a tab-separated file, without the index.
all_reports_blee_cpo.to_csv(
    "../output_dir/merged_df_blees_cpos_f99.tsv",
    sep="\t",
    index=False,
)