In [66]:
import pandas as pd
import os
from Bio import SeqIO

In [67]:
def parse_tblout(tblout_file):
    data = []
    with open(tblout_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line.startswith('#'):  # Skip comment lines
                columns = line.split()
                # Assuming you're interested in the first three columns (accession, E-value, bitscore)
                target_name = columns[0]
                query_name = columns[2]  # Assuming E-value is in the 5th column
                e_value = float(columns[12])  # Assuming bitscore is in the 6th column
                data.append([target_name, query_name, e_value])
    df = pd.DataFrame(data, columns=['target_name', 'query_name', 'e_value'])
    return df

In [68]:
amrfinder_arscan = parse_tblout('testing_outputs/hmmer/amrfinder_bacarscan.out')
keep_indexs = []
for query in amrfinder_arscan['query_name'].unique():
    subset = amrfinder_arscan.loc[amrfinder_arscan['query_name'] == query]
    index = subset['e_value'].idxmin()
    keep_indexs.append(index)
amrfinder_arscan_top = amrfinder_arscan[amrfinder_arscan.index.isin(keep_indexs)]
print(len(amrfinder_arscan_top))
print(len(amrfinder_arscan_top['target_name'].unique()))

20
17


In [69]:
card_arscan = parse_tblout('testing_outputs/hmmer/card_bacarscan.out')
keep_indexs = []
for query in card_arscan['query_name'].unique():
    subset = card_arscan.loc[card_arscan['query_name'] == query]
    index = subset['e_value'].idxmin()
    keep_indexs.append(index)
card_arscan_top = card_arscan[card_arscan.index.isin(keep_indexs)]
print(len(card_arscan_top))
print(len(card_arscan_top['target_name'].unique()))

369
136


In [70]:
resfinder_arscan = parse_tblout('testing_outputs/hmmer/resfinder_bacarscan.out')
for query in resfinder_arscan['query_name'].unique():
    subset = resfinder_arscan.loc[resfinder_arscan['query_name'] == query]
    index = subset['e_value'].idxmin()
    keep_indexs.append(index)
resfinder_arscan_top = resfinder_arscan[resfinder_arscan.index.isin(keep_indexs)]
print(len(resfinder_arscan_top))
print(len(resfinder_arscan_top['target_name'].unique()))

48
31


In [71]:
CARD_out_dir = 'testing_outputs/CARD/'

card_DFs = {}
for filename in os.listdir(CARD_out_dir):
    # Check if the entry is a file
    filepath = os.path.join(CARD_out_dir, filename)
    if os.path.isfile(filepath):
        # Process the file
        print("Processing file:", filename)
        name=filename.split('_')[0]
        card_out = pd.read_csv(filepath, sep='\t')
        card_out['Percentage Length of Reference Sequence'] = card_out['Percentage Length of Reference Sequence'].astype(float)
        car_out_pass = card_out.loc[card_out['Percentage Length of Reference Sequence'] >= 80]
        card_DFs[name]=car_out_pass

Processing file: barcode01_out.txt
Processing file: barcode02_out.txt
Processing file: barcode03_out.txt
Processing file: barcode04_out.txt
Processing file: barcode05_out.txt
Processing file: barcode06_out.txt
Processing file: barcode07_out.txt
Processing file: barcode08_out.txt
Processing file: barcode09_out.txt
Processing file: barcode10_out.txt
Processing file: barcode11_out.txt
Processing file: barcode12_out.txt


In [72]:
len(card_DFs['barcode01'])

4

In [73]:
updated_pass_card = {}
for name,df in card_DFs.items():
    arscan = []
    for ARO in df['ARO']:
        subset =card_arscan_top.loc[card_arscan_top['query_name'] == str(ARO)]
        if len(subset) > 0:
            arscan.append(subset.iloc[0]['target_name'])
        else:
            arscan.append('N/A')
    df['ARScan_Term'] = arscan
    updated_pass_card[name] = df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ARScan_Term'] = arscan


In [74]:
os.makedirs('Filtered_Card_Outputs', exist_ok=True)
for name,df in updated_pass_card.items():
    filename = name + '_filtered_card.csv'
    df.to_csv('Filtered_Card_Outputs/'+filename, index=False)

In [76]:
card_summary = pd.DataFrame()
all_arscan_terms = []
for name,df in updated_pass_card.items():
    for term in df['ARScan_Term']:
        all_arscan_terms.append(term)
unique_all_arscan_terms = list(set(all_arscan_terms))
card_summary['ARScan_Term'] = unique_all_arscan_terms
for name,df in updated_pass_card.items():
    count_data = []
    for term in card_summary['ARScan_Term']:  
        subset = df.loc[df['ARScan_Term'] == term]
        term_count =len(subset)
        count_data.append(term_count)
    card_summary[name] = count_data

In [78]:
card_summary.head()

Unnamed: 0,ARScan_Term,barcode01,barcode02,barcode03,barcode04,barcode05,barcode06,barcode07,barcode08,barcode09,barcode10,barcode11,barcode12
0,bAR1352,0,0,0,0,0,0,0,0,0,0,1,0
1,bAR1296,0,0,0,0,0,0,0,0,0,0,1,0
2,bAR1109,0,0,0,0,0,0,0,0,0,0,0,1
3,bAR1348,0,0,0,0,1,0,0,0,0,0,0,1
4,bAR1010,0,0,0,0,0,0,0,0,0,1,1,0


In [79]:
ARScan_reference_df = pd.read_excel('testing_outputs/hmmer/BacRAScan_Reference_DB.xlsx')

  for idx, row in parser.parse():


In [81]:
ARScan_reference_df.head()

Unnamed: 0,BacARscan ID,Corresponding Uniprot ID,AMR Gene Term,AMR Gene name,Antibiotic Resistant,Antibiotic Class,Antimicrobial Spectrum,AMR Definition,Resistance Mechanism,AMR Protein names,AMR families,AMR Function
0,bAR1099,P58777,cata1,cat,"Phenicol, Chloramphenicol","Antibiotics, Other",Broad-Spectrum,"Group A chloramphenicol acetyltransferase, whi...",Antibiotic Inactivation,Chloramphenicol acetyltransferase (CAT) (EC 2....,Chloramphenicol acetyltransferase family,This enzyme is an effector of chloramphenicol...
1,bAR1109,Q52424,aac2ia,aac,,Aminoglycosides,Broad-Spectrum,"Aminoglycoside N-acetyltransferase, which modi...",Antibiotic Inactivation,Aminoglycoside 2'-N-acetyltransferase (EC 2.3....,AAC(2')-I acetyltransferase family,Catalyzes the coenzyme A-dependent acetylatio...
2,bAR1114,P22782,cata13,,"Phenicol, Chloramphenicol\n","Antibiotics, Other",Broad-Spectrum,"Group A chloramphenicol acetyltransferase, wh...",Antibiotic Inactivation,Chloramphenicol acetyltransferase (CAT) (EC 2....,Chloramphenicol acetyltransferase family,This enzyme is an effector of chloramphenicol...
3,bAR1161,Q49157,aac2ib,aac,"6_n_netilmicin, dibekacin, gentamicin, netilmi...",Aminoglycosides,Broad-Spectrum,"Aminoglycoside Nacetyltransferase, which modif...",Antibiotic Inactivation,Aminoglycoside 2Nacetyltransferase (EC 2.3.1.)...,AAC(2)I acetyltransferase family,"Confers resistance to gentamicin, tobramycin,..."
4,bAR1208,P30180,aac3vii,aacC7,"neomycin, ribostamycin",Aminoglycosides,Broad-Spectrum,"Aminoglycoside Nacetyltransferase, which modif...",Antibiotic Inactivation,Aminoglycoside N(3)acetyltransferase VII (EC 2...,Antibiotic Nacetyltransferase family,Resistance to paromomycin.


In [84]:
Resistance = []
Class = []
AMR_Def = []
Res_Mec = []
Uniprot = []
for term in card_summary['ARScan_Term']:
    if term == 'N/A':
        Resistance.append('N/A')
        Class.append('N/A')
        AMR_Def.append('N/A')
        Res_Mec.append('N/A')
        Uniprot.append('N/A')
    else:
        subset = ARScan_reference_df.loc[ARScan_reference_df['BacARscan ID'] == term]
        if len(subset) > 0:
            Resistance.append(subset.iloc[0]['Antibiotic Resistant'])
            Class.append(subset.iloc[0]['Antibiotic Class'])
            AMR_Def.append(subset.iloc[0]['AMR Definition'])
            Res_Mec.append(subset.iloc[0]['Resistance Mechanism'])
            Uniprot.append(subset.iloc[0]['Corresponding Uniprot ID'])
        else:
            print('problem')
            print(subset)
            print(real_term)
            Resistance.append('N/A')
            Class.append('N/A')
            AMR_Def.append('N/A')
            Res_Mec.append('N/A')
            Uniprot.append('N/A')
card_summary['Antibiotic_Resistant'] = Resistance
card_summary['Antibiotic_class'] = Class
card_summary['AMR_Definition'] = AMR_Def
card_summary['Resistance_Mechanism'] = Res_Mec
card_summary['Corresponding_Uniprot ID'] = Uniprot

In [86]:
card_summary.to_csv('Card_BacARScan_summary.csv', index=False)

In [88]:
amrfinder_out_dir = 'testing_outputs/AMRFINDER/'

amrfinder_DFs = {}
for filename in os.listdir(amrfinder_out_dir):
    # Check if the entry is a file
    filepath = os.path.join(amrfinder_out_dir, filename)
    if os.path.isfile(filepath):
        # Process the file
        print("Processing file:", filename)
        name=filename.split('_')[0]
        amrfinder_out = pd.read_csv(filepath, sep='\t')
        amrfinder_out['% Coverage of reference sequence'] = amrfinder_out['% Coverage of reference sequence'].astype(float)
        amrfinder_out = amrfinder_out.loc[amrfinder_out['% Coverage of reference sequence'] >= 80]
        amrfinder_DFs[name]=amrfinder_out
    else:
            print('problem')
            print(subset)
            print(real_term)
            Resistance.append('N/A')
            Class.append('N/A')
            AMR_Def.append('N/A')
            Res_Mec.append('N/A')
            Uniprot.append('N/A')

Processing file: barcode01_AMRFinder.tsv
Processing file: barcode02_AMRFinder.tsv
Processing file: barcode03_AMRFinder.tsv
Processing file: barcode04_AMRFinder.tsv
Processing file: barcode05_AMRFinder.tsv
Processing file: barcode06_AMRFinder.tsv
Processing file: barcode07_AMRFinder.tsv
Processing file: barcode08_AMRFinder.tsv
Processing file: barcode09_AMRFinder.tsv
Processing file: barcode10_AMRFinder.tsv
Processing file: barcode11_AMRFinder.tsv
Processing file: barcode12_AMRFinder.tsv


In [91]:
updated_pass_amrfinder = {}
for name,df in amrfinder_DFs.items():
    arscan = []
    for Acc in df['Accession of closest sequence']:
        subset =amrfinder_arscan_top.loc[amrfinder_arscan_top['query_name'] == str(Acc)]
        if len(subset) > 0:
            arscan.append(subset.iloc[0]['target_name'])
        else:
            arscan.append('N/A')
    df['ARScan_Term'] = arscan
    updated_pass_amrfinder[name] = df

In [97]:
os.makedirs('Filtered_AMRFinder_Outputs', exist_ok=True)
for name,df in updated_pass_amrfinder.items():
    filename = name + '_filtered_armfinder.csv'
    df.to_csv('Filtered_AMRFinder_Outputs/'+filename, index=False)

In [98]:
amrfinder_summary = pd.DataFrame()
all_arscan_terms = []
for name,df in updated_pass_amrfinder.items():
    for term in df['ARScan_Term']:
        all_arscan_terms.append(term)
unique_all_arscan_terms = list(set(all_arscan_terms))
amrfinder_summary['ARScan_Term'] = unique_all_arscan_terms
for name,df in updated_pass_amrfinder.items():
    count_data = []
    for term in amrfinder_summary['ARScan_Term']:  
        subset = df.loc[df['ARScan_Term'] == term]
        term_count =len(subset)
        count_data.append(term_count)
    amrfinder_summary[name] = count_data

In [108]:
Resistance = []
Class = []
AMR_Def = []
Res_Mec = []
Uniprot = []
for term in amrfinder_summary['ARScan_Term']:
    if term == 'N/A':
        Resistance.append('N/A')
        Class.append('N/A')
        AMR_Def.append('N/A')
        Res_Mec.append('N/A')
        Uniprot.append('N/A')
    else:
        real_term = term.split('.')[0]
        subset = ARScan_reference_df.loc[ARScan_reference_df['BacARscan ID'] == real_term]
        if len(subset) > 0:
            Resistance.append(subset.iloc[0]['Antibiotic Resistant'])
            Class.append(subset.iloc[0]['Antibiotic Class'])
            AMR_Def.append(subset.iloc[0]['AMR Definition'])
            Res_Mec.append(subset.iloc[0]['Resistance Mechanism'])
            Uniprot.append(subset.iloc[0]['Corresponding Uniprot ID'])
        else:
            print('problem')
            print(subset)
            print(real_term)
            Resistance.append('N/A')
            Class.append('N/A')
            AMR_Def.append('N/A')
            Res_Mec.append('N/A')
            Uniprot.append('N/A')
amrfinder_summary['Antibiotic_Resistant'] = Resistance
amrfinder_summary['Antibiotic_class'] = Class
amrfinder_summary['AMR_Definition'] = AMR_Def
amrfinder_summary['Resistance_Mechanism'] = Res_Mec
amrfinder_summary['Corresponding_Uniprot ID'] = Uniprot

problem
Empty DataFrame
Columns: [BacARscan ID, Corresponding Uniprot ID, AMR Gene Term, AMR Gene name, Antibiotic Resistant, Antibiotic Class, Antimicrobial Spectrum, AMR Definition, Resistance Mechanism, AMR Protein names, AMR families, AMR Function]
Index: []
bAR1325


In [109]:
amrfinder_summary.to_csv('AMRFinder_BacARScan_summary.csv', index=False)

In [111]:
resfinder_out_dir = 'testing_outputs/Resfinder/'

resfinder_DFs = {}
for filename in os.listdir(resfinder_out_dir):
    # Check if the entry is a file
    filepath = os.path.join(resfinder_out_dir, filename)
    if os.path.isfile(filepath):
        # Process the file
        print("Processing file:", filename)
        name=filename.split('_')[0]
        resfinder_out = pd.read_csv(filepath, sep='\t')
        resfinder_out['Coverage'] = resfinder_out['Coverage'].astype(float)
        resfinder_out = resfinder_out.loc[resfinder_out['Coverage'] >= 80]
        resfinder_DFs[name]=resfinder_out

Processing file: barcode01_ResFinder_results_tab.txt
Processing file: barcode02_ResFinder_results_tab.txt
Processing file: barcode03_ResFinder_results_tab.txt
Processing file: barcode04_ResFinder_results_tab.txt
Processing file: barcode05_ResFinder_results_tab.txt
Processing file: barcode06_ResFinder_results_tab.txt
Processing file: barcode07_ResFinder_results_tab.txt
Processing file: barcode08_ResFinder_results_tab.txt
Processing file: barcode09_ResFinder_results_tab.txt
Processing file: barcode10_ResFinder_results_tab.txt
Processing file: barcode11_ResFinder_results_tab.txt
Processing file: barcode12_ResFinder_results_tab.txt


In [113]:
updated_pass_resfinder = {}
for name,df in resfinder_DFs.items():
    arscan = []
    for gene in df['Resistance gene']:
        subset =resfinder_arscan_top.loc[resfinder_arscan_top['query_name'] == str(gene)]
        if len(subset) > 0:
            arscan.append(subset.iloc[0]['target_name'])
        else:
            arscan.append('N/A')
    df['ARScan_Term'] = arscan
    updated_pass_resfinder[name] = df

In [114]:
os.makedirs('Filtered_Resfinder_Outputs', exist_ok=True)
for name,df in updated_pass_resfinder.items():
    filename = name + '_filtered_resfinder.csv'
    df.to_csv('Filtered_Resfinder_Outputs/'+filename, index=False)

In [115]:
resfinder_summary = pd.DataFrame()
all_arscan_terms = []
for name,df in updated_pass_resfinder.items():
    for term in df['ARScan_Term']:
        all_arscan_terms.append(term)
unique_all_arscan_terms = list(set(all_arscan_terms))
resfinder_summary['ARScan_Term'] = unique_all_arscan_terms
for name,df in updated_pass_resfinder.items():
    count_data = []
    for term in resfinder_summary['ARScan_Term']:  
        subset = df.loc[df['ARScan_Term'] == term]
        term_count =len(subset)
        count_data.append(term_count)
    resfinder_summary[name] = count_data

In [116]:
Resistance = []
Class = []
AMR_Def = []
Res_Mec = []
Uniprot = []
for term in resfinder_summary['ARScan_Term']:
    if term == 'N/A':
        Resistance.append('N/A')
        Class.append('N/A')
        AMR_Def.append('N/A')
        Res_Mec.append('N/A')
        Uniprot.append('N/A')
    else:
        real_term = term.split('.')[0]
        subset = ARScan_reference_df.loc[ARScan_reference_df['BacARscan ID'] == real_term]
        if len(subset) > 0:
            Resistance.append(subset.iloc[0]['Antibiotic Resistant'])
            Class.append(subset.iloc[0]['Antibiotic Class'])
            AMR_Def.append(subset.iloc[0]['AMR Definition'])
            Res_Mec.append(subset.iloc[0]['Resistance Mechanism'])
            Uniprot.append(subset.iloc[0]['Corresponding Uniprot ID'])
        else:
            print('problem')
            print(subset)
            print(real_term)
            Resistance.append('N/A')
            Class.append('N/A')
            AMR_Def.append('N/A')
            Res_Mec.append('N/A')
            Uniprot.append('N/A')
resfinder_summary['Antibiotic_Resistant'] = Resistance
resfinder_summary['Antibiotic_class'] = Class
resfinder_summary['AMR_Definition'] = AMR_Def
resfinder_summary['Resistance_Mechanism'] = Res_Mec
resfinder_summary['Corresponding_Uniprot ID'] = Uniprot

In [117]:
resfinder_summary.to_csv('Resfinder_BacARScan_summary.csv', index=False)