In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from copy import deepcopy
import time
import random
from scipy import stats
from scipy.stats import mode
import seaborn as sns
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")
from scipy.stats import chisquare

## Extract potentiator hits for all antibiotics
#### Exclude hits between two unrelated antibiotics; these are handled in the co-linearity section

In [3]:
# Combine the sequential and simultaneous result tables into one table with
# one row per (position_i, position_j) pair.

pair_keys = ["position_i", "position_j"]
significance_cols = ["pval_beta_i_on_j", 'pval_beta_i_on_j_BH_sig']

all_hits_sequential = pd.read_csv(
    "output/04A.pair_annotation/results_allpairs_annotation_withcategories.csv",
    index_col=0,
)
all_simultaneous = pd.read_csv(
    "output/04B.pair_annotation/results_allpairs_annotation_withcategories.csv",
    index_col=0,
)

# Left-join each table with the other table's p-value / significance columns;
# the overlapping column names get the _sequential / _simultaneous suffixes.
sequential_merged = all_hits_sequential.merge(
    all_simultaneous[pair_keys + significance_cols],
    on=pair_keys,
    how="left",
    suffixes=["_sequential", "_simultaneous"],
)
simultaneous_merged = all_simultaneous.merge(
    all_hits_sequential[pair_keys + significance_cols],
    on=pair_keys,
    how="left",
    suffixes=["_simultaneous", "_sequential"],
)

# Stack both joins (simultaneous rows first) and keep the first row per pair.
all_hits = (
    pd.concat([simultaneous_merged, sequential_merged])
    .drop_duplicates(subset=pair_keys)
    .reset_index(drop=True)
)



In [4]:
# Hits with a known drug annotation for mutation i that are BH-significant in
# the sequential or the simultaneous table.
# .copy() makes this an independent frame, so adding a column below does not
# write to a slice of all_hits (SettingWithCopyWarning is silenced globally).
known_antibiotic_first = (
    all_hits.dropna(subset=["known_drug_i"])
    .query("pval_beta_i_on_j_BH_sig_sequential or pval_beta_i_on_j_BH_sig_simultaneous")
    .copy()
)

# num_dependent = n_muts_i_on_branch_j + n_muts_same_branch
known_antibiotic_first['num_dependent'] = (
    known_antibiotic_first.n_muts_i_on_branch_j + known_antibiotic_first.n_muts_same_branch
)

# Keep only rows without a possible_drug_i annotation and display the 40 rows
# with the largest num_dependent.
know_ab_first = known_antibiotic_first.loc[known_antibiotic_first.possible_drug_i.isnull(), :]
know_ab_first.sort_values('num_dependent', ascending=False).head(40)

Unnamed: 0,Unnamed: 0.1,position_i,gene_name_i,position_j,gene_name_j,n_mutations_i,n_mutations_j,n_branches_with_i,n_branches_with_j,n_muts_i_on_branch_j,...,in_antigen_epitope_j,in_antigen_protein_j,in_known_ABR_position_j,known_drug_j,in_possible_ABR_gene_j,possible_drug_j,annotation,pval_beta_i_on_j_sequential,pval_beta_i_on_j_BH_sig_sequential,num_dependent
679354,119471,1673425,,2338994,Rv2082,752,19,3817,32513,483,...,False,False,False,,False,,known_AB_second,1.979419e-25,True,483
677734,89080,1673425,,1340208,PPE18,752,29,3817,21626,369,...,False,True,False,,False,,known_AB_second,2.613815e-25,True,370
677498,81363,1673425,,340132,PPE3,752,134,3817,21827,341,...,False,True,False,,False,,known_AB_second,1.576695e-14,True,344
61929,18010,1673425,,3883626,Rv3466,752,405,3817,19374,316,...,False,False,False,,False,,known_AB_second,3.932368e-14,True,324
769002,1171597,1673425,,2338990,Rv2082,752,17,3817,16336,281,...,False,False,False,,False,,known_AB_second,1.788039e-16,True,281
75298,100874,1673425,,3894732,PPE60,752,59,3817,17091,214,...,False,True,False,,False,,known_AB_second,0.2167253,False,219
75457,102748,1673425,,3446699,Rv3081,752,81,3817,16441,204,...,False,False,False,,False,,known_AB_second,0.2750482,False,211
78372,136110,1673425,,1722228,pks5,752,8,3817,16173,202,...,False,False,False,,False,,known_AB_second,0.237247,False,205
74053,85353,1673425,,2197065,Rv1945,752,61,3817,8507,197,...,False,True,False,,False,,known_AB_second,3.5082210000000003e-22,True,200
66180,36598,1673425,,3730411,PPE54,752,347,3817,15646,164,...,False,True,False,,False,,known_AB_second,0.9748129,False,179


In [5]:
def _process_known_hits(input_list):
    """
    splits the possible associated drugs (string separated by ,) into a list
    """
    output_list = []
    for item in input_list:
        #print(item)
        if type(item) != str:
            output_list.append(np.nan)
        else:
            splitted = item.split(",")
            exclusive= list(set(splitted))
            output_list.append(exclusive)
    return output_list

## Note! This is code recycling: despite the variable name, the table actually
## contains hits where the known antibiotic resistance mutation was second
# .copy() makes this an independent frame, so the column assignments below do
# not write to a slice of all_hits (SettingWithCopyWarning is silenced globally).
known_antibiotic_first = (
    all_hits.dropna(subset=["known_drug_i"])
    .query("pval_beta_i_on_j_BH_sig_sequential or pval_beta_i_on_j_BH_sig_simultaneous")
    .copy()
)

# Replace each comma-separated drug string by a list of drug names
# (non-string entries become NaN, see _process_known_hits).
for drug_column in ["known_drug_i", "possible_drug_i", "known_drug_j", "possible_drug_j"]:
    known_antibiotic_first[drug_column] = _process_known_hits(known_antibiotic_first[drug_column])

print(len(known_antibiotic_first))

3704


In [6]:
# Split the hits into two groups by comparing the drug annotations of i and j:
#   antibiotic_indices         - j has no drug annotation, or shares a drug with i
#   between_antibiotic_indices - j has a drug annotation that shares nothing with i
between_antibiotic_indices = []
antibiotic_indices = []
same_drug_indices = []

known_antibiotic_first["within_same_drug"] = False

for idx, row in known_antibiotic_first.iterrows():
    drugs_first = set(row.known_drug_i)

    # Drugs for j: the known list takes precedence over the possible list;
    # None means j carries neither annotation.
    if type(row.known_drug_j) is list:
        drugs_second = set(row.known_drug_j)
    elif type(row.possible_drug_j) is list:
        drugs_second = set(row.possible_drug_j)
    else:
        drugs_second = None

    if drugs_second is None:
        antibiotic_indices.append(idx)
    elif drugs_first & drugs_second:
        antibiotic_indices.append(idx)
        same_drug_indices.append(idx)
    else:
        between_antibiotic_indices.append(idx)

# Flag the rows where i and j share at least one drug
if same_drug_indices:
    known_antibiotic_first.loc[same_drug_indices, "within_same_drug"] = True

between_antibiotic_hits = known_antibiotic_first.loc[between_antibiotic_indices]
antibiotic_hits = known_antibiotic_first.loc[antibiotic_indices]
    

## Compute per-drug output tables

In [7]:
## These will contain all hits except the between-antibiotic hits

## Collect, for every drug, the row positions of the hits annotated with it.
## A position is recorded once per side (i and j) on which the drug is listed,
## so a hit that lists the same drug on both sides appears twice in that
## drug's table.
## NOTE(review): this reproduces the previous row-by-row behaviour; confirm
## that the double entries are intended.
positions_by_drug = defaultdict(list)
drugs_per_hit = zip(antibiotic_hits["possible_drug_i"], antibiotic_hits["possible_drug_j"])
for position, drugs_both_sides in enumerate(drugs_per_hit):
    for drugs in drugs_both_sides:
        # Non-list entries (e.g. NaN) mean no drug annotation on that side
        if not isinstance(drugs, list):
            continue
        for drug in drugs:
            positions_by_drug[drug].append(position)

## Names of all drugs in the dataset, sorted so that the processing and
## printing order is the same on every run
drug_list = sorted(positions_by_drug)

## One dataframe per drug, selected in a single step. DataFrame.append was
## removed in pandas 2.0, and growing a frame row by row is quadratic.
## Rows keep the dtypes and column order of antibiotic_hits.
df_dict = {drug: antibiotic_hits.iloc[positions_by_drug[drug]] for drug in drug_list}

sequential_sig_col = "pval_beta_i_on_j_BH_sig_sequential"

for drug, df in df_dict.items():
    print(drug, len(df))
    df.sort_values("n_muts_i_on_branch_j", ascending=False).to_csv(f"output/05.antibiotic/{drug}_potentiator_antibiotic_hits.csv")

    ## Sequentially significant hits only. .eq(True) treats a missing value as
    ## not significant instead of failing on a NaN-containing boolean mask.
    df[df[sequential_sig_col].eq(True)].sort_values(
        "n_muts_i_on_branch_j", ascending=False
    ).to_csv(f"output/05.antibiotic/{drug}_potentiator_antibiotic_hits_sequential.csv")

PZA 872
RIF 426
EMB 257
ETH 172
LZD 194
MXF 239
AMI 339
LEV 239
KAN 335
STM 389
INH 127
CAP 238
