In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [2]:
files = glob.glob("analysis_potentiators/*hit_table.csv")

In [3]:
# Objective: drug, number hits, number non-cross hits, number known hits, number unknown hits

outlist = []

combined_significant_hits = pd.DataFrame()


for f in files:
    d = pd.read_csv(f, index_col=0, dtype={'significant': bool, "rs": str, "known_position":str,"test_position":str, 'cross':bool})
    drug = f.split("analysis_potentiators/")[-1].split("_hit_table.csv")[0]
    print(drug)
    
    if drug=="LEVOFLOXACIN":
        continue

    n_unique_test = len(d.test_position.unique())
    n_unique_known = len(d.known_position.unique())
    
    test_positions = d.query("rs==test_position or cross")
    n_unique_sequential = len(test_positions.query("type=='sequential'").test_position.unique())
    n_unique_potentiator = len(test_positions.query("type=='potentiator'").test_position.unique())
    
    sig = test_positions.query("significant")
    #break
    n_unique_test_sig =  len(sig.test_position.unique())

    n_unique_sequential_sig = len(sig.query("type=='sequential'").test_position.unique())
    n_unique_potentiator_sig = len(sig.query("type=='potentiator'").test_position.unique())

    n_unique_known_sig = len(d.query("significant and rs==known_position").known_position.unique())
    
    sig["drug"] = drug
    combined_significant_hits = pd.concat([combined_significant_hits, sig])
    
    sig["percent_joint"] = sig['N_cross']/sig["N_first"]

    outlist.append([
        drug,  n_unique_known, n_unique_known_sig, n_unique_test, n_unique_test_sig, 
        n_unique_sequential, n_unique_sequential_sig, n_unique_potentiator, n_unique_potentiator_sig,])
#    break
data = pd.DataFrame(outlist, columns=["drug", "n_unique_known", "n_unique_known_sig", "n_unique_test", "n_unique_test_sig",
                                      "n_unique_sequential", "n_unique_sequential_sig", 
                                      "n_unique_potentiator", "n_unique_potentiator_sig"])

data.to_csv("analysis_potentiators/N_hits_per_drug.csv")
data

KANAMYCIN
AMIKACIN
CAPREOMYCIN
MOXIFLOXACIN
ETHIONAMIDE
ISONIAZID
STREPTOMYCIN
PYRAZINAMIDE
RIFAMPICIN
ETHAMBUTOL


Unnamed: 0,drug,n_unique_known,n_unique_known_sig,n_unique_test,n_unique_test_sig,n_unique_sequential,n_unique_sequential_sig,n_unique_potentiator,n_unique_potentiator_sig
0,KANAMYCIN,10,3,152,25,16,1,133,24
1,AMIKACIN,10,1,159,24,16,0,139,24
2,CAPREOMYCIN,7,3,146,20,15,0,127,20
3,MOXIFLOXACIN,9,6,98,37,3,0,93,37
4,ETHIONAMIDE,20,3,83,7,16,1,60,6
5,ISONIAZID,7,3,178,7,88,3,96,4
6,STREPTOMYCIN,27,4,260,35,72,0,195,35
7,PYRAZINAMIDE,86,5,174,6,35,0,124,6
8,RIFAMPICIN,21,6,256,17,63,1,199,17
9,ETHAMBUTOL,11,4,182,19,100,2,81,17


## Add the number of phenotyped isolates per drug

In [5]:
drug_to_abbrev = {    
    "AMIKACIN":"AMI",
     "CAPREOMYCIN":"CAP",
     "ETHAMBUTOL":"EMB",
     "ETHIONAMIDE":"ETA",
     "ISONIAZID":"INH",
     "KANAMYCIN":"KAN",
     "LEVOFLOXACIN":"LEVO",
     "MOXIFLOXACIN":"MOXI",
     "PYRAZINAMIDE":"PZA",
     "RIFAMPICIN":"RIF",
     "STREPTOMYCIN":"STR"
}

mic_data = pd.read_csv("MIC_combined_data.csv")
data["N"] = np.nan

for idx,row in data.iterrows():
    drug = row.drug
    abbrev = drug_to_abbrev[drug]
    data.loc[idx, "N"] = sum(mic_data[f"{abbrev}_midpoint"] > -1)
    
data["percent_known_with_influence"] = (data.n_unique_known_sig )/data.n_unique_known
data["percent_test_with_influence"] = (data.n_unique_test_sig) / data.n_unique_test
data = data.query('drug != "LEVOFLOXACIN"')
data.median()

n_unique_known                    10.500000
n_unique_known_sig                 3.500000
n_unique_test                    166.500000
n_unique_test_sig                 19.500000
n_unique_sequential               25.500000
n_unique_sequential_sig            0.500000
n_unique_potentiator             125.500000
n_unique_potentiator_sig          18.500000
N                               1389.000000
percent_known_with_influence       0.292857
percent_test_with_influence        0.119505
dtype: float64

In [6]:
data.sort_values("n_unique_known", ascending=False).to_csv("Table1.csv")

In [7]:
# For each drug, what are the significant linear hits? 

mutation_identity = pd.read_csv("../output/03.annotation/snps_with_all_annotation.csv")
allinfo = mutation_identity

In [8]:
combined_significant_hits

Unnamed: 0,index,rs,first_mutation,second_mutation,n_miss,af,beta,p_wald,se,N_first,...,test_position,second_gene_name,known_position,first_gene_name,type,known,cross,color,significant,drug
39,39,732110.0,1473246.0,732110.0,45,0.003,1.161270,8.044625e-03,0.437459,170.0,...,732110.0,hadA,1473246.0,rrs,sequential,False,False,blue,True,KANAMYCIN
40,40,732110.0_1473246.0,1473246.0,732110.0,73,0.001,1.748137,9.378974e-03,0.671824,170.0,...,732110.0,hadA,1473246.0,rrs,sequential,False,True,red,True,KANAMYCIN
1,1,1473246.0_1287112.0,1287112.0,1473246.0,31,0.072,2.885623,7.620236e-177,0.085429,1208.0,...,1287112.0,rrs,1473246.0,,potentiator,False,True,blue,True,KANAMYCIN
2,2,1473246.0_2626108.0,2626108.0,1473246.0,41,0.069,2.824540,2.292357e-156,0.090923,1199.0,...,2626108.0,rrs,1473246.0,esxO,potentiator,False,True,blue,True,KANAMYCIN
3,3,1473246.0_2626191.0,2626191.0,1473246.0,54,0.067,2.751289,3.730451e-133,0.098416,1187.0,...,2626191.0,rrs,1473246.0,,potentiator,False,True,blue,True,KANAMYCIN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,29,4247429.0_1722228.0,1722228.0,4247429.0,25,0.021,0.428620,1.520378e-03,0.134928,222.0,...,1722228.0,embB,4247429.0,pks5,potentiator,False,True,blue,True,ETHAMBUTOL
27,31,4248003.0_2626189.0,2626189.0,4248003.0,47,0.014,0.419972,2.157274e-03,0.136662,1426.0,...,2626189.0,embB,4248003.0,,potentiator,False,True,blue,True,ETHAMBUTOL
28,32,4248003.0_2626191.0,2626191.0,4248003.0,46,0.014,0.417474,2.290636e-03,0.136653,1429.0,...,2626191.0,embB,4248003.0,,potentiator,False,True,blue,True,ETHAMBUTOL
29,33,4247730.0_27455.0,27455.0,4247730.0,54,0.001,1.410197,4.182436e-03,0.491614,58.0,...,27455.0,embB,4247730.0,,potentiator,False,True,blue,True,ETHAMBUTOL


In [17]:
combined_significant_hits["test_position"] = combined_significant_hits["test_position"].astype(float).astype(int)
merged = allinfo.merge(combined_significant_hits, how="inner", left_on="pos", right_on='test_position')
cols = ['rs', 'test_position', 'known_position', 'beta', 'p_wald',
        'N_i', 'N_j', 'N_ij', 'first_mutation', 'second_mutation', 'first_gene_name_i', 'second_gene_name_i', 
        'drug',
        'SNP type', 'AA change']

all_hits = pd.DataFrame()

for drug in combined_significant_hits.drug.unique():
    print(drug)
    of_interest = merged.query('drug==@drug').drop_duplicates()
    #print(of_interest)
    all_hits = pd.concat([all_hits, of_interest])
    #print(merged.query('drug_x==@drug')[cols].drop_duplicates())


KANAMYCIN
AMIKACIN
CAPREOMYCIN
MOXIFLOXACIN
ETHIONAMIDE
ISONIAZID
STREPTOMYCIN
PYRAZINAMIDE
RIFAMPICIN
ETHAMBUTOL


In [10]:
all_hits.to_csv("combined_GEMMA_results_potentiators.csv")

# Construct a table sumamrizing heritability

In [11]:
pve_all = pd.read_csv("PVE_all_MAF0p001.csv")
pve_all = pve_all.query("drug != 'LEVOFLOXACIN'")


In [12]:
pve_homoplasy = pd.read_csv("PVE_homoplasic_sites..csv", index_col=0)
pve_homoplasy = pve_homoplasy.query("drug != 'LEVOFLOXACIN'")
pve_homoplasy.median()

pve          0.641346
se_pve       0.033014
N_snps    1967.000000
dtype: float64

In [13]:
pve_known = pd.read_csv("PVE_ABR_possible.csv", index_col=0)
pve_known = pve_known.query("drug != 'LEVOFLOXACIN'")
pve_known.median()

pve        0.317487
se_pve     0.046780
N_snps    84.500000
dtype: float64

In [14]:
pve_pairs = pd.read_csv("PVE_ABR_dep_sites.csv", index_col=0)
pve_pairs = pve_pairs.query("drug != 'LEVOFLOXACIN'")
pve_pairs.median()

pve         0.509202
se_pve      0.040565
N_snps    319.500000
dtype: float64

In [15]:
pve_pairs_potentiators = pd.read_csv("PVE_ABR_dep_sites_with_potentiators.csv", index_col=0)
pve_pairs_potentiators = pve_pairs_potentiators.query("drug != 'LEVOFLOXACIN'")
pve_pairs_potentiators.median()

pve         0.562946
se_pve      0.042540
N_snps    609.500000
dtype: float64

In [16]:
heritability_table = pve_homoplasy.merge(pve_known, on="drug", suffixes=["_homoplastic","_antibiotic_sites"])
heritability_table = heritability_table.merge(pve_pairs, on="drug")
heritability_table = heritability_table.merge(pve_pairs_potentiators, on="drug", suffixes =["_pairs", "_pairs_and_potentiators"])
heritability_table.to_csv("heritability_combined_table.csv")