In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [2]:
files = glob.glob("analysis_potentiators/*hit_table.csv")

In [3]:
# Objective: drug, number hits, number non-cross hits, number known hits, number unknown hits

outlist = []

combined_significant_hits = pd.DataFrame()


for f in files:
    d = pd.read_csv(f, index_col=0, dtype={'significant': bool, "rs": str, "known_position":str,"test_position":str, 'cross':bool})
    d = d.drop_duplicates("rs")
    drug = f.split("analysis_potentiators/")[-1].split("_hit_table.csv")[0]
    print(drug)
    
    if drug=="LEVOFLOXACIN":
        continue

    n_unique_test = len(d.test_position.unique())
    n_unique_known = len(d.known_position.unique())
    
    test_positions = d.query("rs==test_position or cross")
    n_unique_sequential = len(test_positions.query("type=='sequential'").test_position.unique())
    n_unique_potentiator = len(test_positions.query("type=='potentiator'").test_position.unique())
    
    sig = test_positions.query("significant")
    #break
    n_unique_test_sig =  len(sig.test_position.unique())

    n_unique_sequential_sig = len(sig.query("type=='sequential'").test_position.unique())
    n_unique_potentiator_sig = len(sig.query("type=='potentiator'").test_position.unique())

    n_unique_known_sig = len(d.query("significant and rs==known_position").known_position.unique())
    
    sig["drug"] = drug
    combined_significant_hits = pd.concat([combined_significant_hits, sig])
    
    sig["percent_joint"] = sig['N_cross']/sig["N_first"]

    outlist.append([
        drug,  n_unique_known, n_unique_known_sig, n_unique_test, n_unique_test_sig, 
        n_unique_sequential, n_unique_sequential_sig, n_unique_potentiator, n_unique_potentiator_sig,])
    
#    break
data = pd.DataFrame(outlist, columns=["drug", "n_unique_known", "n_unique_known_sig", "n_unique_test", "n_unique_test_sig",
                                      "n_unique_sequential", "n_unique_sequential_sig", 
                                      "n_unique_potentiator", "n_unique_potentiator_sig"])

data.to_csv("analysis_potentiators/N_hits_per_drug.csv")
data

KANAMYCIN
AMIKACIN
CAPREOMYCIN
MOXIFLOXACIN
ETHIONAMIDE
ISONIAZID
STREPTOMYCIN
PYRAZINAMIDE
RIFAMPICIN
ETHAMBUTOL


Unnamed: 0,drug,n_unique_known,n_unique_known_sig,n_unique_test,n_unique_test_sig,n_unique_sequential,n_unique_sequential_sig,n_unique_potentiator,n_unique_potentiator_sig
0,KANAMYCIN,10,3,142,26,22,1,119,25
1,AMIKACIN,11,1,157,16,26,0,133,16
2,CAPREOMYCIN,9,4,136,38,21,2,116,37
3,MOXIFLOXACIN,9,6,92,27,1,0,88,27
4,ETHIONAMIDE,24,3,95,7,19,3,76,6
5,ISONIAZID,7,3,270,8,160,3,122,5
6,STREPTOMYCIN,32,5,317,36,109,0,241,36
7,PYRAZINAMIDE,88,3,232,6,38,0,187,6
8,RIFAMPICIN,22,6,388,14,178,1,271,14
9,ETHAMBUTOL,11,3,221,17,127,2,106,15


## Add the number of phenotyped isolates per drug

In [4]:
drug_to_abbrev = {    
    "AMIKACIN":"AMI",
     "CAPREOMYCIN":"CAP",
     "ETHAMBUTOL":"EMB",
     "ETHIONAMIDE":"ETA",
     "ISONIAZID":"INH",
     "KANAMYCIN":"KAN",
     "LEVOFLOXACIN":"LEVO",
     "MOXIFLOXACIN":"MOXI",
     "PYRAZINAMIDE":"PZA",
     "RIFAMPICIN":"RIF",
     "STREPTOMYCIN":"STR"
}

mic_data = pd.read_csv("MIC_combined_data.csv")
data["N"] = np.nan

for idx,row in data.iterrows():
    drug = row.drug
    abbrev = drug_to_abbrev[drug]
    data.loc[idx, "N"] = sum(mic_data[f"{abbrev}_midpoint"] > -1)
    
data["percent_known_with_influence"] = (data.n_unique_known_sig ) /data.n_unique_known
data["percent_test_with_influence"] = (data.n_unique_test_sig) / data.n_unique_test
data = data.query('drug != "LEVOFLOXACIN"')
data.median()

n_unique_known                    11.000000
n_unique_known_sig                 3.000000
n_unique_test                    189.000000
n_unique_test_sig                 16.500000
n_unique_sequential               32.000000
n_unique_sequential_sig            1.000000
n_unique_potentiator             120.500000
n_unique_potentiator_sig          15.500000
N                               1389.000000
percent_known_with_influence       0.272727
percent_test_with_influence        0.089417
dtype: float64

In [5]:
data

Unnamed: 0,drug,n_unique_known,n_unique_known_sig,n_unique_test,n_unique_test_sig,n_unique_sequential,n_unique_sequential_sig,n_unique_potentiator,n_unique_potentiator_sig,N,percent_known_with_influence,percent_test_with_influence
0,KANAMYCIN,10,3,142,26,22,1,119,25,1348.0,0.3,0.183099
1,AMIKACIN,11,1,157,16,26,0,133,16,1346.0,0.090909,0.101911
2,CAPREOMYCIN,9,4,136,38,21,2,116,37,1176.0,0.444444,0.279412
3,MOXIFLOXACIN,9,6,92,27,1,0,88,27,1217.0,0.666667,0.293478
4,ETHIONAMIDE,24,3,95,7,19,3,76,6,1430.0,0.125,0.073684
5,ISONIAZID,7,3,270,8,160,3,122,5,1853.0,0.428571,0.02963
6,STREPTOMYCIN,32,5,317,36,109,0,241,36,1835.0,0.15625,0.113565
7,PYRAZINAMIDE,88,3,232,6,38,0,187,6,1266.0,0.034091,0.025862
8,RIFAMPICIN,22,6,388,14,178,1,271,14,1905.0,0.272727,0.036082
9,ETHAMBUTOL,11,3,221,17,127,2,106,15,1849.0,0.272727,0.076923


In [6]:
3/11

0.2727272727272727

In [7]:
data.sort_values("n_unique_known", ascending=False).to_csv("Table1.csv")

In [8]:
# For each drug, what are the significant linear hits? 

mutation_identity = pd.read_csv("../output/03.annotation/snps_with_all_annotation.csv")
allinfo = mutation_identity

In [24]:
combined_significant_hits.query("not known and type=='sequential'").to_csv("snps_with_influence.csv")

In [10]:
combined_significant_hits["test_position"] = combined_significant_hits["test_position"].astype(float).astype(int)
merged = allinfo.merge(combined_significant_hits, how="inner", left_on="pos", right_on='test_position')
cols = ['rs', 'test_position', 'known_position', 'beta', 'p_wald',
        'N_i', 'N_j', 'N_ij', 'first_mutation', 'second_mutation', 'first_gene_name_i', 'second_gene_name_i', 
        'drug',
        'SNP type', 'AA change']

all_hits = pd.DataFrame()

for drug in combined_significant_hits.drug.unique():
    print(drug)
    of_interest = merged.query('drug==@drug').drop_duplicates()
    #print(of_interest)
    all_hits = pd.concat([all_hits, of_interest])
    #print(merged.query('drug_x==@drug')[cols].drop_duplicates())


KANAMYCIN
AMIKACIN
CAPREOMYCIN
MOXIFLOXACIN
ETHIONAMIDE
ISONIAZID
STREPTOMYCIN
PYRAZINAMIDE
RIFAMPICIN
ETHAMBUTOL


In [11]:
all_hits.to_csv("combined_GEMMA_results_potentiators.csv")

## Table of significant consequential mutations

In [12]:
all_hits.query("type=='sequential'").beta

32     1.161270
33     1.748137
2      1.793741
3      1.286061
37     2.190674
93     0.987378
96     0.806287
100    2.194741
101    1.331669
94     1.082415
131    1.372000
290   -5.210318
291   -5.201780
20     1.577086
95     0.297099
296    0.391100
Name: beta, dtype: float64

# Construct a table sumamrizing heritability

In [13]:
pve_all = pd.read_csv("PVE_all_MAF0p001.csv")
pve_all = pve_all.query("drug != 'LEVOFLOXACIN'")


In [14]:
! cp ../../../newRepo/DependentMutations/GEMMA/PVE_homoplasic_sites..csv .
pve_homoplasy = pd.read_csv("PVE_homoplasic_sites..csv", index_col=0)
pve_homoplasy = pve_homoplasy.query("drug != 'LEVOFLOXACIN'")
pve_homoplasy.median()

pve          0.641346
se_pve       0.033014
N_snps    1967.000000
dtype: float64

In [16]:
pve_pairs = pd.read_csv("PVE_ABR_dep_sites.csv", index_col=0)
pve_pairs = pve_pairs.query("drug != 'LEVOFLOXACIN'")
pve_pairs.median()

pve         0.528296
se_pve      0.038464
N_snps    391.500000
dtype: float64

In [21]:
! cp ../../../newRepo/DependentMutations/GEMMA/PVE_ABR_possible.csv .
pve_known = pd.read_csv("PVE_ABR_possible.csv", index_col=0)
pve_known = pve_known.query("drug != 'LEVOFLOXACIN'")
pve_known.median()

pve        0.317487
se_pve     0.046780
N_snps    84.500000
dtype: float64

In [17]:
pve_pairs_potentiators = pd.read_csv("PVE_ABR_dep_sites_with_potentiators.csv", index_col=0)
pve_pairs_potentiators = pve_pairs_potentiators.query("drug != 'LEVOFLOXACIN'")
pve_pairs_potentiators.median()

pve          0.538426
se_pve       0.042285
N_snps    1061.000000
dtype: float64

In [22]:
heritability_table = pve_homoplasy.merge(pve_known, on="drug", suffixes=["_homoplastic","_antibiotic_sites"])
heritability_table = heritability_table.merge(pve_pairs, on="drug")
heritability_table = heritability_table.merge(pve_pairs_potentiators, on="drug", suffixes =["_pairs", "_pairs_and_potentiators"])
heritability_table.to_csv("heritability_combined_table.csv")

In [23]:
heritability_table

Unnamed: 0,drug,pve_homoplastic,se_pve_homoplastic,N_snps_homoplastic,pve_antibiotic_sites,se_pve_antibiotic_sites,N_snps_antibiotic_sites,pve_pairs,se_pve_pairs,N_snps_pairs,pve_pairs_and_potentiators,se_pve_pairs_and_potentiators,N_snps_pairs_and_potentiators
0,AMIKACIN,0.695975,0.028381,2013.0,0.495295,0.050414,102.0,0.559449,0.038182,393.0,0.531447,0.039248,1006.0
1,CAPREOMYCIN,0.54397,0.046517,1665.0,0.281788,0.052627,93.0,0.398981,0.050106,287.0,0.408906,0.049987,708.0
2,ETHAMBUTOL,0.591797,0.035415,2389.0,0.27336,0.043146,76.0,0.473061,0.038747,753.0,0.448436,0.043514,1196.0
3,ETHIONAMIDE,0.545765,0.04519,1920.0,0.11815,0.032392,61.0,0.258935,0.049685,216.0,0.455398,0.051624,520.0
4,ISONIAZID,0.709656,0.027447,2393.0,0.218918,0.042192,69.0,0.585139,0.036685,855.0,0.66962,0.033992,1116.0
5,KANAMYCIN,0.698454,0.028844,1921.0,0.507033,0.05652,68.0,0.627253,0.037391,355.0,0.591235,0.040196,963.0
6,MOXIFLOXACIN,0.69266,0.032958,1761.0,0.323647,0.065147,28.0,0.435108,0.067749,46.0,0.545404,0.051583,558.0
7,PYRAZINAMIDE,0.62214,0.043849,1727.0,0.645211,0.052297,157.0,0.641774,0.050956,390.0,0.599488,0.055374,1891.0
8,RIFAMPICIN,0.660552,0.029532,2437.0,0.311326,0.036654,152.0,0.614748,0.032149,999.0,0.593569,0.034536,1782.0
9,STREPTOMYCIN,0.591107,0.03307,2397.0,0.327495,0.039179,211.0,0.497144,0.036623,989.0,0.433276,0.041056,1723.0
