In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse, pickle

who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv")
pd.set_option('display.float_format', lambda x: '%.4f' % x)
samples_summary = pd.read_csv("../data/samples_summary.csv")

In [4]:
dlm = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue/Delamanid/tiers=1/phenos=WHO/dropAF_noSyn/model_analysis.csv")

In [13]:
dlm

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,OR_LB,OR_UB
0,ddn_lof,0.2970,0.1632,0.3735,0.0000,0.0000,0.0000,,,1.3458,1.1773,1.4528
1,fgd1_lof,0.1569,-0.0000,0.2102,0.0041,0.0428,0.2570,,,1.1699,1.0000,1.2339
2,ddn_p.Val127Phe,0.1557,-0.0246,0.2248,0.0023,0.0283,0.1414,,,1.1685,0.9757,1.2520
3,fbiB_p.Lys448Arg,0.1363,-0.0291,0.2688,0.0445,0.1105,1.0000,3642877.0000,3) Uncertain significance,1.1460,0.9713,1.3085
4,fbiC_lof,0.1302,-0.0535,0.2157,0.0367,0.0989,1.0000,,,1.1390,0.9479,1.2408
...,...,...,...,...,...,...,...,...,...,...,...,...
57,fbiC_p.Val41Met,-0.0466,-0.0827,0.0000,0.0245,0.0843,1.0000,1303051.0000,3) Uncertain significance,0.9544,0.9206,1.0000
58,ddn_p.Glu83Asp,-0.0598,-0.0947,-0.0201,0.0010,0.0198,0.0595,3987092.0000,3) Uncertain significance,0.9419,0.9096,0.9801
59,fbiC_c.-32A>G,-0.0646,-0.0976,-0.0191,0.0006,0.0187,0.0374,1302899.0000,3) Uncertain significance,0.9374,0.9071,0.9810
60,fbiA_p.Arg304Gln,-0.0796,-0.1329,0.0000,0.0465,0.1109,1.0000,3641453.0000,3) Uncertain significance,0.9235,0.8755,1.0000


In [7]:
who_variants.query("drug=='DLM'").confidence.unique()

array(['3) Uncertain significance', '2) Assoc w R - Interim'],
      dtype=object)

In [8]:
who_variants.query("drug=='DLM' & confidence=='2) Assoc w R - Interim'")

Unnamed: 0.1,Unnamed: 0,drug,genome_index,confidence,gene,variant
5248,3804,DLM,3986989,2) Assoc w R - Interim,ddn,ddn_L49P


In [9]:
samples_summary.query("Tier1_LOF==0")

Unnamed: 0,Drug,Genos,Binary_Phenos,MICs,Lineages,SNP_Matrix,Tier1_LOF,Tier1_MultiInframe,Tier2_LOF,Tier2_MultiInframe
4,Delamanid,11803,11803,11353,11491,11803,0,0,0,0
9,Levofloxacin,27576,27576,11447,26909,27576,0,0,6,0
10,Linezolid,18010,18010,12626,17062,18010,0,0,0,0
11,Moxifloxacin,22783,22783,12753,22116,22783,0,0,8,0


In [27]:
add_path = "/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue/Delamanid/tiers=1/phenos=WHO/dropAF_noSyn"

In [28]:
add_path

'/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue/Delamanid/tiers=1/phenos=WHO/dropAF_noSyn'

In [29]:
[2 if "+2" in add_path else 1][0]

1

In [30]:
["ALL" if "ALL" in add_path else "WHO"][0]

'WHO'

In [31]:
int("unpooled" in add_path)

0

In [32]:
int("withSyn" in add_path)

0

In [33]:
["AF" if "encodeAF" in add_path else "drop"][0]

'drop'

# Combined Analysis Files

In [150]:
def final_processing(drug):
    '''
    Functions for processing outputs before sending to everyone else.
    
    1. Remove principal components (will describe them separately)
    2. Add LOF to the predicted_effect column for pooled LOF mutations
    3. Remove genome_index column (should actually do that earlier, but will fix later)
    4. Remove the logistic regression coefficient columns (they will prefer to work with odds ratios)
    5. Any other column renaming or dropping for clarity
    '''
    
    analysis_df = pd.read_csv(f"/n/data1/hms/dbmi/farhat/ye12/who/analysis/{drug}/final_analysis.csv")
    analysis_df.rename(columns={"orig_variant": "mutation", "Tier1_only": "Tier", "WHO_phenos": "Phenos"}, inplace=True)
    
    # remove logReg coefficients. Keep only odds ratios. Remove the other two columns, which were present mainly for me to see
    # if we were picking up many mutations that were in the 2021 mutation catalog
    del analysis_df["genome_index"]
    del analysis_df["confidence_WHO_2021"]
    analysis_df = analysis_df[analysis_df.columns[~analysis_df.columns.str.contains("coef")]]
    
    # remove significant principal components and replace the NaNs in the predicted effect column for the gene loss of functions
    analysis_df = analysis_df.loc[~analysis_df["mutation"].str.contains("PC")]
    analysis_df.loc[analysis_df["mutation"].str.contains("lof"), "predicted_effect"] = "LOF"
    
    # predicted effect should not be NaN for anything. position is NaN only for the pooled LOF mutations
    assert len(analysis_df.loc[pd.isnull(analysis_df["predicted_effect"])]) == 0
    assert len(analysis_df.loc[(~analysis_df["mutation"].str.contains("lof")) & (pd.isnull(analysis_df["position"]))]) == 0
    
    analysis_df["Tier"] = analysis_df["Tier"].map({1: 1, 0: 2})
    analysis_df["Phenos"] = analysis_df["Phenos"].map({1: "WHO", 0: "ALL"})
    
    # reorder columns
    analysis_df = analysis_df[['mutation', 'predicted_effect', 'position', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval', 'Bonferroni_pval',
       'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN', 'Sens_LB', 'Sens', 'Sens_UB', 'Spec_LB', 'Spec', 'Spec_UB', 'PPV', 'PPV_LB', 'PPV_UB',
       'LR+_LB', 'LR+', 'LR+_UB', 'LR-_LB', 'LR-', 'LR-_UB', 'Tier', 'Phenos', 'poolLOF', 'Syn']]
    
    analysis_df[['poolLOF', "Syn"]] = analysis_df[['poolLOF', "Syn"]].astype(int)
        
    return analysis_df

In [151]:
analysis_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis"
finished_drugs = []

for drug in os.listdir(analysis_dir):
    
    if os.path.isfile(os.path.join(analysis_dir, drug, "final_analysis.csv")):
        finished_drugs.append(drug)
        # drug_analyses.append(final_processing(drug))

In [152]:
print(finished_drugs)

['Levofloxacin', 'Pyrazinamide', 'Streptomycin', 'Amikacin', 'Clofazimine', 'Linezolid', 'Moxifloxacin', 'Kanamycin', 'Bedaquiline', 'Capreomycin', 'Delamanid', 'Ethionamide']


In [156]:
# those that are actually done
finished_drugs = ['Pyrazinamide', 'Amikacin', 'Clofazimine', 'Linezolid', 'Moxifloxacin', 'Kanamycin', 'Bedaquiline', 'Capreomycin', 'Delamanid']

drug_analyses = {}
for drug in finished_drugs:
    drug_analyses[drug] = final_processing(drug)
    
# write results to an Excel file, where each sheet is named for a drug
with pd.ExcelWriter("Farhat_logReg_analysis.xlsx") as file:
   
    for key, val in drug_analyses.items():
        val.to_excel(file, sheet_name=key, index=False)

In [140]:
drug_analyses[4].query("Tier==2&Phenos=='WHO'")

Unnamed: 0,mutation,predicted_effect,position,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,...,LR+_LB,LR+,LR+_UB,LR-_LB,LR-,LR-_UB,Tier,Phenos,poolLOF,Syn


In [136]:
drug_analyses[0].query("Tier==1&Phenos=='ALL'")

Unnamed: 0,mutation,predicted_effect,position,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,...,LR+_LB,LR+,LR+_UB,LR-_LB,LR-,LR-_UB,Tier,Phenos,poolLOF,Syn
1591,panD_c.405G>C,synonymous_variant,4043877,0.9855,0.9794,1.0,0.0368,0.0489,1.0,,...,,inf,,,1.0,,1,ALL,1,1
1665,clpC1_c.1863G>A,synonymous_variant,4038842,0.9852,0.9795,1.0,0.0362,0.049,1.0,,...,,inf,,,1.0,,1,ALL,1,1
1817,clpC1_c.1842G>T,synonymous_variant,4038863;4038860,0.9826,0.9726,1.0,0.04,0.0499,1.0,,...,,inf,,,1.0,,1,ALL,1,1


In [103]:
drug_analyses[0].query("predicted_effect=='synonymous_variant' & ")

Unnamed: 0,mutation,predicted_effect,position,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,...,PPV_UB,LR+_LB,LR+,LR+_UB,LR-_LB,LR-,LR-_UB,Tier1,WHO_phenos,poolLOF
0,pncA_lof,LOF,,2.2523,2.1759,2.3512,0.0000,0.0000,0.0000,,...,0.9483,37.8260,49.5805,70.8301,0.8640,0.8746,0.8852,1,1,1
1,pncA_p.His57Asp,missense_variant,2289073,1.7228,1.6757,1.7705,0.0000,0.0000,0.0000,1) Assoc w R,...,1.0000,121.1180,317.1196,inf,0.9564,0.9622,0.9677,1,1,1
2,pncA_c.-11A>G,upstream_gene_variant,2289252,1.5396,1.4875,1.5948,0.0000,0.0000,0.0000,1) Assoc w R,...,0.9697,31.1115,53.8721,120.8069,0.9630,0.9683,0.9738,1,1,1
5,pncA_p.His51Asp,missense_variant,2289091,1.3459,1.3089,1.3795,0.0000,0.0000,0.0000,1) Assoc w R,...,1.0000,48.2829,175.7530,inf,0.9865,0.9895,0.9926,1,1,1
6,pncA_p.Val131fs,frameshift,2288850;2288851;2288848;2288852;2288849,1.3420,1.3212,1.3744,0.0000,0.0000,0.0000,,...,1.0000,57.5839,202.4981,inf,0.9845,0.9879,0.9910,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2534,rpsA_p.Ile70Leu,missense_variant,1833749,0.8743,0.8654,0.8848,0.0000,0.0000,0.0000,,...,,,0.0000,,,1.0030,,0,0,1
2535,Rv3236c_p.Val151Ala,missense_variant,3612665,0.8738,0.7786,0.9736,0.0096,0.0329,1.0000,5) Not assoc w R,...,,,0.8145,,,1.0052,,0,0,1
2537,Rv3236c_c.546C>T,synonymous_variant,3612571,0.8573,0.8438,0.8701,0.0000,0.0000,0.0000,,...,,,0.0000,,,1.0066,,0,0,1
2538,Rv1258c_c.1029T>C,synonymous_variant,1406312,0.8513,0.7757,0.9338,0.0004,0.0027,1.0000,,...,,,0.1567,,,1.0202,,0,0,1


In [35]:
moxi_df.loc[pd.isnull(moxi_df["position"])]

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,Odds_Ratio,OR_LB,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,predicted_effect,position
19,glpK_lof,0.1,0.0397,0.1618,0.0006,0.0065,1.0,,1.1052,1.0405,...,0.0144,0.0229,0.9851,0.9884,1.0648,1.818,0.9896,0.9991,LOF,
404,Rv2752c_lof,0.0041,-0.0609,0.0741,0.4522,0.5002,1.0,,1.0042,0.9409,...,0.0186,0.028,0.9876,0.9904,1.6353,2.65,0.9827,0.9927,LOF,
437,gyrA_lof,-0.0003,-0.001,0.0,0.147,0.1789,1.0,,0.9997,0.999,...,,,,,,,,,LOF,
539,Rv1129c_lof,-0.0032,-0.0476,0.0212,0.4424,0.4901,1.0,,0.9968,0.9535,...,,,,,,,,,LOF,
969,Rv2477c_lof,-0.0098,-0.0147,0.0,0.0397,0.0615,1.0,,0.9903,0.9854,...,,,,,,,,,LOF,


# Catalog-Based Method Comparison

In [20]:
def get_logReg_summary(out_dir):
    
    model_summary = pd.read_csv(os.path.join(out_dir, "logReg_summary.csv"))
    model_analysis = pd.read_csv(os.path.join(out_dir, "model_analysis.csv"))
    
    summary_sens = model_summary.loc[0, "Sens"]
    summary_spec = model_summary.loc[0, "Spec"]
    
    #max_idx = [np.argmax(model_analysis["Sens"]), np.argmax(model_analysis["Spec"])]
    
    better_variant = model_analysis.query("Sens >= @summary_sens & Spec >= @summary_spec")
    
    if len(better_variant) == 0:
        print("No variants have comparable sensitivity AND specificity")
    else:
        print(better_variant)
    return model_summary, model_analysis
    
    #return model_summary, model_analysis.iloc[max_idx, :][["orig_variant", "coef", "Sens", "Spec", "accuracy", "balanced_accuracy"]]

In [26]:
#out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Levofloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn"

summary, analysis = get_logReg_summary(out_dir)
summary

No variants have comparable sensitivity AND specificity


Unnamed: 0,Sens,Spec,accuracy,balanced_accuracy
0,0.8336,0.9084,0.8951,0.871


In [27]:
analysis.query("Sens > 0.9006")

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy


In [29]:
analysis.loc[analysis["orig_variant"].str.contains("PC")]

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy
35,PC2,0.166,0.0355,0.3034,0.0078,0.0301,1.0,,,1.1806,...,,,,,,,,,,
57,PC0,0.1346,0.0166,0.263,0.014,0.0492,1.0,,,1.144,...,,,,,,,,,,


In [28]:
analysis.query("Spec > 0.9575")

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,...,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,accuracy,balanced_accuracy
0,pncA_c.-11A>G,0.4088,0.3700,0.4467,0.0000,0.0000,0.0000,2289252.0000,1) Assoc w R,1.5050,...,0.0274,0.0395,0.9993,0.9999,44.4364,284.1349,0.9608,0.9731,0.8278,0.5163
1,pncA_p.His51Asp,0.3308,0.3054,0.3591,0.0000,0.0000,0.0000,2289091.0000,1) Assoc w R,1.3921,...,0.0095,0.0177,0.9998,1.0000,63.1368,inf,0.9823,0.9905,0.8246,0.5067
2,pncA_p.Val131fs,0.2900,0.2684,0.3154,0.0000,0.0000,0.0000,,,1.3365,...,0.0090,0.0165,0.9998,1.0000,54.4333,inf,0.9836,0.9912,0.8245,0.5064
3,pncA_p.Gln141Pro,0.2869,0.2529,0.3273,0.0000,0.0000,0.0000,2288820.0000,1) Assoc w R,1.3323,...,0.0151,0.0243,0.9995,0.9999,35.3919,344.8754,0.9759,0.9851,0.8256,0.5097
4,pncA_p.Leu172Pro,0.2779,0.2524,0.3046,0.0000,0.0000,0.0000,2288727.0000,1) Assoc w R,1.3204,...,0.0062,0.0129,1.0000,1.0000,inf,inf,0.9871,0.9938,0.8239,0.5046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,rpsA_p.Ile70Leu,-0.1249,-0.1356,-0.1120,0.0000,0.0000,0.0000,,,0.8826,...,0.0000,0.0000,0.9966,0.9982,0.0000,0.0000,1.0018,1.0034,0.1798,0.5013
855,PPE35_p.Pro670Leu,-0.1287,-0.1817,-0.0860,0.0000,0.0000,0.0009,2168604.0000,5) Not assoc w R,0.8792,...,0.0003,0.0024,0.9800,0.9842,0.0156,0.1377,1.0145,1.0196,0.1922,0.5084
856,Rv1258c_p.Gly363Val,-0.1288,-0.1399,-0.1158,0.0000,0.0000,0.0000,1406253.0000,3) Uncertain significance,0.8791,...,0.0000,0.0000,0.9965,0.9981,0.0000,0.0000,1.0019,1.0036,0.1799,0.5013
857,Rv3236c_c.546C>T,-0.1577,-0.1859,-0.1337,0.0000,0.0000,0.0000,,,0.8541,...,0.0000,0.0000,0.9916,0.9942,0.0000,0.0000,1.0058,1.0085,0.1835,0.5035


In [19]:
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"

get_logReg_summary(out_dir)

No variants have comparable sensitivity AND specificity


Unnamed: 0,Sens,Spec,accuracy,balanced_accuracy
0,0.923,0.8945,0.8983,0.9088


In [38]:
df_pza = pd.read_csv("/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn/phenos.csv")

# the missing ones might be M. cannettii, most similar to L6 based on the other lineage callers
lineages = pd.read_pickle("../data/combined_lineage_sample_IDs.pkl")
lineages["Lineage"] = lineages["Lineage"].fillna("6")
lineages["Lineage_1"] = lineages["Lineage_1"].fillna("6")

lineages = lineages[["Sample Name", "Sample ID", "Lineage_1"]]
lineages["Lineage"] = [str(val).split(".")[0] for val in lineages["Lineage_1"].values]
lineages.loc[lineages["Lineage"].str.contains("BOV"), "Lineage"] = "M. bovis"

assert len(lineages.loc[pd.isnull(lineages["Lineage"])]) == 0

########## KEEP ONLY ISOLATES WITH ALL 3 PIECES OF DATA ##########

# get only isolates with data for everyting: SNP matrix, in the model, and lineages
combined = lineages.merge(df_pza, left_on="Sample ID", right_on="sample_id")

In [42]:
combined.groupby("Lineage")["phenotype"].mean().sort_values(ascending=False)

Lineage
M. bovis   0.9153
2          0.3750
1          0.1503
6          0.1163
4          0.1112
5          0.1111
3          0.0554
7          0.0000
Name: phenotype, dtype: float64

In [45]:
combined.query("Lineage == 'M. bovis'").Lineage_1.unique()

array(['BOV_AFRI', 'BOV'], dtype=object)