In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 150
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import timeit

import glob, os, yaml, subprocess, itertools, sparse, vcf
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from statsmodels.distributions.empirical_distribution import ECDF

# WHO catalogue of graded resistance variants; the per-drug sections below
# filter this frame by `drug` and `confidence`.
who_variants = pd.read_csv(
    "/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv"
)
# Render floats in displayed frames with exactly four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

# Rifampicin

In [2]:
# Model-analysis table for Rifampicin (run directory `tiers=1_drop_HET`).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
rif_res = pd.read_pickle(
    "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Drugs/Rifampicin/tiers=1_drop_HET/model_analysis.pkl"
)

# number of principal components with positive coefficients (OR > 1):
# keep rows with coef > 0, then those whose orig_variant contains 'PC'
print(
    len(
        rif_res
        .query("coef > 0")
        .loc[lambda positive: positive.orig_variant.str.contains('PC')]
    )
)

1


In [3]:
# Rifampicin variants with a positive coefficient and an unadjusted p-value
# below 0.05, restricted to the effect-size, PPV, p-value and WHO-grading columns.
rif_res.loc[
    (rif_res.coef > 0) & (rif_res.pval < 0.05),
    [
        "orig_variant",
        "OR_Lower_CI", "Odds_Ratio", "OR_Upper_CI",
        "PPV_Lower_CI", "PPV", "PPV_Upper_CI",
        "pval", "BH_pval", "Bonferroni_pval",
        "confidence_WHO_2021",
    ],
]

Unnamed: 0,orig_variant,OR_Lower_CI,Odds_Ratio,OR_Upper_CI,PPV_Lower_CI,PPV,PPV_Upper_CI,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021
0,rpoB_p.Ser450Leu,19.5081,22.0246,25.3000,0.9847,0.9888,0.9927,0.0000,0.0000,0.0000,1) Assoc w R
1,rpoB_p.Asp435Val,3.1741,3.5438,4.0448,0.9659,0.9808,0.9938,0.0000,0.0000,0.0000,1) Assoc w R
2,rpoB_p.His445Tyr,2.3444,2.5928,2.8423,0.9662,0.9869,1.0000,0.0000,0.0000,0.0000,1) Assoc w R
3,rpoB_p.His445Asp,2.2848,2.5182,2.9203,0.9543,0.9778,0.9947,0.0000,0.0000,0.0000,1) Assoc w R
4,rpoB_p.Ser450Phe,1.7632,1.9538,2.1613,0.9565,0.9872,1.0000,0.0000,0.0000,0.0000,1) Assoc w R
...,...,...,...,...,...,...,...,...,...,...,...
271,rpoB_p.Glu284Gly,0.9999,1.0172,1.0268,1.0000,1.0000,1.0000,0.0449,0.2016,1.0000,3) Uncertain significance
272,rpoB_p.Ala527Thr,1.0000,1.0171,1.0277,1.0000,1.0000,1.0000,0.0436,0.1995,1.0000,
274,rpoB_p.Asn501Lys,0.9999,1.0148,1.0205,1.0000,1.0000,1.0000,0.0338,0.2103,1.0000,3) Uncertain significance
275,rpoB_p.Phe503Val,0.9999,1.0148,1.0205,1.0000,1.0000,1.0000,0.0338,0.2103,1.0000,3) Uncertain significance


In [4]:
# WHO-catalogue rifampicin variants graded as resistance-associated:
# confidence group 1 ("1) Assoc w R") or group 2 ("2) Assoc w R - Interim").
# The pattern is anchored to the leading "<group>)" prefix so that a 1 or 2
# appearing elsewhere in a label cannot match, and na=False keeps rows with a
# missing confidence label out of the mask instead of propagating NaN.
who_variants.loc[
    (who_variants.drug == 'RIF')
    & who_variants.confidence.str.contains(r"^\s*[12]\)", na=False)
]

Unnamed: 0.1,Unnamed: 0,drug,genome_index,confidence,gene,variant
14365,14283,RIF,761154761155,1) Assoc w R,rpoB,rpoB_S450L
14366,14284,RIF,761155,1) Assoc w R,rpoB,rpoB_S450L
14367,14285,RIF,761110,1) Assoc w R,rpoB,rpoB_D435V
14368,14286,RIF,761139,1) Assoc w R,rpoB,rpoB_H445Y
14369,14287,RIF,761139,1) Assoc w R,rpoB,rpoB_H445D
...,...,...,...,...,...,...
14508,14426,RIF,761115,2) Assoc w R - Interim,rpoB,rpoB_N437Y
14509,14427,RIF,761122,2) Assoc w R - Interim,rpoB,rpoB_P439L
14510,14428,RIF,761112761114,2) Assoc w R - Interim,rpoB,rpoB_Q436N
14511,14429,RIF,761085761086,2) Assoc w R - Interim,rpoB,rpoB_T427G


# Amikacin

In [5]:
# Model-analysis table for Amikacin (run directory `tiers=1_2_drop_HET`).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
ami_res = pd.read_pickle(
    "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Drugs/Amikacin/tiers=1_2_drop_HET/model_analysis.pkl"
)

# number of principal components with positive coefficients (OR > 1):
# keep rows with coef > 0, then those whose orig_variant contains 'PC'
print(
    len(
        ami_res
        .query("coef > 0")
        .loc[lambda positive: positive.orig_variant.str.contains('PC')]
    )
)

1


In [6]:
# Amikacin variants with a positive coefficient and an unadjusted p-value
# below 0.05, restricted to the effect-size, PPV, p-value and WHO-grading columns.
ami_res.loc[
    (ami_res.coef > 0) & (ami_res.pval < 0.05),
    [
        "orig_variant",
        "OR_Lower_CI", "Odds_Ratio", "OR_Upper_CI",
        "PPV_Lower_CI", "PPV", "PPV_Upper_CI",
        "pval", "BH_pval", "Bonferroni_pval",
        "confidence_WHO_2021",
    ],
]

Unnamed: 0,orig_variant,OR_Lower_CI,Odds_Ratio,OR_Upper_CI,PPV_Lower_CI,PPV,PPV_Upper_CI,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021
0,rrs_n.1401A>G,6.4087,7.0443,8.0294,0.8967,0.9187,0.9385,0.0,0.0,0.0,1) Assoc w R
1,eis_c.-14C>T,1.3814,1.5138,1.6704,0.2047,0.2791,0.3592,0.0,0.0,0.0,1) Assoc w R
2,eis_c.-12C>T,1.1052,1.3425,1.5844,0.0472,0.0659,0.0865,0.0008,0.5171,1.0,5) Not assoc w R
3,rrs_n.1484G>T,1.2022,1.2747,1.4167,0.6667,0.9,1.0,0.0,0.0014,0.0056,2) Assoc w R - Interim
4,rrs_n.517C>T,0.9537,1.2404,1.5236,0.0927,0.1326,0.1743,0.0372,1.0,1.0,5) Not assoc w R
6,eis_c.-10G>A,0.9973,1.1783,1.3266,0.0217,0.0495,0.0814,0.0134,1.0,1.0,3) Uncertain significance
7,rrs_n.160dupT,1.0,1.178,1.2014,1.0,1.0,1.0,0.002,0.966,1.0,
8,aftB_p.Asp397Gly,1.0525,1.1706,1.3059,0.1557,0.1702,0.1839,0.0018,1.0,1.0,5) Not assoc w R
9,eis_c.-37G>T,1.0074,1.1706,1.2919,0.0248,0.066,0.1186,0.007,1.0,1.0,3) Uncertain significance
10,eis_p.Arg106His,0.9719,1.1483,1.2164,0.0,0.3333,1.0,0.0488,1.0,1.0,3) Uncertain significance


In [7]:
# WHO-catalogue amikacin variants graded as resistance-associated:
# confidence group 1 ("1) Assoc w R") or group 2 ("2) Assoc w R - Interim").
# The pattern is anchored to the leading "<group>)" prefix so that a 1 or 2
# appearing elsewhere in a label cannot match, and na=False keeps rows with a
# missing confidence label out of the mask instead of propagating NaN.
who_variants.loc[
    (who_variants.drug == 'AMI')
    & who_variants.confidence.str.contains(r"^\s*[12]\)", na=False)
]

Unnamed: 0.1,Unnamed: 0,drug,genome_index,confidence,gene,variant
1634,1,AMI,1473246,1) Assoc w R,rrs,rrs_a1401g
1635,2,AMI,2715346,1) Assoc w R,eis,eis_c-14t
1636,3,AMI,1473329,2) Assoc w R - Interim,rrs,rrs_g1484t
1637,4,AMI,1473247,2) Assoc w R - Interim,rrs,rrs_c1402t


# Moxifloxacin

In [19]:
# Model-analysis table for Moxifloxacin (run directory `tiers=1_drop_HET`).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
moxi_res = pd.read_pickle(
    "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Drugs/Moxifloxacin/tiers=1_drop_HET/model_analysis.pkl"
)

# number of principal components with positive coefficients (OR > 1):
# keep rows with coef > 0, then those whose orig_variant contains 'PC'
print(
    len(
        moxi_res
        .query("coef > 0")
        .loc[lambda positive: positive.orig_variant.str.contains('PC')]
    )
)

1


In [20]:
# Moxifloxacin variants with a positive coefficient and an unadjusted p-value
# below 0.05, restricted to the effect-size, PPV, p-value and WHO-grading columns.
moxi_res.loc[
    (moxi_res.coef > 0) & (moxi_res.pval < 0.05),
    [
        "orig_variant",
        "OR_Lower_CI", "Odds_Ratio", "OR_Upper_CI",
        "PPV_Lower_CI", "PPV", "PPV_Upper_CI",
        "pval", "BH_pval", "Bonferroni_pval",
        "confidence_WHO_2021",
    ],
]

Unnamed: 0,orig_variant,OR_Lower_CI,Odds_Ratio,OR_Upper_CI,PPV_Lower_CI,PPV,PPV_Upper_CI,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021
0,gyrA_p.Asp94Gly,5.8685,6.7267,7.8214,0.9202,0.9507,0.9769,0.0,0.0,0.0,1) Assoc w R
1,gyrA_p.Ala90Val,2.261,2.5273,2.8391,0.4967,0.5833,0.6643,0.0,0.0,0.0,1) Assoc w R
2,gyrA_p.Asp94Asn,1.9486,2.2062,2.5763,0.85,0.9429,1.0,0.0,0.0,0.0,1) Assoc w R
3,gyrA_p.Asp94Ala,1.9291,2.1283,2.3767,0.6329,0.7246,0.8214,0.0,0.0,0.0,1) Assoc w R
4,gyrA_p.Asp94Tyr,1.8112,2.0168,2.2653,0.8749,0.963,1.0,0.0,0.0,0.0,1) Assoc w R
5,gyrA_p.Ser91Pro,1.6925,1.8681,2.1279,0.6571,0.8,0.9231,0.0,0.0,0.0,1) Assoc w R
6,gyrB_p.Glu501Asp,1.4879,1.6287,1.7123,1.0,1.0,1.0,0.0,0.0,0.0,1) Assoc w R
7,gyrA_p.Asp89Asn,1.5012,1.5885,1.6538,1.0,1.0,1.0,0.0,0.0,0.0,3) Uncertain significance
8,gyrA_p.Gly88Cys,1.3352,1.4216,1.4754,1.0,1.0,1.0,0.0,0.0,0.0,1) Assoc w R
9,gyrA_p.Asp94His,1.2781,1.3987,1.5851,0.5542,0.875,1.0,0.0,0.0,0.0003,1) Assoc w R


In [10]:
# WHO-catalogue moxifloxacin variants graded as resistance-associated:
# confidence group 1 ("1) Assoc w R") or group 2 ("2) Assoc w R - Interim").
# The pattern is anchored to the leading "<group>)" prefix so that a 1 or 2
# appearing elsewhere in a label cannot match, and na=False keeps rows with a
# missing confidence label out of the mask instead of propagating NaN.
who_variants.loc[
    (who_variants.drug == 'MXF')
    & who_variants.confidence.str.contains(r"^\s*[12]\)", na=False)
]

Unnamed: 0.1,Unnamed: 0,drug,genome_index,confidence,gene,variant
12725,12535,MXF,7582,1) Assoc w R,gyrA,gyrA_D94G
12726,12536,MXF,7570,1) Assoc w R,gyrA,gyrA_A90V
12727,12537,MXF,7581,1) Assoc w R,gyrA,gyrA_D94N
12728,12538,MXF,7582,1) Assoc w R,gyrA,gyrA_D94A
12729,12539,MXF,7572,1) Assoc w R,gyrA,gyrA_S91P
12730,12540,MXF,7581,1) Assoc w R,gyrA,gyrA_D94Y
12731,12541,MXF,6742,1) Assoc w R,gyrB,gyrB_E501D
12732,12542,MXF,7563,1) Assoc w R,gyrA,gyrA_G88C
12733,12543,MXF,7581,1) Assoc w R,gyrA,gyrA_D94H
12734,12544,MXF,6620,2) Assoc w R - Interim,gyrB,gyrB_D461N


# Pyrazinamide

In [11]:
# Model-analysis table for Pyrazinamide (run directory `tiers=1_drop_HET`).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
pza_res = pd.read_pickle(
    "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Drugs/Pyrazinamide/tiers=1_drop_HET/model_analysis.pkl"
)

# number of principal components with positive coefficients (OR > 1):
# keep rows with coef > 0, then those whose orig_variant contains 'PC'
print(
    len(
        pza_res
        .query("coef > 0")
        .loc[lambda positive: positive.orig_variant.str.contains('PC')]
    )
)

4


In [12]:
# Preview the first 20 rows of the Pyrazinamide model-analysis table (all columns).
pza_res.iloc[:20]

Unnamed: 0,orig_variant,coef,Lower_CI,Upper_CI,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,OR_Lower_CI,OR_Upper_CI,PPV,NPV,PPV_Lower_CI,PPV_Upper_CI,NPV_Lower_CI,NPV_Upper_CI
0,pncA_p.His57Asp,0.5623,0.5298,0.5966,0.0,0.0,0.0,2289073.0,1) Assoc w R,1.7546,1.6986,1.816,0.9871,0.8505,0.9673,1.0,0.8451,0.8554
1,PC0,0.4427,0.3908,0.5091,0.0,0.0,0.0,,,1.5569,1.4782,1.6638,,,,,,
2,pncA_c.-11A>G,0.3859,0.345,0.434,0.0,0.0,0.0,2289252.0,1) Assoc w R,1.4709,1.412,1.5434,0.9429,0.8478,0.8911,0.981,0.8425,0.8527
3,pncA_p.His51Asp,0.3159,0.2797,0.3474,0.0,0.0,0.0,2289091.0,1) Assoc w R,1.3714,1.3228,1.4154,0.9792,0.8453,0.9273,1.0,0.8396,0.8504
4,pncA_p.Gln10Pro,0.2779,0.2386,0.3311,0.0,0.0,0.0,2289213.0,1) Assoc w R,1.3204,1.2695,1.3925,0.9508,0.8459,0.8906,1.0,0.8403,0.851
5,pncA_p.Gln141Pro,0.2626,0.2222,0.3177,0.0,0.0,0.0,2288820.0,1) Assoc w R,1.3003,1.2489,1.3739,0.9474,0.8457,0.8776,1.0,0.8402,0.8507
6,pncA_p.Leu172Pro,0.2626,0.2442,0.2774,0.0,0.0,0.0,2288727.0,1) Assoc w R,1.3003,1.2766,1.3196,1.0,0.8445,1.0,1.0,0.839,0.8495
7,pncA_p.Thr76Pro,0.2502,0.2126,0.304,0.0,0.0,0.0,2289016.0,1) Assoc w R,1.2843,1.2369,1.3552,0.9348,0.8451,0.8541,1.0,0.8394,0.8502
8,pncA_p.Asp49Gly,0.228,0.1919,0.2786,0.0,0.0,0.0,2289096.0,1) Assoc w R,1.2561,1.2115,1.3212,0.9167,0.8446,0.8205,1.0,0.8392,0.8496
9,pncA_p.His57Arg,0.2124,0.1742,0.2515,0.0,0.0,0.0,2289072.0,1) Assoc w R,1.2366,1.1903,1.2859,0.9677,0.8445,0.8889,1.0,0.8388,0.8495


In [13]:
# WHO-catalogue pyrazinamide variants in confidence group 1 ("1) Assoc w R").
# Unlike the other drug sections, only group 1 is selected here.
# The pattern is anchored to the leading "1)" prefix so that a 1 appearing
# elsewhere in a label cannot match, and na=False keeps rows with a missing
# confidence label out of the mask instead of propagating NaN.
who_variants.loc[
    (who_variants.drug == 'PZA')
    & who_variants.confidence.str.contains(r"^\s*1\)", na=False)
]

Unnamed: 0.1,Unnamed: 0,drug,genome_index,confidence,gene,variant
13310,13121,PZA,2289073,1) Assoc w R,pncA,pncA_H57D
13311,13122,PZA,2289252,1) Assoc w R,pncA,pncA_a-11g
13312,13123,PZA,2289213,1) Assoc w R,pncA,pncA_Q10P
13313,13124,PZA,2289091,1) Assoc w R,pncA,pncA_H51D
13314,13125,PZA,2289016,1) Assoc w R,pncA,pncA_T76P
...,...,...,...,...,...,...
13418,13229,PZA,2288777,1) Assoc w R,pncA,pncA_465_ins_1_c_ca
13419,13230,PZA,2288955,1) Assoc w R,pncA,pncA_K96R
13420,13231,PZA,2289222,1) Assoc w R,pncA,pncA_V7A
13421,13232,PZA,2288885,1) Assoc w R,pncA,pncA_W119C


# Bedaquiline

In [18]:
# WHO-catalogue bedaquiline variants graded as resistance-associated:
# confidence group 1 ("1) Assoc w R") or group 2 ("2) Assoc w R - Interim").
# The pattern is anchored to the leading "<group>)" prefix so that a 1 or 2
# appearing elsewhere in a label cannot match, and na=False keeps rows with a
# missing confidence label out of the mask instead of propagating NaN.
who_variants.loc[
    (who_variants.drug == 'BDQ')
    & who_variants.confidence.str.contains(r"^\s*[12]\)", na=False)
]

Unnamed: 0.1,Unnamed: 0,drug,genome_index,confidence,gene,variant


In [15]:
# Model-analysis table for Bedaquiline (run directory `tiers=1_syn_binarize`).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
bdq_res = pd.read_pickle(
    "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Drugs/Bedaquiline/tiers=1_syn_binarize/model_analysis.pkl"
)

# number of principal components with positive coefficients (OR > 1):
# keep rows with coef > 0, then those whose orig_variant contains 'PC'
print(
    len(
        bdq_res
        .query("coef > 0")
        .loc[lambda positive: positive.orig_variant.str.contains('PC')]
    )
)

2


In [22]:
# Display the Bedaquiline model-analysis table; as the cell's last expression
# it is rendered as the cell output.
bdq_res

Unnamed: 0,orig_variant,coef,Lower_CI,Upper_CI,pval,BH_pval,Bonferroni_pval,genome_index,confidence_WHO_2021,Odds_Ratio,OR_Lower_CI,OR_Upper_CI,PPV,NPV,PPV_Lower_CI,PPV_Upper_CI,NPV_Lower_CI,NPV_Upper_CI
0,PC0,0.5596,0.3513,0.8065,,,,,,1.75,1.421,2.2401,,,,,,
1,mmpL5_p.Thr794Ile,0.31,0.1335,0.5028,,,,776100.0,5) Not assoc w R,1.3634,1.1428,1.6534,0.3263,0.8897,0.2795,0.3788,0.8357,0.938
2,mmpL5_p.Arg328Gly,0.2199,0.0,0.2736,,,,,,1.2459,1.0,1.3147,1.0,0.7409,1.0,1.0,0.6974,0.7775
3,pepQ_c.72C>T,0.1278,-0.0,0.1716,,,,,,1.1363,1.0,1.1872,1.0,0.7377,1.0,1.0,0.6957,0.774
4,atpE_c.54C>A,0.1266,-0.0,0.1733,,,,,,1.135,1.0,1.1892,1.0,0.7377,1.0,1.0,0.6957,0.7745
5,atpE_p.Ala63Pro,0.1251,-0.0,0.1709,,,,,,1.1332,1.0,1.1864,1.0,0.7377,1.0,1.0,0.6959,0.7735
6,pepQ_p.Ala141Glu,0.1232,-0.0,0.1666,,,,,,1.1311,1.0,1.1813,1.0,0.7377,1.0,1.0,0.6951,0.774
7,mmpL5_c.183G>A,0.1157,-0.1078,0.2968,,,,,,1.1227,0.8978,1.3456,0.5,0.7372,0.0,1.0,0.6951,0.774
8,PC3,0.0178,-0.1897,0.2103,,,,,,1.0179,0.8272,1.2341,,,,,,
9,pepQ_p.Arg7Gln,-0.0041,-0.2405,0.2248,,,,2860399.0,3) Uncertain significance,0.9959,0.7862,1.252,0.5,0.7372,0.0,1.0,0.6951,0.7725
