In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse, pickle

# Render floats with four decimal places in displayed tables.
pd.set_option('display.float_format', lambda value: '%.4f' % value)

# Root directory holding one sub-directory of analysis outputs per drug.
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'

# Reference tables loaded once at the top of the notebook.
who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv")
samples_summary = pd.read_csv("../data/samples_summary.csv")

In [2]:
# Drug whose sub-directory of analysis_dir is read by the cells below.
# (The phenotype / genotype tables for this drug are loaded in the data-preparation cell further down.)
drug = "Delamanid"

In [21]:
def compute_predictive_values(combined_df, return_stats=None):
    '''
    Compute univariate predictive statistics for every variant column of combined_df.

    combined_df must contain the columns "sample_id" and "phenotype" (0/1). Every other column is
    treated as a 0/1 indicator of whether the isolate carries that variant. Rows with a missing value
    for a variant are ignored for that variant (value_counts skips missing values by default).

    For each variant the 2x2 table against the phenotype is counted and the following are derived:

    TP, FP, TN, FN  = counts of the four (variant, phenotype) combinations
    Num_Isolates    = TP + FP (isolates carrying the variant)
    Total_Isolates  = TP + FP + TN + FN
    PPV  = TP / (TP + FP)
    Sens = TP / (TP + FN)
    Spec = TN / (TN + FP)
    LR+  = Sens / (1 - Spec)
    LR-  = (1 - Sens) / Spec

    Ratios with a zero denominator are not special-cased: they come out as inf or NaN.

    Arguments:
        combined_df: one row per isolate, as described above.
        return_stats: optional list of output column names to return. None or an empty list
                      returns the full default set of columns.

    Returns:
        DataFrame with one row per variant (column "orig_variant") and the requested statistics.

    Raises:
        AssertionError if a variant or phenotype value other than 0/1 is present, or if the
        per-variant table does not end up with exactly one row per variant column.
    '''
    default_columns = ["orig_variant", "Num_Isolates", "Total_Isolates", "TP", "FP", "TN", "FN",
                       "PPV", "Sens", "Spec", "LR+", "LR-"]

    # var_name is pinned: by default melt names this column after combined_df.columns.name when one is
    # set (e.g. after a pivot), which would break the groupby below.
    melted = combined_df.melt(id_vars=["sample_id", "phenotype"], var_name="variable")

    # counts of isolates for every (phenotype, variant, carries-variant) combination.
    # reset_index(name=...) names the count column explicitly, independent of the pandas version.
    grouped_df = (
        melted.drop(columns="sample_id")
              .groupby(["phenotype", "variable"])
              .value_counts()
              .reset_index(name="count")
              .rename(columns={"variable": "orig_variant", "value": "variant"})
    )

    # (label, variant value, phenotype value) for the four cells of the 2x2 table
    cell_definitions = (("TP", 1, 1), ("FP", 1, 0), ("TN", 0, 0), ("FN", 0, 1))

    final = None
    num_classified = 0
    for label, variant_value, phenotype_value in cell_definitions:
        in_cell = (grouped_df["variant"] == variant_value) & (grouped_df["phenotype"] == phenotype_value)
        cell_counts = grouped_df.loc[in_cell, ["orig_variant", "count"]].rename(columns={"count": label})
        num_classified += len(cell_counts)

        # outer merges so that a variant missing from one cell is kept (its count becomes 0 below)
        final = cell_counts if final is None else final.merge(cell_counts, on="orig_variant", how="outer")

    # every counted combination must fall into exactly one of the four cells
    assert num_classified == len(grouped_df), "variant and phenotype values must all be 0 or 1"
    final = final.fillna(0)

    assert len(final) == len(melted["variable"].unique())
    assert len(final) == len(final.drop_duplicates("orig_variant"))

    final["Num_Isolates"] = final["TP"] + final["FP"]
    final["Total_Isolates"] = final["TP"] + final["FP"] + final["TN"] + final["FN"]
    final["PPV"] = final["TP"] / (final["TP"] + final["FP"])
    final["Sens"] = final["TP"] / (final["TP"] + final["FN"])
    final["Spec"] = final["TN"] / (final["TN"] + final["FP"])
    final["LR+"] = final["Sens"] / (1 - final["Spec"])
    final["LR-"] = (1 - final["Sens"]) / final["Spec"]

    if return_stats is None or len(return_stats) == 0:
        return final[default_columns]
    return final[return_stats]
    

In [95]:
# final_analysis file with all significant variants for a drug
res_df = pd.read_csv(os.path.join(analysis_dir, drug, "final_analysis.csv"))

In [96]:
res_df

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,Odds_Ratio,OR_LB,...,PPV_UB,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,predicted_effect
0,ddn_lof,0.2951,0.1698,0.3745,0.0000,0.0000,0.0000,,1.3432,1.1851,...,0.9500,0.0446,0.1092,0.9991,0.9999,73.3231,852.3245,0.8908,0.9559,lof
1,fgd1_lof,0.1563,-0.0000,0.2080,0.0041,0.0332,0.2990,,1.1692,1.0000,...,1.0000,0.0000,0.0194,1.0000,1.0000,inf,inf,0.9806,1.0000,lof
2,ddn_p.Val127Phe,0.1547,-0.0273,0.2292,0.0025,0.0299,0.1792,,1.1673,0.9731,...,1.0000,0.0000,0.0263,0.9996,1.0000,0.0000,inf,0.9739,1.0002,missense_variant
3,ddn_p.Leu49Pro,0.1377,0.1170,0.1530,0.0000,0.0000,0.0000,2) Assoc w R - Interim,1.1476,1.1241,...,1.0000,0.0210,0.0723,0.9996,1.0000,75.1657,inf,0.9278,0.9792,missense_variant
4,fbiB_p.Lys448Arg,0.1318,-0.0342,0.2785,0.0537,0.1189,1.0000,3) Uncertain significance,1.1409,0.9664,...,0.1667,0.0410,0.1087,0.9853,0.9895,3.2246,9.1374,0.9028,0.9713,missense_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,fbiB_p.Leu447Arg,-0.0649,-0.0788,-0.0504,0.0000,0.0000,0.0000,3) Uncertain significance,0.9371,0.9242,...,,,,,,,,,,missense_variant
954,fgd1_c.465C>T,-0.0783,-0.0901,-0.0657,0.0000,0.0000,0.0000,,0.9247,0.9139,...,,,,,,,,,,synonymous_variant
955,fbiA_p.Arg304Gln,-0.0799,-0.1325,0.0000,0.0480,0.1129,1.0000,3) Uncertain significance,0.9232,0.8759,...,,,,,,,,,,missense_variant
956,ndh_lof,-0.1093,-0.1496,-0.0602,0.0000,0.0000,0.0001,,0.8964,0.8610,...,,,,,,,,,,lof


In [97]:
res_df.columns

Index(['orig_variant', 'coef', 'coef_LB', 'coef_UB', 'pval', 'BH_pval',
       'Bonferroni_pval', 'confidence_WHO_2021', 'Odds_Ratio', 'OR_LB',
       'OR_UB', 'Tier', 'Phenos', 'unpooled', 'Num_Isolates', 'Total_Isolates',
       'TP', 'FP', 'TN', 'FN', 'PPV', 'Sens', 'Spec', 'LR+', 'LR-', 'PPV_LB',
       'PPV_UB', 'Sens_LB', 'Sens_UB', 'Spec_LB', 'Spec_UB', 'LR+_LB',
       'LR+_UB', 'LR-_LB', 'LR-_UB', 'predicted_effect'],
      dtype='object')

In [99]:
res_df.loc[pd.isnull(res_df["Sens"])]

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,Odds_Ratio,OR_LB,...,PPV_UB,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,predicted_effect
11,PC4,0.1016,-0.0728,0.2469,0.1077,0.1228,1.0,,1.1069,0.9298,...,,,,,,,,,,
18,PC2,0.0605,-0.119,0.2391,0.2571,0.2844,1.0,,1.0624,0.8878,...,,,,,,,,,,
71,PC3,0.03,-0.111,0.1728,0.3431,0.3738,1.0,,1.0305,0.895,...,,,,,,,,,,
88,PC1,-0.0039,-0.1845,0.1639,0.4829,0.5109,1.0,,0.9961,0.8316,...,,,,,,,,,,
957,PC0,-0.2291,-0.4017,-0.0775,0.0026,0.0274,0.1919,,0.7953,0.6692,...,,,,,,,,,,


In [None]:
# Drop duplicated columns left behind by a repeated merge (pandas appends "_x" / "_y" to clashing names).
# The pattern is anchored to the suffix: the previous pattern "_x|y" removed every column whose name
# merely contained the letter "y".
res_df = res_df.loc[:, ~res_df.columns.str.contains(r"_[xy]$", regex=True)]


In [56]:
# Number of bootstrap replicates announced by the print statement below. It is defined here so that this
# cell also runs on a fresh kernel; the bootstrap cell further down sets the same value.
num_bootstrap = 2

# final_analysis file with all significant variants for a drug
res_df = pd.read_csv(os.path.join(analysis_dir, drug, "final_analysis.csv"))

df_phenos = pd.read_csv(os.path.join(analysis_dir, drug, "phenos_binary.csv"))
df_genos = pd.read_csv(os.path.join(analysis_dir, drug, "genos.csv.gz"), compression="gzip")
df_genos["orig_variant"] = df_genos["resolved_symbol"] + "_" + df_genos["variant_category"]
df_copy = df_genos.copy()

# pool LOF and inframe mutations
df_copy.loc[df_copy["predicted_effect"].isin(["frameshift", "start_lost", "stop_gained", "feature_ablation"]), ["variant_category", "position"]] = ["lof", np.nan]
df_copy.loc[df_copy["predicted_effect"].isin(["inframe_insertion", "inframe_deletion"]), ["variant_category", "position"]] = ["inframe", np.nan]

# update the orig_variant column using the new variant categories (lof and inframe) and combine the pooled and unpooled variants.
# Sorting in descending order before drop_duplicates keeps, per sample and pooled variant, the row with the
# highest binary status / allele frequency.
df_copy["orig_variant"] = df_copy["resolved_symbol"] + "_" + df_copy["variant_category"]
df_pooled = df_copy.query("variant_category in ['lof', 'inframe']").sort_values(by=["variant_binary_status", "variant_allele_frequency"], ascending=False, na_position="last").drop_duplicates(subset=["sample_id", "orig_variant"], keep="first")
del df_copy
df_genos_full = pd.concat([df_genos, df_pooled], axis=0)
del df_genos
del df_pooled

# keep only variants that are in the final_analysis dataframe and drop NaNs (NaNs = either isolate didn't pass QC or it's a Het)
# We can't process Hets here because they need to be binary to have univariate statistics
df_genos_full = df_genos_full.loc[(df_genos_full["orig_variant"].isin(res_df["orig_variant"].values))].dropna(subset=["variant_binary_status"])

# check that the only variants that are in res_df but not in df_genos_full are the principal components.
# Done in plain Python so the check also works when nothing at all is missing.
missing_variants = set(res_df["orig_variant"]) - set(df_genos_full["orig_variant"])
if any("PC" not in str(variant) for variant in missing_variants):
    raise ValueError("Variants are missing from df_genos_full!")

combined = df_genos_full.pivot(index="sample_id", columns="orig_variant", values="variant_binary_status")

# predicted effect annotations for later
annotated_genos = df_genos_full.query("variant_category not in ['lof', 'inframe']").drop_duplicates(["orig_variant", "predicted_effect"])
del df_genos_full
combined = combined.merge(df_phenos[["sample_id", "phenotype"]], left_index=True, right_on="sample_id").reset_index(drop=True)

# get dataframe of predictive values for the non-zero coefficients and add them to the results dataframe.
# If final_analysis.csv already carries these statistics from an earlier run, drop the stale copies first;
# otherwise the merge would produce duplicated "_x" / "_y" columns.
full_predict_values = compute_predictive_values(combined)
stale_stat_columns = [col for col in full_predict_values.columns if col != "orig_variant" and col in res_df.columns]
res_df = res_df.drop(columns=stale_stat_columns).merge(full_predict_values, on="orig_variant", how="outer")

print(f"Computing and bootstrapping predictive values with {num_bootstrap} replicates")
# Can't compute univariate stats for PCs. For tractability, only compute bootstrap stats for variants with positive coefficients.
# The stats for variants with negative coefficients are often edge numbers and not informative, so this saves time.
keep_variants = list(res_df.loc[(~res_df["orig_variant"].str.contains("PC")) & (res_df["coef"] > 0)]["orig_variant"].values)

# Remake this dataframe with fewer features because only going to bootstrap stats for variants with positive coefficients.
# check that all samples were preserved.
combined_small = combined[["sample_id", "phenotype"] + keep_variants]
assert len(combined_small) == len(combined)

  df_genos = pd.read_csv(os.path.join(analysis_dir, drug, "genos.csv.gz"), compression="gzip")


Computing and bootstrapping predictive values with 2 replicates


In [57]:
res_df.head()

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,Odds_Ratio,OR_LB,...,Total_Isolates,TP,FP,TN,FN,PPV,Sens,Spec,LR+,LR-
0,ddn_lof,0.2951,0.1698,0.3745,0.0,0.0,0.0,,1.3432,1.1851,...,11803.0,19.0,5.0,11546.0,233.0,0.7917,0.0754,0.9996,174.1817,0.925
1,fgd1_lof,0.1563,-0.0,0.208,0.0041,0.0332,0.299,,1.1692,1.0,...,11803.0,2.0,0.0,11551.0,250.0,1.0,0.0079,1.0,inf,0.9921
2,ddn_p.Val127Phe,0.1547,-0.0273,0.2292,0.0025,0.0299,0.1792,,1.1673,0.9731,...,11803.0,3.0,2.0,11549.0,249.0,0.6,0.0119,0.9998,68.756,0.9883
3,ddn_p.Leu49Pro,0.1377,0.117,0.153,0.0,0.0,0.0,2) Assoc w R - Interim,1.1476,1.1241,...,11803.0,11.0,2.0,11549.0,241.0,0.8462,0.0437,0.9998,252.1052,0.9565
4,fbiB_p.Lys448Arg,0.1318,-0.0342,0.2785,0.0537,0.1189,1.0,3) Uncertain significance,1.1409,0.9664,...,11803.0,19.0,146.0,11405.0,233.0,0.1152,0.0754,0.9874,5.9651,0.9364


In [58]:
num_bootstrap = 2

# Fixed seed so the bootstrap confidence intervals are reproducible from run to run.
BOOTSTRAP_SEED = 0
rng = np.random.default_rng(BOOTSTRAP_SEED)

# need confidence intervals for 5 stats: PPV, sens, spec, + likelihood ratio, - likelihood ratio
bootstrap_stats = ["PPV", "Sens", "Spec", "LR+", "LR-"]

# the feature columns of combined_small must be exactly keep_variants, in order (checked once, up front)
assert list(combined_small.columns[2:]) == list(keep_variants)

num_samples = len(combined_small)
bs_frames = []

for i in range(num_bootstrap):

    # get bootstrap sample: draw row positions with replacement
    bs_idx = rng.integers(0, num_samples, size=num_samples)
    bs_combined = combined_small.iloc[bs_idx, :]

    # get predictive values from the dataframe of bootstrapped samples. Only return the 5 we want CI for, and the variant
    bs_values = compute_predictive_values(bs_combined, return_stats=["orig_variant"] + bootstrap_stats)

    # one row per statistic, one column per variant
    bs_frames.append(bs_values.set_index("orig_variant").T)

# Concatenate once instead of growing a dataframe inside the loop, then restore the keep_variants column order.
if len(bs_frames) > 0:
    bs_results = pd.concat(bs_frames, axis=0).reindex(columns=keep_variants)
    bs_results.columns.name = None
else:
    bs_results = pd.DataFrame(columns=keep_variants)

# ensure everything is float because had some issues with np.nanpercentile giving an error about incompatible data types
bs_results = bs_results.astype(float)
if len(bs_results.index.unique()) != 5:
    print(bs_results.index.unique())

In [59]:
res_df.head()

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,Odds_Ratio,OR_LB,...,Total_Isolates,TP,FP,TN,FN,PPV,Sens,Spec,LR+,LR-
0,ddn_lof,0.2951,0.1698,0.3745,0.0,0.0,0.0,,1.3432,1.1851,...,11803.0,19.0,5.0,11546.0,233.0,0.7917,0.0754,0.9996,174.1817,0.925
1,fgd1_lof,0.1563,-0.0,0.208,0.0041,0.0332,0.299,,1.1692,1.0,...,11803.0,2.0,0.0,11551.0,250.0,1.0,0.0079,1.0,inf,0.9921
2,ddn_p.Val127Phe,0.1547,-0.0273,0.2292,0.0025,0.0299,0.1792,,1.1673,0.9731,...,11803.0,3.0,2.0,11549.0,249.0,0.6,0.0119,0.9998,68.756,0.9883
3,ddn_p.Leu49Pro,0.1377,0.117,0.153,0.0,0.0,0.0,2) Assoc w R - Interim,1.1476,1.1241,...,11803.0,11.0,2.0,11549.0,241.0,0.8462,0.0437,0.9998,252.1052,0.9565
4,fbiB_p.Lys448Arg,0.1318,-0.0342,0.2785,0.0537,0.1189,1.0,3) Uncertain significance,1.1409,0.9664,...,11803.0,19.0,146.0,11405.0,233.0,0.1152,0.0754,0.9874,5.9651,0.9364


In [61]:
res_df.columns

Index(['orig_variant', 'coef', 'coef_LB', 'coef_UB', 'pval', 'BH_pval',
       'Bonferroni_pval', 'confidence_WHO_2021', 'Odds_Ratio', 'OR_LB',
       'OR_UB', 'Tier', 'Phenos', 'unpooled', 'synonymous', 'Num_Isolates',
       'Total_Isolates', 'TP', 'FP', 'TN', 'FN', 'PPV', 'Sens', 'Spec', 'LR+',
       'LR-'],
      dtype='object')

In [60]:
bs_results.head()

Unnamed: 0,ddn_lof,fgd1_lof,ddn_p.Val127Phe,ddn_p.Leu49Pro,fbiB_p.Lys448Arg,fbiC_lof,fbiA_lof,fgd1_inframe,fbiB_inframe,ddn_inframe,...,fbiC_c.2427C>G,fbiB_c.1335G>A,fbiA_p.Asn314Lys,ddn_p.Trp20*,fbiB_c.90C>T,fbiA_p.Leu104Phe,fbiA_c.337C>T,fbiA_p.Ala206Thr,fgd1_p.Val170Met,fbiA_c.15T>C
PPV,0.8889,1.0,1.0,0.75,0.1446,0.1111,0.6,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.4286,1.0,0.0833,,0.0045,0.0204
Sens,0.096,0.004,0.016,0.048,0.096,0.004,0.012,0.008,0.008,0.016,...,0.0,0.02,0.004,0.012,0.012,0.004,0.02,0.0,0.004,0.004
Spec,0.9997,1.0,1.0,0.9997,0.9877,0.9993,0.9998,1.0,1.0,1.0,...,0.9996,1.0,1.0,1.0,0.9997,1.0,0.9952,1.0,0.9809,0.9958
LR+,369.696,inf,inf,138.636,7.8105,5.7765,69.318,inf,inf,inf,...,0.0,inf,inf,inf,34.659,inf,4.2011,,0.2091,0.9628
LR-,0.9042,0.996,0.984,0.9523,0.9152,0.9967,0.9882,0.992,0.992,0.984,...,1.0004,0.98,0.996,0.988,0.9883,0.996,0.9847,1.0,1.0154,1.0002


In [63]:
# NOTE(review): `final_res` is created by the merge in the cell shown directly below (execution count 62),
# so on a fresh kernel this display cell only works after that cell has been run.
final_res

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,Odds_Ratio,OR_LB,...,LR+_UB,LR-_LB,LR-_UB,sample_id,resolved_symbol,variant_category,predicted_effect,variant_allele_frequency,variant_binary_status,position
0,ddn_lof,0.2951,0.1698,0.3745,0.0000,0.0000,0.0000,,1.3432,1.1851,...,364.7908,0.9048,0.9247,,,,lof,,,
1,fgd1_lof,0.1563,-0.0000,0.2080,0.0041,0.0332,0.2990,,1.1692,1.0000,...,inf,0.9960,0.9962,,,,lof,,,
2,ddn_p.Val127Phe,0.1547,-0.0273,0.2292,0.0025,0.0299,0.1792,,1.1673,0.9731,...,inf,0.9841,0.9887,830305.0000,ddn,p.Val127Phe,missense_variant,0.0000,0.0000,3987222
3,ddn_p.Leu49Pro,0.1377,0.1170,0.1530,0.0000,0.0000,0.0000,2) Assoc w R - Interim,1.1476,1.1241,...,257.1932,0.9331,0.9518,374974.0000,ddn,p.Leu49Pro,missense_variant,0.0000,0.0000,3986989
4,fbiB_p.Lys448Arg,0.1318,-0.0342,0.2785,0.0537,0.1189,1.0000,3) Uncertain significance,1.1409,0.9664,...,7.7711,0.9156,0.9293,23145.0000,fbiB,p.Lys448Arg,missense_variant,0.0000,0.0000,3642877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,fbiB_p.Leu447Arg,-0.0649,-0.0788,-0.0504,0.0000,0.0000,0.0000,3) Uncertain significance,0.9371,0.9242,...,,,,7009.0000,fbiB,p.Leu447Arg,missense_variant,0.0000,0.0000,3642874
954,fgd1_c.465C>T,-0.0783,-0.0901,-0.0657,0.0000,0.0000,0.0000,,0.9247,0.9139,...,,,,6993.0000,fgd1,c.465C>T,synonymous_variant,0.0000,0.0000,491247
955,fbiA_p.Arg304Gln,-0.0799,-0.1325,0.0000,0.0480,0.1129,1.0000,3) Uncertain significance,0.9232,0.8759,...,,,,6852.0000,fbiA,p.Arg304Gln,missense_variant,0.0000,0.0000,3641453
956,ndh_lof,-0.1093,-0.1496,-0.0602,0.0000,0.0000,0.0001,,0.8964,0.8610,...,,,,,,,lof,,,


In [62]:
# add the confidence intervals to the dataframe
for variable in ["PPV", "Sens", "Spec", "LR+", "LR-"]:

    lb_col, ub_col = f"{variable}_LB", f"{variable}_UB"

    # Select with a list so the result stays 2-D (replicates x variants) even with a single replicate.
    replicate_values = bs_results.loc[[variable], :].to_numpy(dtype=float)
    lower, upper = np.nanpercentile(replicate_values, q=[2.5, 97.5], axis=0)

    # LR+ can be infinite if spec is 1, and after percentile, it will be NaN, so replace with infinity
    # NOTE(review): the fillna below also turns a missing LR+ (e.g. rows without univariate stats, or 0/0)
    # into infinity -- confirm that this is intended.
    if variable == "LR+":
        res_df[variable] = res_df[variable].fillna(np.inf)
        lower[np.isnan(lower)] = np.inf
        upper[np.isnan(upper)] = np.inf

    bounds = pd.DataFrame({"orig_variant": bs_results.columns, lb_col: lower, ub_col: upper})

    # Drop bounds left over from an earlier run (or already present in the CSV) so that re-running this
    # cell replaces them instead of creating duplicated "_x" / "_y" columns.
    res_df = res_df.drop(columns=[lb_col, ub_col], errors="ignore").merge(bounds, on="orig_variant", how="outer")

    # A sanity check (lower bound <= point estimate <= upper bound) is deliberately not asserted here:
    # numerical precision can make it fail.

# get effect annotations and merge them with the results dataframe.
# Columns of res_df that annotated_genos also provides are dropped first so the merge cannot create
# suffixed duplicates of them.
overlapping_columns = [col for col in annotated_genos.columns if col != "orig_variant" and col in res_df.columns]
final_res = res_df.drop(columns=overlapping_columns).merge(annotated_genos, on="orig_variant", how="outer")

# keep only rows that belong to the results table; copy so the assignments below write to an independent frame
final_res = final_res.loc[~pd.isnull(final_res["coef"])].copy()
final_res.loc[final_res["orig_variant"].str.contains("lof"), "predicted_effect"] = "lof"
final_res.loc[final_res["orig_variant"].str.contains("inframe"), "predicted_effect"] = "inframe"

  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
  diff_b_a = subtract(b, a)
  subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t>=0.5)
  result = np.apply_along_axis(_nanquantile_1d, axis, a, q,


# Combined Analysis Files

In [None]:
def final_processing(drug, analysis_dir="/n/data1/hms/dbmi/farhat/ye12/who/analysis"):
    '''
    Functions for processing outputs before sending to everyone else.

    Reads <analysis_dir>/<drug>/final_analysis.csv and returns a cleaned copy:

    1. Remove principal components (will describe them separately)
    2. Add LOF to the predicted_effect column for pooled LOF mutations
    3. Remove genome_index column (should actually do that earlier, but will fix later)
    4. Remove the logistic regression coefficient columns (they will prefer to work with odds ratios)
    5. Any other column renaming or dropping for clarity

    Arguments:
        drug: name of the drug sub-directory to read.
        analysis_dir: directory containing one sub-directory per drug. Defaults to the shared analysis directory.

    Returns:
        DataFrame with one row per mutation and the columns in the fixed order listed at the end of the function.

    Raises:
        AssertionError if a mutation has no predicted effect, or a non-LOF mutation has no position.
        KeyError if one of the expected output columns is missing from the file.
    '''

    analysis_df = pd.read_csv(os.path.join(analysis_dir, drug, "final_analysis.csv"))
    analysis_df = analysis_df.rename(columns={"orig_variant": "mutation", "Tier1_only": "Tier", "WHO_phenos": "Phenos"})

    # remove logReg coefficients. Keep only odds ratios. Remove the other two columns, which were present mainly for me to see
    # if we were picking up many mutations that were in the 2021 mutation catalog.
    # errors="ignore": these columns may already have been removed upstream.
    analysis_df = analysis_df.drop(columns=["genome_index", "confidence_WHO_2021"], errors="ignore")
    analysis_df = analysis_df.loc[:, ~analysis_df.columns.str.contains("coef")]

    # remove significant principal components and replace the NaNs in the predicted effect column for the gene loss of functions.
    # copy() so that the assignments below write to an independent frame rather than to a slice.
    analysis_df = analysis_df.loc[~analysis_df["mutation"].str.contains("PC")].copy()
    analysis_df.loc[analysis_df["mutation"].str.contains("lof"), "predicted_effect"] = "LOF"

    # predicted effect should not be NaN for anything. position is NaN only for the pooled LOF mutations
    assert len(analysis_df.loc[pd.isnull(analysis_df["predicted_effect"])]) == 0
    assert len(analysis_df.loc[(~analysis_df["mutation"].str.contains("lof")) & (pd.isnull(analysis_df["position"]))]) == 0

    # recode the 0/1 flags: Tier 1 -> 1, 0 -> 2; Phenos 1 -> "WHO", 0 -> "ALL"
    analysis_df["Tier"] = analysis_df["Tier"].map({1: 1, 0: 2})
    analysis_df["Phenos"] = analysis_df["Phenos"].map({1: "WHO", 0: "ALL"})

    # reorder columns
    analysis_df = analysis_df[['mutation', 'predicted_effect', 'position', 'Odds_Ratio', 'OR_LB', 'OR_UB', 'pval', 'BH_pval', 'Bonferroni_pval',
       'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN', 'Sens_LB', 'Sens', 'Sens_UB', 'Spec_LB', 'Spec', 'Spec_UB', 'PPV', 'PPV_LB', 'PPV_UB',
       'LR+_LB', 'LR+', 'LR+_UB', 'LR-_LB', 'LR-', 'LR-_UB', 'Tier', 'Phenos', 'poolLOF', 'Syn']].copy()

    analysis_df[['poolLOF', "Syn"]] = analysis_df[['poolLOF', "Syn"]].astype(int)

    return analysis_df

In [None]:
# NOTE: this rebinds analysis_dir, which the first cell of the notebook sets to a different directory.
analysis_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis"

# A drug counts as finished once its directory contains final_analysis.csv.
# The comprehension keeps its loop variable local, so the notebook-level `drug` setting is not overwritten,
# and sorting makes the result independent of the arbitrary order returned by os.listdir.
finished_drugs = sorted(
    drug_name
    for drug_name in os.listdir(analysis_dir)
    if os.path.isfile(os.path.join(analysis_dir, drug_name, "final_analysis.csv"))
)

In [None]:
print(finished_drugs)

In [None]:
# those that are actually done
finished_drugs = ['Pyrazinamide', 'Amikacin', 'Clofazimine', 'Linezolid', 'Moxifloxacin', 'Kanamycin', 'Bedaquiline', 'Capreomycin', 'Delamanid']

# processed results table per drug, keyed by drug name (in the order listed above).
# Built with a comprehension so the notebook-level `drug` setting is not overwritten by a loop variable.
drug_analyses = {drug_name: final_processing(drug_name) for drug_name in finished_drugs}

# write results to an Excel file, where each sheet is named for a drug
with pd.ExcelWriter("Farhat_logReg_analysis.xlsx") as writer:

    for sheet_name, analysis_table in drug_analyses.items():
        analysis_table.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# drug_analyses is a dict keyed by drug name, so an integer key raises KeyError; go through finished_drugs
# to keep the positional lookup.
# NOTE(review): position 4 of the hard-coded finished_drugs list is 'Moxifloxacin' -- confirm that is the intended drug.
drug_analyses[finished_drugs[4]].query("Tier==2&Phenos=='WHO'")

In [None]:
# drug_analyses is a dict keyed by drug name, so an integer key raises KeyError; go through finished_drugs
# to keep the positional lookup.
# NOTE(review): position 0 of the hard-coded finished_drugs list is 'Pyrazinamide' -- confirm that is the intended drug.
drug_analyses[finished_drugs[0]].query("Tier==1&Phenos=='ALL'")

In [None]:
# Synonymous variants of the first finished drug. The dangling "&" at the end of the expression (which made
# the query invalid) is removed, and the dict is indexed by drug name instead of by an integer.
# NOTE(review): a second filter condition may have been intended after the "&" -- add it here if so.
drug_analyses[finished_drugs[0]].query("predicted_effect=='synonymous_variant'")

In [None]:
# Rows of the Moxifloxacin table without a genomic position.
# NOTE(review): `moxi_df` is not assigned anywhere in the visible notebook -- presumably it is the
# Moxifloxacin entry of `drug_analyses`; confirm and define it before this cell.
moxi_df.loc[pd.isnull(moxi_df["position"])]

# Catalog-Based Method Comparison

In [None]:
def get_logReg_summary(out_dir):
    '''
    Load the model-level summary and the per-variant analysis stored in out_dir and report which
    variants, if any, reach at least the sensitivity AND the specificity in the first row of the summary.

    Reads logReg_summary.csv and model_analysis.csv from out_dir. Prints the matching variants, or a
    message when there are none, and returns both tables as (model_summary, model_analysis).
    '''
    model_summary, model_analysis = [
        pd.read_csv(os.path.join(out_dir, file_name))
        for file_name in ("logReg_summary.csv", "model_analysis.csv")
    ]

    # thresholds come from the first row of the summary; the query string below refers to these two names via "@"
    summary_sens, summary_spec = [model_summary.loc[0, stat] for stat in ("Sens", "Spec")]

    better_variant = model_analysis.query("Sens >= @summary_sens & Spec >= @summary_spec")

    print("No variants have comparable sensitivity AND specificity" if len(better_variant) == 0 else better_variant)

    return model_summary, model_analysis

In [None]:
#out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Levofloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn"

summary, analysis = get_logReg_summary(out_dir)
summary

In [None]:
analysis.query("Sens > 0.9006")

In [None]:
analysis.loc[analysis["orig_variant"].str.contains("PC")]

In [None]:
analysis.query("Spec > 0.9575")

In [None]:
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"

get_logReg_summary(out_dir)

In [None]:
df_pza = pd.read_csv("/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn/phenos.csv")

# NOTE(review): read_pickle can execute arbitrary code while loading -- only use it on files from a trusted source.
lineages = pd.read_pickle("../data/combined_lineage_sample_IDs.pkl")

# Keep only the columns used below; copy() so the assignments write to an independent frame, not to a slice.
lineages = lineages[["Sample Name", "Sample ID", "Lineage_1"]].copy()

# the missing ones might be M. cannettii, most similar to L6 based on the other lineage callers
lineages["Lineage_1"] = lineages["Lineage_1"].fillna("6")

# top-level lineage = everything before the first "." of the detailed call
lineages["Lineage"] = [str(val).split(".")[0] for val in lineages["Lineage_1"].values]
lineages.loc[lineages["Lineage"].str.contains("BOV"), "Lineage"] = "M. bovis"

assert len(lineages.loc[pd.isnull(lineages["Lineage"])]) == 0

########## KEEP ONLY ISOLATES WITH ALL 3 PIECES OF DATA ##########

# get only isolates with data for everything: SNP matrix, in the model, and lineages
combined = lineages.merge(df_pza, left_on="Sample ID", right_on="sample_id")

In [None]:
combined.groupby("Lineage")["phenotype"].mean().sort_values(ascending=False)

In [None]:
combined.query("Lineage == 'M. bovis'").Lineage_1.unique()