In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 125
import seaborn as sns
from Bio import SeqIO, Seq
import scipy.stats as st
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import glob, os, yaml, subprocess, itertools, sparse, pickle

# Table read from WHO_resistance_variants_all.csv (absolute cluster path).
# NOTE(review): not referenced by any cell visible in this part of the notebook.
who_variants = pd.read_csv("/n/data1/hms/dbmi/farhat/Sanjana/MIC_data/WHO_resistance_variants_all.csv")
# Render every float in displayed DataFrames/Series with exactly 4 decimal places
pd.set_option('display.float_format', lambda x: '%.4f' % x)
# Per-sample summary table, read relative to the notebook's working directory.
# NOTE(review): also not referenced by any cell visible here.
samples_summary = pd.read_csv("../data/samples_summary.csv")
# Root directory holding one sub-directory per drug; final_processing() reads
# <analysis_dir>/<drug>/final_analysis.csv from it
analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'

# Combined Analysis Files

In [34]:
def final_processing(drug, base_dir=None):
    '''
    Clean one drug's final_analysis.csv before sharing it with collaborators.

    Steps:
    1. Rename the "orig_variant" column to "mutation".
    2. Drop the "confidence_WHO_2021" column and every column whose name contains "coef"
       (the logistic regression coefficients; collaborators prefer to work with odds ratios).
    3. Remove the principal component rows (PC0, PC1, ...); they will be described separately.
    4. Check that predicted_effect and the isolate-count columns have no missing values,
       then cast the count columns to int.
    5. Select and reorder the output columns. Any column not in the final list
       (e.g. the unnamed index column written by to_csv) is dropped here.

    Parameters
    ----------
    drug : str
        Name of the drug sub-directory that contains final_analysis.csv.
    base_dir : str, optional
        Directory containing the per-drug sub-directories. Defaults to the
        notebook-level `analysis_dir`.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. The row index of the input file is kept (not reset).

    Raises
    ------
    KeyError
        If an expected column is missing from the input file.
    AssertionError
        If predicted_effect or any isolate-count column contains missing values.
    '''

    if base_dir is None:
        base_dir = analysis_dir

    count_cols = ['Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN']

    final_cols = ['mutation', 'predicted_effect', 'OR_LB', 'Odds_Ratio', 'OR_UB', 'pval', 'BH_pval', 'Bonferroni_pval',
                  'Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN', 'PPV_LB', 'PPV', 'PPV_UB', 'Sens_LB', 'Sens', 'Sens_UB', 'Spec_LB', 'Spec', 'Spec_UB',
                  'LR+_LB', 'LR+', 'LR+_UB', 'LR-_LB', 'LR-', 'LR-_UB', 'Tier', 'Phenos', 'unpooled', 'synonymous']

    # confidence_WHO_2021 was present mainly to see whether we were picking up many mutations
    # that were in the 2021 mutation catalog; it is not part of the shared output
    analysis_df = (
        pd.read_csv(os.path.join(base_dir, drug, "final_analysis.csv"))
        .rename(columns={"orig_variant": "mutation"})
        .drop(columns=["confidence_WHO_2021"])
    )

    # remove logReg coefficients. Keep only odds ratios
    analysis_df = analysis_df.loc[:, ~analysis_df.columns.str.contains("coef")]

    # remove principal components. Match the whole name ("PC" followed by digits) so that a
    # real mutation whose name merely contains "PC" is not silently dropped
    is_principal_component = analysis_df["mutation"].str.match(r"^PC\d+$")
    analysis_df = analysis_df.loc[~is_principal_component]

    # predicted effect should not be NaN for anything that is left
    assert analysis_df["predicted_effect"].notna().all(), "predicted_effect contains missing values"

    # counts must be complete before they can be cast to int
    assert analysis_df[count_cols].notna().all().all(), "isolate count columns contain missing values"

    # astype returns a new frame, so nothing is assigned into a slice (no SettingWithCopyWarning)
    analysis_df = analysis_df.astype({col: int for col in count_cols})

    # reorder columns
    return analysis_df[final_cols]

In [40]:
# Drugs whose analyses are actually finished, sorted alphabetically
finished_drugs = np.sort([
    'Capreomycin', 'Amikacin', 'Kanamycin', 'Ethionamide', 'Levofloxacin',
    'Clofazimine', 'Linezolid', 'Moxifloxacin', 'Bedaquiline', 'Delamanid',
])

# One cleaned results table per finished drug, keyed by drug name
drug_analyses = {drug_name: final_processing(drug_name) for drug_name in finished_drugs}

# Export to a single Excel workbook in which each sheet is named for a drug
with pd.ExcelWriter("Farhat_logReg_analysis.xlsx") as workbook:
    for sheet_label, drug_table in drug_analyses.items():
        drug_table.to_excel(workbook, sheet_name=sheet_label, index=False)

In [55]:
# Load the raw final_analysis.csv for a single drug, without going through
# final_processing(), so the table can be inspected as written on disk
drug = "Delamanid"
res_df = pd.read_csv(os.path.join(analysis_dir, drug, "final_analysis.csv"))

In [56]:
# Display the table exactly as read from final_analysis.csv (pandas truncates long frames)
res_df

Unnamed: 0,orig_variant,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,confidence_WHO_2021,Odds_Ratio,OR_LB,OR_UB,Tier,Phenos,unpooled,synonymous
0,ddn_lof,0.1808,0.1573,0.2015,0.0000,0.0000,0.0000,,1.1982,1.1704,1.2233,2,ALL,0,0
1,PC0,0.1553,0.0518,0.2549,0.0016,0.0123,0.9491,,1.1680,1.0532,1.2903,2,ALL,1,0
2,PC1,0.1460,0.0304,0.2356,0.0024,0.0157,1.0000,,1.1571,1.0308,1.2657,2,ALL,1,0
3,ddn_p.Leu49Pro,0.1377,0.1170,0.1530,0.0000,0.0000,0.0000,2) Assoc w R - Interim,1.1476,1.1241,1.1654,2,ALL,1,0
4,PC4,0.1373,0.0372,0.2337,0.0045,0.0255,1.0000,,1.1472,1.0379,1.2633,2,ALL,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,fgd1_c.774G>A,-0.0641,-0.0745,-0.0505,0.0000,0.0000,0.0000,,0.9379,0.9282,0.9507,2,ALL,0,1
954,fbiC_c.967C>T,-0.0646,-0.0758,-0.0524,0.0000,0.0000,0.0000,,0.9374,0.9270,0.9490,2,ALL,0,1
955,fbiB_p.Leu447Arg,-0.0649,-0.0788,-0.0504,0.0000,0.0000,0.0000,3) Uncertain significance,0.9371,0.9242,0.9509,2,ALL,1,0
956,fbiC_c.-11G>A,-0.0680,-0.0857,-0.0481,0.0000,0.0000,0.0000,3) Uncertain significance,0.9342,0.9179,0.9530,2,ALL,1,0


# Catalog-Based Method Comparison

In [None]:
def get_logReg_summary(out_dir):
    '''
    Load a model's summary table and per-variant table, and report strong single variants.

    Reads logReg_summary.csv and model_analysis.csv from out_dir. The Sens and Spec values
    in the first row of the summary table are used as thresholds: every row of the
    per-variant table whose Sens AND Spec are both at least as high is printed. If there
    is no such row, a one-line message is printed instead.

    Parameters
    ----------
    out_dir : str
        Directory containing logReg_summary.csv and model_analysis.csv.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The summary table and the complete (unfiltered) per-variant table.
    '''

    model_summary, model_analysis = (
        pd.read_csv(os.path.join(out_dir, file_name))
        for file_name in ("logReg_summary.csv", "model_analysis.csv")
    )

    # Thresholds from the first summary row. These two local names are referenced
    # with "@" inside the query string below, so they must keep these names.
    summary_sens = model_summary.loc[0, "Sens"]
    summary_spec = model_summary.loc[0, "Spec"]

    better_variant = model_analysis.query("Sens >= @summary_sens & Spec >= @summary_spec")

    if not better_variant.empty:
        print(better_variant)
    else:
        print("No variants have comparable sensitivity AND specificity")

    # An earlier version returned only the rows with the maximum Sens and the maximum Spec;
    # the complete per-variant table is returned now.
    return model_summary, model_analysis

In [None]:
# Model output directory to inspect; switch between drugs by (un)commenting one of these lines
#out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Levofloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn"

# Prints any variants matching the model's Sens/Spec, and keeps both tables for the cells below
summary, analysis = get_logReg_summary(out_dir)
# Show the model-level summary table
summary

In [None]:
# Variants whose individual sensitivity exceeds 0.9006.
# NOTE(review): 0.9006 is presumably the model-level Sens shown in `summary` — confirm.
analysis.query("Sens > 0.9006")

In [None]:
# Rows whose variant name contains "PC" (this is a substring match;
# presumably intended to select the principal components — confirm)
analysis.loc[analysis["orig_variant"].str.contains("PC")]

In [None]:
# Variants whose individual specificity exceeds 0.9575.
# NOTE(review): 0.9575 is presumably the model-level Spec shown in `summary` — confirm.
analysis.query("Spec > 0.9575")

In [None]:
# Same comparison for the Moxifloxacin model. This rebinds `out_dir`; `summary` and
# `analysis` from the earlier cell are left unchanged because the result is not assigned.
out_dir = "/n/data1/hms/dbmi/farhat/ye12/who/analysis/Moxifloxacin/tiers=1+2/phenos=ALL/dropAF_withSyn"

# As the last expression of the cell, the returned (summary, analysis) tuple is displayed
get_logReg_summary(out_dir)

In [None]:
# Phenotype table for the Pyrazinamide model
df_pza = pd.read_csv("/n/data1/hms/dbmi/farhat/ye12/who/analysis/Pyrazinamide/tiers=1+2/phenos=WHO/dropAF_withSyn/phenos.csv")

# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source
lineages = pd.read_pickle("../data/combined_lineage_sample_IDs.pkl")

# Keep only the columns that are needed. The explicit .copy() makes this an independent frame,
# so the column assignments below do not raise SettingWithCopyWarning.
lineages = lineages[["Sample Name", "Sample ID", "Lineage_1"]].copy()

# the missing ones might be M. cannettii, most similar to L6 based on the other lineage callers
lineages["Lineage_1"] = lineages["Lineage_1"].fillna("6")

# Top-level lineage = everything before the first "." in Lineage_1. The "Lineage" column that
# comes with the pickle is not used: it is rebuilt here from Lineage_1.
lineages["Lineage"] = [str(val).split(".")[0] for val in lineages["Lineage_1"].values]
lineages.loc[lineages["Lineage"].str.contains("BOV"), "Lineage"] = "M. bovis"

assert lineages["Lineage"].notna().all()

########## KEEP ONLY ISOLATES WITH BOTH A LINEAGE CALL AND A PHENOTYPE ##########

# inner merge: only isolates present in both the lineage table and the model's phenotype table remain
combined = lineages.merge(df_pza, left_on="Sample ID", right_on="sample_id")

In [None]:
# Mean of the `phenotype` column per lineage, highest first.
# NOTE(review): presumably the resistant fraction if phenotype is coded 0/1 — confirm.
combined.groupby("Lineage")["phenotype"].mean().sort_values(ascending=False)

In [None]:
# Distinct original Lineage_1 values among the isolates that were relabelled 'M. bovis'
combined.query("Lineage == 'M. bovis'").Lineage_1.unique()