In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 150
import seaborn as sns

import glob, os, yaml, subprocess, itertools
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from statsmodels.distributions.empirical_distribution import ECDF
import sklearn.metrics
from sklearn.decomposition import PCA
import timeit
import scipy.stats as st
import statsmodels
import pickle, yaml, tracemalloc
from scipy.stats import binomtest

analysis_dir = '/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue'
input_data_dir = '/n/data1/hms/dbmi/farhat/ye12/who'
who_variants_combined = pd.read_csv("who_confidence_2021.csv")
lineages = pd.read_csv("../data/lineages.csv", low_memory=False)

import itertools
from stats_utils import *
import warnings
warnings.filterwarnings("ignore")

import vcf, yaml

In [3]:
def _load_unpooled_tier_results(drug, phenos_name, folder, model_prefix, tiers, tier, exclude_mutations=None):
    """Load and merge the ridge regression, LRT, and AUC test results of one unpooled model.

    Parameters
    ----------
    drug, phenos_name, folder, model_prefix : str
        Path components of the model directory below the module-level ``analysis_dir``.
    tiers : str
        Value of the ``tiers=`` path component ("1" or "1+2").
    tier : int
        Label written to the ``Tier`` column of the returned dataframe.
    exclude_mutations : iterable of str, optional
        Mutations to remove from the ridge regression results before merging.

    Returns
    -------
    pandas.DataFrame
        One row per mutation of the ridge regression results, with the LRT and AUC test p-values added.

    Raises
    ------
    AssertionError
        If the ridge regression and LRT results do not cover the same set of mutations.
    """
    model_dir = os.path.join(analysis_dir, drug, folder, f"tiers={tiers}", f"phenos={phenos_name}", model_prefix)

    ################## 1. READ IN RIDGE REGRESSION RESULTS ##################
    # rows whose name contains 'PC' are dropped (presumably the principal component covariates)
    model_permute = pd.read_csv(os.path.join(model_dir, "model_analysis.csv")).query("~mutation.str.contains('PC')")

    if exclude_mutations is not None:
        model_permute = model_permute.loc[~model_permute["mutation"].isin(exclude_mutations)]

    ################## 2. READ IN LRT RESULTS ##################
    LRT_results = pd.read_csv(os.path.join(model_dir, "LRT_results.csv")).rename(columns={"Unnamed: 0": "mutation"})

    # remove the FULL model row, which is the first row
    LRT_results = add_pval_corrections(LRT_results.iloc[1:, ])

    # check that all mutations in the permutation dataframe are in the LRT dataframe. The only difference should be the FULL model row
    mismatched = set(model_permute.mutation).symmetric_difference(LRT_results.mutation)
    assert len(mismatched) == 0, f"{len(mismatched)} mutations differ between the ridge and LRT results for tiers={tiers}"

    # combine results into a single dataframe for easy searching. REMOVE BONFERRONI AND COEFS
    drop_pattern = "|".join(["Bonferroni", "coef"])
    model = model_permute.loc[:, ~model_permute.columns.str.contains(drop_pattern)].merge(
        LRT_results[["mutation", "pval", "BH_pval"]].rename(columns={"pval": "LRT_pval", "BH_pval": "LRT_BH_pval"}),
        on="mutation", how="left"
    )

    ################## 3. READ IN AUC TEST RESULTS ##################
    # read from the same model directory as the ridge and LRT results, so that phenos_name and model_prefix are honored
    auc = pd.read_csv(os.path.join(model_dir, "AUC_test_results.csv")).drop(columns=["Bonferroni_pval"])

    combined_results = model.merge(auc.rename(columns={"pval": "AUC_pval", "BH_pval": "AUC_BH_pval"}),
                                   on="mutation", how="left")

    # the isolate counts are cast to integers
    count_cols = ['Num_Isolates', 'Total_Isolates', 'TP', 'FP', 'TN', 'FN']
    combined_results[count_cols] = combined_results[count_cols].astype(int)

    combined_results["Tier"] = tier
    return combined_results


def combine_unpooled_analyses(drug, phenos_name, folder="BINARY", model_prefix="dropAF_noSyn_unpooled"):
    """Combine the ridge regression, LRT, and AUC test results of the Tier 1 and Tier 1+2 models of a drug.

    Mutations already present in the Tier 1 model are removed from the Tier 1+2 results, so every mutation
    appears once, labelled with Tier 1 or Tier 2. Prints, per tier, the number of mutations with OR_LB > 1
    that are significant in each test (BH-corrected p-value < 0.05 for Tier 1, < 0.01 for Tier 2).

    Parameters
    ----------
    drug : str
        Drug directory name below the module-level ``analysis_dir``.
    phenos_name : str
        Value of the ``phenos=`` path component.
    folder : str, default "BINARY"
        Analysis sub-directory of the drug.
    model_prefix : str, default "dropAF_noSyn_unpooled"
        Model directory name.

    Returns
    -------
    pandas.DataFrame
        Tier 1 rows followed by Tier 2 rows, restricted to a fixed set of columns in a fixed order.
    """
    tier1_combined_results = _load_unpooled_tier_results(drug, phenos_name, folder, model_prefix, tiers="1", tier=1)

    # remove the tier 1 genes from the tier 1+2 model for the purposes of this analysis
    tier2_combined_results = _load_unpooled_tier_results(drug, phenos_name, folder, model_prefix, tiers="1+2", tier=2,
                                                         exclude_mutations=tier1_combined_results["mutation"])

    # columns to return, in the desired order
    keep_cols = ['mutation', 'Tier', 'predicted_effect', 'position', 'confidence', 'Odds_Ratio',
                 'OR_LB', 'OR_UB', 'pval', 'BH_pval', 'LRT_pval', 'LRT_BH_pval',
                 'AUC_diff', 'AUC_pval', 'AUC_BH_pval',
                 'Num_Isolates', 'Total_Isolates',
                 'TP', 'FP', 'TN', 'FN', 'PPV', 'NPV', 'Sens', 'Spec', 'LR+', 'LR-',
                 'PPV_LB', 'PPV_UB', 'NPV_LB', 'NPV_UB', 'Sens_LB', 'Sens_UB', 'Spec_LB',
                 'Spec_UB', 'LR+_LB', 'LR+_UB', 'LR-_LB', 'LR-_UB'
                 ]

    # combine Tier 1 and 2 results
    final = pd.concat([tier1_combined_results[keep_cols], tier2_combined_results[keep_cols]], axis=0)

    # Tier 2 mutations are held to a stricter significance threshold than Tier 1 mutations
    for tier, bh_thresh in [(1, 0.05), (2, 0.01)]:
        candidates = final.loc[(final["OR_LB"] > 1) & (final["Tier"] == tier)]

        print(f"Tier {tier}:")
        print(f"    Ridge: {int((candidates['BH_pval'] < bh_thresh).sum())}")
        print(f"    LRT: {int((candidates['LRT_BH_pval'] < bh_thresh).sum())}")
        print(f"    AUC: {int((candidates['AUC_BH_pval'] < bh_thresh).sum())}")

    return final

In [12]:
phenos_name = "WHO"

final_RIF = combine_unpooled_analyses("Rifampicin", phenos_name)

Tier 1:
    Ridge: 86
    LRT: 41
    AUC: 23
Tier 2:
    Ridge: 68
    LRT: 2
    AUC: 1


In [4]:
final_RIF = pd.read_csv("/home/sak0914/RIF_unpooled_results.csv")

In [13]:
final_RIF.query("Tier==1 & OR_LB > 1 & BH_pval > 0.05 & LRT_BH_pval < 0.05")

Unnamed: 0,mutation,Tier,predicted_effect,position,confidence,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,...,NPV_LB,NPV_UB,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB


In [14]:
final_RIF.query("Tier==2 & OR_LB > 1 & BH_pval > 0.01 & LRT_BH_pval < 0.01")

Unnamed: 0,mutation,Tier,predicted_effect,position,confidence,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,...,NPV_LB,NPV_UB,Sens_LB,Sens_UB,Spec_LB,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB


In [7]:
final_RIF.query("AUC_diff<0").AUC_BH_pval.min()

0.8078787878787879

In [11]:
final_RIF.query("AUC_diff<0")[["mutation", "AUC_diff", "AUC_pval", "AUC_BH_pval", "Tier", "confidence"]]

Unnamed: 0,mutation,AUC_diff,AUC_pval,AUC_BH_pval,Tier,confidence
57,rpoB_p.Thr427Ala,-1.140565e-06,0.33,0.809412,1,2) Assoc w R - Interim
95,rpoB_p.Leu430Arg,-5.079705e-06,0.31,0.807879,1,2) Assoc w R - Interim
815,rpoC_p.Val483Gly,-0.000122302,0.06,0.81194,2,
835,rpoC_p.Asn826Thr,-2.549304e-05,0.07,0.81194,2,
838,rpoC_p.Val1039Ala,-1.519988e-07,0.23,0.81194,2,
840,rpoC_p.Leu507Val,-3.74772e-06,0.22,0.81194,2,
846,rpoA_p.Gly31Ser,-3.472222e-06,0.21,0.81194,2,
849,rpoC_p.Phe452Cys,-1.496238e-06,0.21,0.81194,2,
858,rpoC_p.Leu746Val,-1.472488e-07,0.24,0.81194,2,
867,rpoC_p.Val1252Met,-1.543737e-06,0.2,0.81194,2,


# 2. AUC Change Test for Mutations that are Significant in EITHER Ridge or LRT

Tier 1: 86 mutations

Tier 2: 68 mutations

In [75]:
# rif_results = pd.read_excel("../results/BINARY/Rifampicin.xlsx", sheet_name=["Model_3", "Model_7"])

# # mutation renaming, that's all
# set(rif_results["Model_7"].mutation) - set(final.mutation)
# set(final.query("Tier==2").mutation) - set(rif_results["Model_7"].mutation)

In [221]:
# Number of Tier 1 mutations with OR_LB > 1 whose BH-corrected p-value is below the threshold
# in the ridge regression (BH_pval), the LRT (LRT_BH_pval), and the AUC test (AUC_BH_pval), in that order.
# NOTE(review): in the visible notebook, tier1_combined_results is only assigned inside
# combine_unpooled_analyses, so this cell presumably relies on a variable left in the kernel — confirm.
thresh = 0.05
print(len(tier1_combined_results.query("OR_LB > 1 & BH_pval < @thresh")))
print(len(tier1_combined_results.query("OR_LB > 1 & LRT_BH_pval < @thresh")))
print(len(tier1_combined_results.query("OR_LB > 1 & AUC_BH_pval < @thresh")))

86
41
23


In [35]:
# Number of Tier 2 mutations with OR_LB > 1 whose BH-corrected p-value is below the (stricter) threshold
# in the ridge regression (BH_pval), the LRT (LRT_BH_pval), and the AUC test (AUC_BH_pval), in that order.
# NOTE(review): in the visible notebook, tier2_combined_results is only assigned inside
# combine_unpooled_analyses, so this cell presumably relies on a variable left in the kernel — confirm.
thresh = 0.01
print(len(tier2_combined_results.query("OR_LB > 1 & BH_pval < @thresh")))
print(len(tier2_combined_results.query("OR_LB > 1 & LRT_BH_pval < @thresh")))
print(len(tier2_combined_results.query("OR_LB > 1 & AUC_BH_pval < @thresh")))

68
2
1


In [36]:
tier2_combined_results.query("OR_LB > 1 & LRT_BH_pval < @thresh")

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Num_Isolates,...,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,LRT_pval,LRT_BH_pval,AUC_diff,AUC_pval,AUC_BH_pval
2,rpoC_p.Glu1092Asp,missense_variant,766645,5) Not assoc w R,1.216332,1.139563,1.296843,0.0,0.0,2697,...,0.955703,3.348102,3.887383,0.862922,0.879198,5.033098e-07,0.000716,0.000649,0.1,0.81194
19,Rv2752c_p.Asn30Ser,missense_variant,3066103,3) Uncertain significance,1.105124,1.050165,1.166855,0.0,0.0,12,...,0.999895,0.67069,6.44448,0.999161,1.00022,1.480226e-07,0.000421,0.00022,0.0,0.0


In [37]:
tier2_combined_results.query("OR_LB > 1 & AUC_BH_pval < @thresh")

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Num_Isolates,...,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,LRT_pval,LRT_BH_pval,AUC_diff,AUC_pval,AUC_BH_pval
19,Rv2752c_p.Asn30Ser,missense_variant,3066103,3) Uncertain significance,1.105124,1.050165,1.166855,0.0,0.0,12,...,0.999895,0.67069,6.44448,0.999161,1.00022,1.480226e-07,0.000421,0.00022,0.0,0.0


In [236]:
thresh = 0.05
tier1_combined_results.query("OR_LB > 1 & LRT_BH_pval < @thresh & BH_pval > @thresh")

Unnamed: 0,mutation,predicted_effect,position,confidence,Odds_Ratio,OR_LB,OR_UB,pval,BH_pval,Num_Isolates,...,Spec_UB,LR+_LB,LR+_UB,LR-_LB,LR-_UB,LRT_pval,LRT_BH_pval,AUC_diff,AUC_pval,AUC_BH_pval


In [239]:
thresh = 0.01
tier2_model.query("OR_LB > 1 & LRT_BH_pval < @thresh")[["mutation", "PPV_LB", "PPV_UB"]]

Unnamed: 0,mutation,PPV_LB,PPV_UB
2,rpoC_p.Glu1092Asp,0.615909,0.652616
19,Rv2752c_p.Asn30Ser,0.210945,0.789055


In [240]:
3609-815

2794

In [235]:
thresh = 0.01
print(len(tier2_model.query("OR_LB > 1 & BH_pval < @thresh")))
print(len(tier2_model.query("OR_LB > 1 & LRT_BH_pval < @thresh")))
# print(len(tier1_combined_results.query("OR_LB > 1 & AUC_BH_pval < @thresh")))

68
2


In [227]:
thresh = 0.05
tier1_combined_results.query("OR_LB > 1 & (LRT_BH_pval < @thresh | BH_pval < @thresh)").shape

(86, 38)

In [228]:
set(tier1_combined_results.query("OR_LB > 1 & LRT_BH_pval < @thresh").mutation) - set(tier1_combined_results.query("OR_LB > 1 & BH_pval < @thresh").mutation)

set()

In [231]:
thresh = 0.01
set(tier2_model.query("OR_LB > 1 & LRT_BH_pval < @thresh").mutation) - set(tier2_model.query("OR_LB > 1 & BH_pval < @thresh").mutation)

set()

In [234]:
thresh = 0.01
len(set(tier2_model.query("OR_LB > 1 & BH_pval < @thresh").mutation) - set(tier2_model.query("OR_LB > 1 & LRT_BH_pval < @thresh").mutation))

66

# Final Prediction Model: For Rifampicin, adding Tier 2 genes doesn't significantly improve binary metrics

In [12]:
phenos_name = "WHO"

# cross-validation results of the prediction model built from Tier 1 only
tiers_lst = ["1"]
tier1_cv_file = f"tiers={'+'.join(tiers_lst)}_phenos={phenos_name}_CV_results.csv"
predict_model_tier1 = pd.read_csv(os.path.join(analysis_dir, drug, tier1_cv_file))

# cross-validation results of the prediction model built from Tiers 1 and 2
tiers_lst = ["1", "2"]
tiers12_cv_file = f"tiers={'+'.join(tiers_lst)}_phenos={phenos_name}_CV_results.csv"
predict_model_tiers12 = pd.read_csv(os.path.join(analysis_dir, drug, tiers12_cv_file))

# two-sided t-test without the equal-variance assumption, run column-wise (one test per metric)
metric_ttests = st.ttest_ind(predict_model_tier1,
                             predict_model_tiers12,
                             axis=0,
                             equal_var=False,
                             alternative='two-sided'
                            )

# element 1 of the test result holds the p-values
pd.DataFrame({"Metric": predict_model_tier1.columns, "pval": metric_ttests[1]})

Unnamed: 0,Metric,pval
0,test_roc_auc,0.868314
1,test_accuracy,0.981951
2,test_balanced_accuracy,0.933191
3,test_spec,0.918947
4,test_sens,0.884981


# 3. Lineage-Stratified Analyses

In [163]:
lineage_dir = os.path.join(analysis_dir, drug, "BINARY/lineage_models")
print(os.listdir(lineage_dir))

print(os.listdir(os.path.join(lineage_dir, "L4")))

['L2', 'L3', 'L4', 'L1', 'L2.2.1']
['model.sav', 'regression_coef.csv']


In [241]:
L2_2_1 = pd.read_csv(os.path.join(lineage_dir, "L2.2.1", "model_analysis.csv"))
L2 = pd.read_csv(os.path.join(lineage_dir, "L2", "model_analysis.csv"))
L1 = pd.read_csv(os.path.join(lineage_dir, "L1", "model_analysis.csv"))
L3 = pd.read_csv(os.path.join(lineage_dir, "L3", "model_analysis.csv"))
L4 = pd.read_csv(os.path.join(lineage_dir, "L4", "model_analysis.csv"))

In [187]:
config_file = "../config_files/binary_07.yaml"

# read the config inside a context manager so that the file handle is closed after parsing
with open(config_file) as config_handle:
    kwargs = yaml.safe_load(config_handle)

binary = kwargs["binary"]
tiers_lst = kwargs["tiers_lst"]
synonymous = kwargs["synonymous"]
alpha = kwargs["alpha"]
model_prefix = kwargs["model_prefix"]
pheno_category_lst = kwargs["pheno_category_lst"]
atu_analysis = kwargs["atu_analysis"]
atu_analysis_type = kwargs["atu_analysis_type"]
analysis_dir = kwargs["output_dir"]
num_PCs = kwargs["num_PCs"]
num_bootstrap = kwargs["num_bootstrap"]

if "ALL" in pheno_category_lst:
    phenos_name = "ALL"
else:
    phenos_name = "WHO"

scaler = StandardScaler()

# the lineage must be set BEFORE it is used to build out_dir (and regardless of whether out_dir already exists)
lineage = "2.2.1"

out_dir = os.path.join(analysis_dir, drug, f"BINARY/lineage_models/L{lineage}")
os.makedirs(out_dir, exist_ok=True)

# read in the model matrix of the model specified in the config file, and the binary phenotypes
matrix = pd.read_pickle(os.path.join(analysis_dir, drug, "BINARY", f"tiers={'+'.join(tiers_lst)}", f"phenos={phenos_name}", model_prefix, "model_matrix.pkl"))
df_phenos = pd.read_csv(os.path.join(analysis_dir, drug, "phenos_binary.csv")).set_index("sample_id")

# read in eigenvectors files, which was previously computed, and keep only the desired number of PCs
eigenvec_df = pd.read_csv("../data/eigenvec_10PC.csv", index_col=[0]).iloc[:, :num_PCs]

# keep only the samples that are in this model, then concatenate the eigenvectors to the matrix
eigenvec_df = eigenvec_df.loc[matrix.index]
matrix = matrix.merge(eigenvec_df, left_index=True, right_index=True, how="inner")
print(matrix.shape)

# keep only the specified lineage
if len(lineage) == 1:
    # a single character is matched against the primary lineage
    lineage_samples = lineages.query("Primary_Lineage == @lineage")["Sample_ID"].unique()
else:
    # a longer name is matched exactly against the entries of the comma-separated Lineage column
    lineage_samples = np.unique([sample_id
                                 for sample_id, lineage_str in zip(lineages["Sample_ID"], lineages["Lineage"])
                                 if lineage in lineage_str.split(",")
                                ])

matrix = matrix.loc[matrix.index.isin(lineage_samples)]

# drop the columns that are 0 in every remaining sample
matrix = matrix[matrix.columns[~((matrix == 0).all())]]

matrix = matrix.merge(df_phenos, left_index=True, right_index=True, how="left")
print(matrix.shape)

(30984, 3614)
(7714, 953)


In [194]:
# boolean masks for variant presence/absence and for the binary phenotype (1 = R, 0 = S)
carries_variant = matrix["rpoC_p.Glu1092Asp"] == 1
lacks_variant = matrix["rpoC_p.Glu1092Asp"] == 0
resistant = matrix["phenotype"] == 1
susceptible = matrix["phenotype"] == 0

# confusion-matrix counts of the variant against the phenotype
TP = int((carries_variant & resistant).sum())
FP = int((carries_variant & susceptible).sum())
TN = int((lacks_variant & susceptible).sum())
FN = int((lacks_variant & resistant).sum())

TP, FP, TN, FN

(1708, 986, 2501, 2519)

In [196]:
# 40% of R samples have the variant
TP / (TP + FN)

0.4040690797255737

In [197]:
# 28% of S samples have the variant
FP / (FP + TN)

0.28276455405792944

In [210]:
tp, fp, tn, fn = tier2_model.query("mutation=='rpoC_p.Glu1092Asp'")[["TP", "FP", "TN", "FN"]].values[0].astype(int)
tp, fp, tn, fn

(1711, 986, 19935, 8352)

In [211]:
# 17% of R samples have the variant
tp / (tp + fn)

0.17002881844380405

In [212]:
# 5% of S samples have the variant
fp / (fp + tn)

0.04712967831365614

In [171]:
L2_2_1.query("BH_pval < 0.01 & OR_LB > 1").confidence.value_counts(dropna=False)

NaN                          29
1) Assoc w R                 20
3) Uncertain significance    13
2) Assoc w R - Interim        3
5) Not assoc w R              1
Name: confidence, dtype: int64

In [247]:
L2_2_1.query("mutation=='rpoC_p.Glu1092Asp'")

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Odds_Ratio,OR_LB,OR_UB,confidence
12,rpoC_p.Glu1092Asp,0.349582,0.228614,0.482548,0.0,0.0,0.0,1.418475,1.256856,1.620198,5) Not assoc w R


In [248]:
L2.query("mutation=='rpoC_p.Glu1092Asp'")

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Odds_Ratio,OR_LB,OR_UB,confidence
12,rpoC_p.Glu1092Asp,0.333053,0.227734,0.453529,0.0,0.0,0.0,1.395222,1.255752,1.573856,5) Not assoc w R


In [245]:
L2_2_1.to_csv("/home/sak0914/L2.2.1_regression.csv", index=False)

In [246]:
L2.to_csv("/home/sak0914/L2_regression.csv", index=False)

In [160]:
L2.query("mutation=='rpoC_p.Glu1092Asp'")

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Odds_Ratio,OR_LB,OR_UB,confidence
12,rpoC_p.Glu1092Asp,0.333053,0.227734,0.453529,0.0,0.0,0.0,1.395222,1.255752,1.573856,5) Not assoc w R


In [242]:
L4.query("mutation=='rpoC_p.Glu1092Asp'")

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Odds_Ratio,OR_LB,OR_UB,confidence
572,rpoC_p.Glu1092Asp,0.014565,-1.013331e-16,0.066168,0.238,0.592357,1.0,1.014672,1.0,1.068406,5) Not assoc w R


# 4. Model with Interaction Terms

Create Tier 1 + Tier 2 interaction terms for mutations that are significant in both the Ridge regression and the LRT

These 2 variants were chosen because they were the only Tier 2 variants that were significantly associated in the Ridge regression and were also significant in the LRT.

In [17]:
thresh = 0.01
tier2_model.query('BH_pval < @thresh & LRT_BH_pval < @thresh')

Unnamed: 0,mutation,coef,coef_LB,coef_UB,pval,BH_pval,Bonferroni_pval,Odds_Ratio,OR_LB,OR_UB,confidence,LRT_pval,LRT_BH_pval,LRT_Bonferroni_pval
1,rpoC_p.Glu1092Asp,0.212831,0.147201,0.282702,0.0,0.0,0.0,1.237176,1.158587,1.32671,5) Not assoc w R,5.033098e-07,0.000716,0.001432
20,Rv2752c_p.Asn30Ser,0.099783,0.046205,0.149246,0.0,0.0,0.0,1.104931,1.047289,1.160958,3) Uncertain significance,1.480226e-07,0.000421,0.000421


In [18]:
# Select the Tier 2 mutations to interact (significant in both the ridge regression and the LRT),
# then load the previously saved interaction model matrix and attach the phenotypes to it.
# NOTE(review): tier2_model and drug are not assigned at the top level of the visible notebook,
# so this cell presumably relies on variables left in the kernel — confirm.
thresh = 0.01
interact_mut = tier2_model.query('BH_pval < @thresh & LRT_BH_pval < @thresh')["mutation"].values
df_phenos = pd.read_csv(os.path.join(analysis_dir, drug, "phenos_binary.csv")).set_index("sample_id")
#interact_mut = ['rpoC_p.Glu1092Asp', 'Rv2752c_p.Asn30Ser']

# The commented-out block below is the code that created the pickled interaction matrix read further down.
# It is kept for provenance: the matrix is the Tier 1+2 model matrix plus one product column
# ("mut1+mut2") for every pair of (selected Tier 2 mutation, rpoB column).
# model_matrix = pd.read_pickle(os.path.join(analysis_dir, drug, "BINARY/tiers=1+2/phenos=WHO/dropAF_noSyn_unpooled/model_matrix.pkl"))
# interact_matrix = model_matrix.copy()

# # get a list of all pairs of mutations to interact
# interactions = list(itertools.product(*[interact_mut, interact_matrix.columns[interact_matrix.columns.str.contains("rpoB")]]))

# # add interactions
# for (mut1, mut2) in interactions:
#     interact_matrix[f"{mut1}+{mut2}"] = interact_matrix[mut1] * interact_matrix[mut2]
    
# # drop any columns with no signal
# interact_matrix = interact_matrix[interact_matrix.columns[~((interact_matrix == 0).all())]]

# assert len(set(model_matrix.columns) - set(interact_matrix.columns)) == 0
# print(len(interact_matrix), len(model_matrix))

# interact_matrix.to_pickle(os.path.join(analysis_dir, drug, f"BINARY/interaction/model_matrix_{phenos_name}.pkl"))

# load the saved interaction matrix instead of rebuilding it
interact_matrix = pd.read_pickle(os.path.join(analysis_dir, drug, f"BINARY/interaction/model_matrix_{phenos_name}.pkl"))

# add the phenotype columns by sample index; only the phenotypic_category column is removed again
interact_matrix = interact_matrix.merge(df_phenos, left_index=True, right_index=True, how="left")
del interact_matrix["phenotypic_category"]

interact_matrix.shape

(30984, 3737)

In [19]:
# so much co-occurrence of rpoC_p.Glu1092Asp with rpoB mutations. Can also check proportion that are high confidence rpoB mutations
for mut in interact_mut:
    # match the mutation name literally: as a regex, the "." in the name would match any character
    is_mut_col = interact_matrix.columns.str.contains(mut, regex=False)

    # number of columns that involve this mutation (the mutation itself plus its interaction terms)
    print(mut, int(is_mut_col.sum()))

rpoC_p.Glu1092Asp 125
Rv2752c_p.Asn30Ser 4


In [63]:
phenos_name = "WHO"
interaction_dir = os.path.join(analysis_dir, drug, "BINARY/interaction")
interact_res = pd.read_csv(os.path.join(interaction_dir, f"{phenos_name}_coef.csv"))
interact_permute = pd.read_csv(os.path.join(interaction_dir, f"{phenos_name}_coef_permutation.csv"))

# assess significance using the results of the permutation test.
# The p-value is the proportion of permutation coefficients that are AT LEAST AS EXTREME as the test statistic,
# in the direction of the coefficient: the upper tail for positive coefficients, the lower tail otherwise
interact_res["pval"] = [np.mean(interact_permute[mutation] >= coef) if coef > 0 else np.mean(interact_permute[mutation] <= coef)
                        for mutation, coef in zip(interact_res["mutation"], interact_res["coef"])
                       ]

interact_res = add_pval_corrections(interact_res)
interact_res["Odds_Ratio"] = np.exp(interact_res["coef"])

# split "Mut1+Mut2" interaction names into their two components
interact_res[["Mut1", "Mut2"]] = interact_res["mutation"].str.split("+", expand=True)

# annotate every row with the RIF entry of the WHO catalogue that matches Mut2,
# then remove the merge helper columns and restore the original mutation column name
who_rif_variants = who_variants_combined.query("drug=='RIF'")
interact_res = (interact_res
                .merge(who_rif_variants, left_on="Mut2", right_on="mutation", how="left")
                .drop(columns=["mutation_y", "drug"])
                .rename(columns={"mutation_x": "mutation"})
               )

In [68]:
interact_res.query("mutation.str.contains('rpoC_p.Glu1092Asp') & BH_pval < 0.05").sort_values(["BH_pval", "Odds_Ratio"], 
                                                                                           ascending=[True, False]
                                                                                          )

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,Mut1,Mut2,confidence
3206,rpoC_p.Glu1092Asp,0.219309,0.0,0.0,0.0,1.245216,rpoC_p.Glu1092Asp,,
3676,rpoC_p.Glu1092Asp+rpoB_p.Ile491Tyr,0.026609,0.0,0.0,0.0,1.026966,rpoC_p.Glu1092Asp,rpoB_p.Ile491Tyr,3) Uncertain significance
3693,rpoC_p.Glu1092Asp+rpoB_p.Met434_Asn437delinsIle,0.018817,0.0,0.0,0.0,1.018995,rpoC_p.Glu1092Asp,rpoB_p.Met434_Asn437delinsIle,2) Assoc w R - Interim
3623,rpoC_p.Glu1092Asp+rpoB_p.Asn260Asp,-0.027974,0.0,0.0,0.0,0.972414,rpoC_p.Glu1092Asp,rpoB_p.Asn260Asp,
3725,rpoC_p.Glu1092Asp+rpoB_p.Val170Phe,-0.057892,0.003,0.0387,1.0,0.943752,rpoC_p.Glu1092Asp,rpoB_p.Val170Phe,1) Assoc w R
3708,rpoC_p.Glu1092Asp+rpoB_p.Ser450Leu,0.05754,0.004,0.048584,1.0,1.059228,rpoC_p.Glu1092Asp,rpoB_p.Ser450Leu,1) Assoc w R


In [70]:
interact_res.query("mutation.str.contains('rpoB_p.Val170Phe')")

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,Mut1,Mut2,confidence
2936,rpoB_p.Val170Phe,0.191617,0.0,0.0,0.0,1.211206,rpoB_p.Val170Phe,,
3725,rpoC_p.Glu1092Asp+rpoB_p.Val170Phe,-0.057892,0.003,0.0387,1.0,0.943752,rpoC_p.Glu1092Asp,rpoB_p.Val170Phe,1) Assoc w R


In [67]:
interact_res.query("mutation.str.contains('rpoB_p.Asn260Asp')")

Unnamed: 0,mutation,coef,pval,BH_pval,Bonferroni_pval,Odds_Ratio,Mut1,Mut2,confidence
2395,rpoB_p.Asn260Asp,-0.027974,0.0,0.0,0.0,0.972414,rpoB_p.Asn260Asp,,
3623,rpoC_p.Glu1092Asp+rpoB_p.Asn260Asp,-0.027974,0.0,0.0,0.0,0.972414,rpoC_p.Glu1092Asp,rpoB_p.Asn260Asp,


In [217]:
interact_matrix.groupby("phenotype")['rpoC_p.Glu1092Asp+rpoB_p.Ser450Leu'].value_counts()

phenotype  rpoC_p.Glu1092Asp+rpoB_p.Ser450Leu
0          0.0                                   20907
           1.0                                      14
1          0.0                                    8682
           1.0                                    1381
Name: rpoC_p.Glu1092Asp+rpoB_p.Ser450Leu, dtype: int64

In [224]:
interact_matrix['rpoC_p.Glu1092Asp+rpoB_p.Ser450Leu'].mean()

0.045023237800154915

In [218]:
1381 / 1385

0.9971119133574007

In [223]:
interact_matrix['rpoC_p.Glu1092Asp'].mean()

0.08704492641363284

In [222]:
interact_matrix['rpoB_p.Ser450Leu'].mean()

0.21449780531887425

In [34]:
# directory of the Rifampicin Tier 1 WHO unpooled model. os.path.join() without arguments raises a TypeError,
# so the path components are spelled out to reproduce the path shown in the saved output
os.path.join(analysis_dir, "Rifampicin", "BINARY", "tiers=1", "phenos=WHO", "dropAF_noSyn_unpooled")

'/n/data1/hms/dbmi/farhat/Sanjana/who-mutation-catalogue/Rifampicin/BINARY/tiers=1/phenos=WHO/dropAF_noSyn_unpooled'