In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from copy import deepcopy
import time
import random
from scipy import stats
from scipy.stats import mode
import seaborn as sns
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")

## Set B indicates SIMULTANEOUS MUTATIONS ONLY

### Functions

In [3]:
def beta_p(a_mutated_branches, b_mutated_branches, 
           prior_mean, PSEUDOCOUNT_VALUE = 1, verbose=False):
    
    """
        Computes p-value of the inferred mu from the non-mutated branches versus the beta
    distribution inferred from the mutated branches
    
    Parameters
    ----------
    a_mutated_branches: int
        number of background mutated branches with a mutation event
    b_mutated_branches: int
        number of background mutated branches WITHOUT a mutation event
    prior_mean: float
        expected number of same branch mutations if mutation i and j are independent
    PSEUDOCOUNT_VALUE: int, optional (default: 1)
        because beta distribution parameters must be greater than 0, pseudocount
        value to introduce
    verbose: bool, optional (defualt: False)
        if true, print debugging statements
        
    Note that "a" and "b" here refer to the input parameters of the beta distribution, alpha and beta 

    
    """
    #pseudocounting
    
    if a_mutated_branches ==0:
        return np.nan
    if b_mutated_branches ==0:
        b_mutated_branches += PSEUDOCOUNT_VALUE

    beta_posterior = stats.beta(a=a_mutated_branches, b=b_mutated_branches)
    
    pval = beta_posterior.cdf(prior_mean)
    if verbose:
        print(f"a_mutated_branches: {a_mutated_branches}")
        print(f"b_mutated_branches: {b_mutated_branches}")
        print(f"expected ps: {prior_mean}")
        print(f"pvalue:{pval}")
        
    return pval

def apply_testing(df, total_branches=None, verbose=False, run_hypergeometric=False, run_betabin=False):
    """
    Function to apply the statistical tests to the dataframe of co-ocurring mutations
    
    Applies the test in both directions, ie tests mutation A -> mutation B and mutation B -> mutation A
    """
    if total_branches is None:
        total_branches = len(np.load("output/00.mutation_data_by_lineage/00.combined_data/all_branches.npy"))
        print(total_branches)
    
    df["total_branches_applicable"] = total_branches - df.n_branches_with_i - df.n_branches_with_j + df.n_branch_i_and_j -\
                                    df.n_mutations_i - df.n_mutations_j + df.n_muts_same_branch
    
    print('running beta i on j')
    df['pval_beta_i_on_j'] = df.apply(lambda row: beta_p (
        row['n_muts_same_branch'], 
        row["total_branches_applicable"], 
        row.n_mutations_i * row.n_mutations_j / (row.total_branches_applicable * row.total_branches_applicable),
        verbose=verbose
    ), axis = 1) 
    df["pval_beta_j_on_i"] = df["pval_beta_i_on_j"]
    
    return df

### Compute FDR control using Benjamini-Hochberg

def benjamini_hochberg(df, pval_column, threshold = 0.001):
    df = df.sort_values(pval_column)
    M = len(df)
    df["_rank"] = [x for x in range(len(df))]
    df["BH_critical_value"] = [threshold * x/M for x in df._rank]
    df[pval_column+"_BH_sig"] = [p<=x for p,x in zip(df[pval_column], df.BH_critical_value)]
    df = df.drop(["_rank"], axis=1)
    return df
def holms(df, pval_column, threshold = 0.001):
    df = df.sort_values(pval_column)
    output = multipletests(df[pval_column], threshold, method="hs")
    df[pval_column+"_BH_sig"] = output[0]
    return df


def unravel_ij(df):
    """
    Takes the dataframe of mutation A, mutation B pairs, and doubles the size, so that
    mutation A -> mutation B and mutation B -> mutation A each get their own rows. 
    IE, now directionality matters
    """
    
    reversal_df = deepcopy(df)
    reversal_df = reversal_df.rename({
        "position_i":"position_j",
        "position_j":"position_i",
        "n_mutations_i":"n_mutations_j",
        "n_mutations_j":"n_mutations_i",
        "n_branches_with_i":"n_branches_with_j",
        "n_branches_with_j":"n_branches_with_i",
        "n_muts_i_on_branch_j":"n_muts_j_on_branch_i",
        "n_muts_j_on_branch_i":"n_muts_i_on_branch_j",
        "n_branch_with_i_no_j":"n_branch_with_j_no_i",
        "n_branch_with_j_no_i":"n_branch_with_i_no_j",
        "n_unmutated_branches_i":"n_unmutated_branches_j",
        "n_unmutated_branches_j":"n_unmutated_branches_i",
        "n_mutations_i_on_unmutated":"n_mutations_j_on_unmutated",
        "n_mutations_j_on_unmutated":"n_mutations_i_on_unmutated",
        "pval_betabin_j_on_i":"pval_betabin_i_on_j",
        "pval_betabin_i_on_j": "pval_betabin_j_on_i",
        "pval_beta_j_on_i":"pval_beta_i_on_j",
        "pval_beta_i_on_j":"pval_beta_j_on_i",
    },axis=1)

    df = pd.concat([df,reversal_df])
    
    df = df[['position_i', 'position_j', 
             'n_mutations_i', 'n_mutations_j', 
             'n_branches_with_i', 'n_branches_with_j', 
             'n_muts_i_on_branch_j', 'n_muts_j_on_branch_i',
             "n_muts_same_branch",
             'n_branch_with_i_no_j', 'n_branch_with_j_no_i', 'n_branch_i_and_j',
             'pval_beta_i_on_j'
            ]]
    return df

## Apply Testing

In [4]:
!mkdir output/02B.comutation_pvalues

mkdir: cannot create directory ‘output/02B.comutation_pvalues’: File exists


In [5]:
# Run the calculation for all the points in the dataset
df = pd.read_csv("output/01.mutation_and_comutation_data/combined_position.csv", index_col=0)
print(len(df)*2)
df = df.query("n_muts_same_branch > 0")
print(len(df))
#df = df.query("position_j==2030521 and position_i==2030472")
#print(df)

subset=apply_testing(df, verbose=False)

results = subset
print(len(results), len(subset))
results.to_csv("output/02B.comutation_pvalues/intermediate_results_unfiltered.csv")


1959198
152224
62846
running beta i on j
152224 152224


In [6]:
results.sort_values("pval_beta_i_on_j")

Unnamed: 0,position_i,mat_index_i,mat_index_i.1,position_j,mat_index_j,mat_index_j.1,n_mutations_i,n_mutations_j,n_reversions_i,n_reversions_j,...,n_branches_with_j,n_muts_i_on_branch_j,n_muts_j_on_branch_i,n_muts_same_branch,n_branch_with_i_no_j,n_branch_with_j_no_i,n_branch_i_and_j,total_branches_applicable,pval_beta_i_on_j,pval_beta_j_on_i
84198,105060,231,231,105063,232,232,345,333,1,1,...,4418,0,0,324,82,8,4410,57992,0.000000e+00,0.000000e+00
293110,761155,849,849,2155168,2388,2388,1487,1811,140,51,...,12564,246,64,569,1758,5157,7407,45795,0.000000e+00,0.000000e+00
565936,1341114,1609,1609,1341120,1610,1610,181,218,4,5,...,3564,0,0,164,82,442,3122,58965,4.940656e-323,4.940656e-323
777947,2197331,2519,2519,2197332,2520,2520,123,125,0,0,...,630,1,0,121,0,2,628,62089,4.850264e-275,4.850264e-275
946785,3847215,3889,3889,3847221,3890,3890,149,134,0,0,...,3490,0,3,125,354,14,3476,58844,9.116068e-269,9.116068e-269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419228,1094538,1288,1288,1673425,1953,1953,254,752,0,9,...,3817,8,38,1,2188,3747,70,55836,9.673196e-01,9.673196e-01
741027,2155168,2388,2388,2626108,2989,2989,1811,39,51,9,...,45234,1263,2,2,2491,35161,10073,13273,9.691434e-01,9.691434e-01
293658,761155,849,849,2626108,2989,2989,1487,39,140,9,...,45234,1155,0,1,1727,37796,7438,14360,9.823863e-01,9.823863e-01
291769,761155,849,849,761161,851,851,1487,153,140,1,...,304,5,26,1,9165,304,0,51738,9.876926e-01,9.876926e-01


In [10]:
results = pd.read_csv("output/02B.comutation_pvalues/intermediate_results_unfiltered.csv", index_col=0)
results = results[[
             'position_i', 'position_j', 
             'n_mutations_i', 'n_mutations_j', 
             'n_branches_with_i', 'n_branches_with_j', 
             'n_muts_i_on_branch_j', 'n_muts_j_on_branch_i',
             "n_muts_same_branch",
             'n_branch_with_i_no_j', 'n_branch_with_j_no_i', 'n_branch_i_and_j',
             'pval_beta_i_on_j'
            ]]


results = benjamini_hochberg(results, "pval_beta_i_on_j", threshold=0.01)

results.to_csv("output/02B.comutation_pvalues/results_file_allpairs.csv")

In [11]:
results.sort_values("pval_beta_i_on_j")

Unnamed: 0,position_i,position_j,n_mutations_i,n_mutations_j,n_branches_with_i,n_branches_with_j,n_muts_i_on_branch_j,n_muts_j_on_branch_i,n_muts_same_branch,n_branch_with_i_no_j,n_branch_with_j_no_i,n_branch_i_and_j,pval_beta_i_on_j,BH_critical_value,pval_beta_i_on_j_BH_sig
84198,105060,105063,345,333,4492,4418,0,0,324,82,8,4410,0.000000e+00,0.000000e+00,True
293110,761155,2155168,1487,1811,9165,12564,246,64,569,1758,5157,7407,0.000000e+00,6.569266e-08,True
565936,1341114,1341120,181,218,3204,3564,0,0,164,82,442,3122,4.940656e-323,1.313853e-07,True
777947,2197331,2197332,123,125,628,630,1,0,121,0,2,628,4.850264e-275,1.970780e-07,True
946785,3847215,3847221,149,134,3830,3490,0,3,125,354,14,3476,9.116068e-269,2.627707e-07,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419228,1094538,1673425,254,752,2258,3817,8,38,1,2188,3747,70,9.673196e-01,9.999672e-03,False
741027,2155168,2626108,1811,39,12564,45234,1263,2,2,2491,35161,10073,9.691434e-01,9.999737e-03,False
293658,761155,2626108,1487,39,9165,45234,1155,0,1,1727,37796,7438,9.823863e-01,9.999803e-03,False
291769,761155,761161,1487,153,9165,304,5,26,1,9165,304,0,9.876926e-01,9.999869e-03,False


In [12]:
results.query("pval_beta_i_on_j_BH_sig")

Unnamed: 0,position_i,position_j,n_mutations_i,n_mutations_j,n_branches_with_i,n_branches_with_j,n_muts_i_on_branch_j,n_muts_j_on_branch_i,n_muts_same_branch,n_branch_with_i_no_j,n_branch_with_j_no_i,n_branch_i_and_j,pval_beta_i_on_j,BH_critical_value,pval_beta_i_on_j_BH_sig
84198,105060,105063,345,333,4492,4418,0,0,324,82,8,4410,0.000000e+00,0.000000e+00,True
293110,761155,2155168,1487,1811,9165,12564,246,64,569,1758,5157,7407,0.000000e+00,6.569266e-08,True
565936,1341114,1341120,181,218,3204,3564,0,0,164,82,442,3122,4.940656e-323,1.313853e-07,True
777947,2197331,2197332,123,125,628,630,1,0,121,0,2,628,4.850264e-275,1.970780e-07,True
946785,3847215,3847221,149,134,3830,3490,0,3,125,354,14,3476,9.116068e-269,2.627707e-07,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530297,1340105,1340389,24,12,666,16,0,1,1,664,14,2,4.624641e-03,4.625815e-03,True
888402,2952401,3894818,5,58,40,185,0,0,1,40,185,0,4.624896e-03,4.625880e-03,True
867216,2719057,3894818,5,58,44,185,0,0,1,44,185,0,4.625191e-03,4.625946e-03,True
161711,332987,333406,48,6,108,568,0,0,1,108,568,0,4.625681e-03,4.626012e-03,True
