In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from copy import deepcopy
import time
import random
from scipy import stats
from scipy.stats import mode
import seaborn as sns
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")

## Set B indicates SIMULTANEOUS MUTATIONS ONLY

### Functions

In [2]:
def beta_p(a_mutated_branches, b_mutated_branches, 
           prior_mean, PSEUDOCOUNT_VALUE = 1, verbose=False):
    
    """
        Computes p-value of the inferred mu from the non-mutated branches versus the beta
    distribution inferred from the mutated branches
    
    Parameters
    ----------
    a_mutated_branches: int
        number of background mutated branches with a mutation event
    b_mutated_branches: int
        number of background mutated branches WITHOUT a mutation event
    prior_mean: float
        expected number of same branch mutations if mutation i and j are independent
    PSEUDOCOUNT_VALUE: int, optional (default: 1)
        because beta distribution parameters must be greater than 0, pseudocount
        value to introduce
    verbose: bool, optional (defualt: False)
        if true, print debugging statements
        
    Note that "a" and "b" here refer to the input parameters of the beta distribution, alpha and beta 

    
    """
    #pseudocounting
    
    if a_mutated_branches ==0:
        return np.nan
    if b_mutated_branches ==0:
        b_mutated_branches += PSEUDOCOUNT_VALUE

    beta_posterior = stats.beta(a=a_mutated_branches, b=b_mutated_branches)
    
    pval = beta_posterior.cdf(prior_mean)
    if verbose:
        print(f"a_mutated_branches: {a_mutated_branches}")
        print(f"b_mutated_branches: {b_mutated_branches}")
        print(f"expected ps: {prior_mean}")
        print(f"pvalue:{pval}")
        
    return pval

def apply_testing(df, total_branches=None, verbose=False, run_hypergeometric=False, run_betabin=False):
    """
    Function to apply the statistical tests to the dataframe of co-ocurring mutations
    
    Applies the test in both directions, ie tests mutation A -> mutation B and mutation B -> mutation A
    """
    if total_branches is None:
        total_branches = len(np.load("output/00.mutation_data_by_lineage/00.combined_data/all_branches.npy"))
        print(total_branches)
    
    df["total_branches_applicable"] = total_branches - df.n_branches_with_i - df.n_branches_with_j + df.n_branch_i_and_j -\
                                    df.n_mutations_i - df.n_mutations_j + df.n_muts_same_branch
    
    print('running beta i on j')
    df['pval_beta_i_on_j'] = df.apply(lambda row: beta_p (
        row['n_muts_same_branch'], 
        row["total_branches_applicable"], 
        row.n_mutations_i * row.n_mutations_j / (row.total_branches_applicable * row.total_branches_applicable),
        verbose=verbose
    ), axis = 1) 
    df["pval_beta_j_on_i"] = df["pval_beta_i_on_j"]
    
    return df

### Compute FDR control using Benjamini-Hochberg

def benjamini_hochberg(df, pval_column, threshold = 0.001):
    df = df.sort_values(pval_column)
    M = len(df)
    df["_rank"] = [x for x in range(len(df))]
    df["BH_critical_value"] = [threshold * x/M for x in df._rank]
    df[pval_column+"_BH_sig"] = [p<=x for p,x in zip(df[pval_column], df.BH_critical_value)]
    df = df.drop(["_rank"], axis=1)
    return df
def holms(df, pval_column, threshold = 0.001):
    df = df.sort_values(pval_column)
    output = multipletests(df[pval_column], threshold, method="hs")
    df[pval_column+"_BH_sig"] = output[0]
    return df


def unravel_ij(df):
    """
    Takes the dataframe of mutation A, mutation B pairs, and doubles the size, so that
    mutation A -> mutation B and mutation B -> mutation A each get their own rows. 
    IE, now directionality matters
    """
    
    reversal_df = deepcopy(df)
    reversal_df = reversal_df.rename({
        "position_i":"position_j",
        "position_j":"position_i",
        "n_mutations_i":"n_mutations_j",
        "n_mutations_j":"n_mutations_i",
        "n_branches_with_i":"n_branches_with_j",
        "n_branches_with_j":"n_branches_with_i",
        "n_muts_i_on_branch_j":"n_muts_j_on_branch_i",
        "n_muts_j_on_branch_i":"n_muts_i_on_branch_j",
        "n_branch_with_i_no_j":"n_branch_with_j_no_i",
        "n_branch_with_j_no_i":"n_branch_with_i_no_j",
        "n_unmutated_branches_i":"n_unmutated_branches_j",
        "n_unmutated_branches_j":"n_unmutated_branches_i",
        "n_mutations_i_on_unmutated":"n_mutations_j_on_unmutated",
        "n_mutations_j_on_unmutated":"n_mutations_i_on_unmutated",
        "pval_betabin_j_on_i":"pval_betabin_i_on_j",
        "pval_betabin_i_on_j": "pval_betabin_j_on_i",
        "pval_beta_j_on_i":"pval_beta_i_on_j",
        "pval_beta_i_on_j":"pval_beta_j_on_i",
    },axis=1)

    df = pd.concat([df,reversal_df])
    
    df = df[['position_i', 'position_j', 
             'n_mutations_i', 'n_mutations_j', 
             'n_branches_with_i', 'n_branches_with_j', 
             'n_muts_i_on_branch_j', 'n_muts_j_on_branch_i',
             "n_muts_same_branch",
             'n_branch_with_i_no_j', 'n_branch_with_j_no_i', 'n_branch_i_and_j',
             'pval_beta_i_on_j'
            ]]
    return df

## Apply Testing

In [3]:
!mkdir output/02B.comutation_pvalues

mkdir: cannot create directory ‘output/02B.comutation_pvalues’: File exists


In [4]:
# Run the calculation for all the points in the dataset
df = pd.read_csv("output/01.mutation_and_comutation_data/combined_position.csv", index_col=0)
print(len(df)*2)
df = df.query("n_muts_same_branch > 0")
print(len(df))
#df = df.query("position_j==2030521 and position_i==2030472")
#print(df)

subset=apply_testing(df, verbose=False)

results = subset
print(len(results), len(subset))
results.to_csv("output/02B.comutation_pvalues/intermediate_results_unfiltered.csv")


1816118
150518
62846
running beta i on j
150518 150518


In [5]:
results.sort_values("pval_beta_i_on_j")

Unnamed: 0,position_i,mat_index_i,mat_index_i.1,position_j,mat_index_j,mat_index_j.1,n_mutations_i,n_mutations_j,n_reversions_i,n_reversions_j,...,n_branches_with_j,n_muts_i_on_branch_j,n_muts_j_on_branch_i,n_muts_same_branch,n_branch_with_i_no_j,n_branch_with_j_no_i,n_branch_i_and_j,total_branches_applicable,pval_beta_i_on_j,pval_beta_j_on_i
84085,105060,233,232,105063,234,233,376,361,7,7,...,4418,2,0,352,82,8,4410,57961,0.000000e+00,0.000000e+00
526463,1341114,1627,1608,1341120,1628,1609,184,222,6,7,...,3564,0,0,167,82,442,3122,58961,0.000000e+00,0.000000e+00
285534,761155,858,857,2155168,2418,2392,1687,2098,325,99,...,12564,419,88,687,1758,5157,7407,45426,0.000000e+00,0.000000e+00
876441,3847215,3937,3890,3847221,3938,3891,161,145,1,1,...,3490,0,2,136,354,14,3476,58832,5.497857e-288,5.497857e-288
713337,2197331,2549,2510,2197332,2550,2511,124,126,0,0,...,630,1,0,122,0,2,628,62088,6.988375e-277,6.988375e-277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313773,781687,972,971,1472359,1741,1721,918,234,41,7,...,870,2,6,1,5301,868,2,55524,9.791197e-01,9.791197e-01
688210,2155168,2418,2392,2626108,3024,2983,2098,40,99,9,...,45234,1505,1,2,2491,35161,10073,12985,9.883740e-01,9.883740e-01
286043,761155,858,857,2626108,3024,2983,1687,40,325,9,...,45234,1325,1,1,1727,37796,7438,14159,9.914914e-01,9.914914e-01
284312,761155,858,857,761161,860,859,1687,153,325,1,...,304,0,21,1,9165,304,0,51538,9.933185e-01,9.933185e-01


In [6]:
results = pd.read_csv("output/02B.comutation_pvalues/intermediate_results_unfiltered.csv", index_col=0)
results = results[[
             'position_i', 'position_j', 
             'n_mutations_i', 'n_mutations_j', 
             'n_branches_with_i', 'n_branches_with_j', 
             'n_muts_i_on_branch_j', 'n_muts_j_on_branch_i',
             "n_muts_same_branch",
             'n_branch_with_i_no_j', 'n_branch_with_j_no_i', 'n_branch_i_and_j',
             'pval_beta_i_on_j'
            ]]


results = benjamini_hochberg(results, "pval_beta_i_on_j", threshold=0.01)

results.to_csv("output/02B.comutation_pvalues/results_file_allpairs.csv")

In [7]:
results.sort_values("pval_beta_i_on_j")

Unnamed: 0,position_i,position_j,n_mutations_i,n_mutations_j,n_branches_with_i,n_branches_with_j,n_muts_i_on_branch_j,n_muts_j_on_branch_i,n_muts_same_branch,n_branch_with_i_no_j,n_branch_with_j_no_i,n_branch_i_and_j,pval_beta_i_on_j,BH_critical_value,pval_beta_i_on_j_BH_sig
84085,105060,105063,376,361,4492,4418,2,0,352,82,8,4410,0.000000e+00,0.000000e+00,True
526463,1341114,1341120,184,222,3204,3564,0,0,167,82,442,3122,0.000000e+00,6.643724e-08,True
285534,761155,2155168,1687,2098,9165,12564,419,88,687,1758,5157,7407,0.000000e+00,1.328745e-07,True
876441,3847215,3847221,161,145,3830,3490,0,2,136,354,14,3476,5.497857e-288,1.993117e-07,True
713337,2197331,2197332,124,126,628,630,1,0,122,0,2,628,6.988375e-277,2.657489e-07,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313773,781687,1472359,918,234,5303,870,2,6,1,5301,868,2,9.791197e-01,9.999668e-03,False
688210,2155168,2626108,2098,40,12564,45234,1505,1,2,2491,35161,10073,9.883740e-01,9.999734e-03,False
286043,761155,2626108,1687,40,9165,45234,1325,1,1,1727,37796,7438,9.914914e-01,9.999801e-03,False
284312,761155,761161,1687,153,9165,304,0,21,1,9165,304,0,9.933185e-01,9.999867e-03,False


In [8]:
results.query("pval_beta_i_on_j_BH_sig")

Unnamed: 0,position_i,position_j,n_mutations_i,n_mutations_j,n_branches_with_i,n_branches_with_j,n_muts_i_on_branch_j,n_muts_j_on_branch_i,n_muts_same_branch,n_branch_with_i_no_j,n_branch_with_j_no_i,n_branch_i_and_j,pval_beta_i_on_j,BH_critical_value,pval_beta_i_on_j_BH_sig
84085,105060,105063,376,361,4492,4418,2,0,352,82,8,4410,0.000000e+00,0.000000e+00,True
526463,1341114,1341120,184,222,3204,3564,0,0,167,82,442,3122,0.000000e+00,6.643724e-08,True
285534,761155,2155168,1687,2098,9165,12564,419,88,687,1758,5157,7407,0.000000e+00,1.328745e-07,True
876441,3847215,3847221,161,145,3830,3490,0,2,136,354,14,3476,5.497857e-288,1.993117e-07,True
713337,2197331,2197332,124,126,628,630,1,0,122,0,2,628,6.988375e-277,2.657489e-07,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280474,761128,2197313,43,146,24,738,0,0,2,24,738,0,4.808831e-03,4.810986e-03,True
558623,1472742,3895159,30,10,44,537,1,0,1,40,533,4,4.809225e-03,4.811052e-03,True
819173,2953581,3895157,15,20,46,541,0,0,1,46,541,0,4.809610e-03,4.811119e-03,True
432456,1164571,2122395,387,277,14710,19900,87,50,10,13279,18469,1431,4.810110e-03,4.811185e-03,True
