In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from copy import deepcopy
import time
import random
from scipy import stats
from scipy.stats import mode
import seaborn as sns
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")

### Functions

In [5]:
def beta_binomial_p(a_mutated_branches, b_mutated_branches, 
           a_unmutated_branches, b_unmutated_branches, PSEUDOCOUNT_VALUE = 1, verbose=False):
    """
    Note: this is not the main p-value function and was only tested during initial project stages

    Computes p-value of the inferred mu from the non-mutated branches versus the posterior
    distribution of mutated + unmutated branches

    Parameters
    ----------
    a_mutated_branches, b_mutated_branches: int
        alpha / beta counts taken from the mutated branches
    a_unmutated_branches, b_unmutated_branches: int
        alpha / beta counts taken from the unmutated branches
    PSEUDOCOUNT_VALUE: int, optional (default: 1)
        value added to any of the remaining counts that is exactly 0
    verbose: bool, optional (default: False)
        if true, print the (pseudocounted) inputs and the p-value

    Returns
    -------
    float
        CDF of Beta(a_unmutated + a_mutated, b_unmutated + b_mutated) evaluated at
        a_unmutated / (a_unmutated + b_unmutated), or np.nan when
        a_mutated_branches is 0.
    """
    # No events at all on the mutated branches: the comparison is not evaluated.
    if a_mutated_branches == 0:
        return np.nan

    # pseudocounting
    # (this guard used to re-test a_mutated_branches, which can never be 0 at
    #  this point, so the b_mutated pseudocount was silently never applied)
    if b_mutated_branches == 0:
        b_mutated_branches += PSEUDOCOUNT_VALUE
    if a_unmutated_branches == 0:
        a_unmutated_branches += PSEUDOCOUNT_VALUE
    if b_unmutated_branches == 0:
        b_unmutated_branches += PSEUDOCOUNT_VALUE

    beta_posterior = stats.beta(a=a_unmutated_branches + a_mutated_branches, b=b_unmutated_branches + b_mutated_branches)

    # Point estimate of the rate from the unmutated branches only.
    prior_mean = a_unmutated_branches/(a_unmutated_branches+b_unmutated_branches)

    pval = beta_posterior.cdf(prior_mean)
    if verbose:
        print(f"a_unmutated_branches: {a_unmutated_branches}")
        print(f"b_unmutated_branches: {b_unmutated_branches}")
        print(f"a_mutated_branches: {a_mutated_branches}")
        print(f"b_mutated_branches: {b_mutated_branches}")
        print(f"pvalue:{pval}")

    return pval

def hypergeometric_p(M, N, n, k, verbose=False):
    """
    Note: this is not the main p-value function and was only tested during initial project stages
    Computes the p-value of drawing k mutations on mutated branches out of n total mutations

    M: total population
    N: number of successes in population (branches with mutation j)
    n: number of draws
    k: number of successful draws

    Returns the upper-tail probability P(X >= k) for integer k.

    Naming note: scipy's ``hypergeom(M, n, N)`` calls the number of successes
    ``n`` and the number of draws ``N``, i.e. the opposite of this function's
    parameter names. The hypergeometric distribution is symmetric under
    exchanging those two quantities, so the result is the same either way.
    """
    distr = stats.hypergeom(M, n, N)

    # P(X >= k) == P(X > k - 1). The small offset keeps an integer-valued k
    # that arrives as a float (e.g. 3.0000000001) from skipping its own mass,
    # and flooring explicitly hands scipy an integer quantile.
    # sf() is used instead of 1 - cdf() because the subtraction cancels to 0
    # once the tail probability drops below floating-point resolution.
    pval = distr.sf(np.floor(k - 0.0000001))

    if verbose:
        print(f"total branches:{M}\ntotal successes:{N}\ntotal draws:{n}\nNumber of successful draws:{k}")
        print(f"pvalue:{pval}")

    return pval


def beta_p(a_mutated_branches, b_mutated_branches, 
           a_unmutated_branches, b_unmutated_branches, PSEUDOCOUNT_VALUE = 1, verbose=False):
    """
    Computes p-value of the inferred mu from the non-mutated branches versus the beta
    distribution inferred from the mutated branches

    Parameters
    ----------
    a_mutated_branches: int
        number of background mutated branches with a mutation event
    b_mutated_branches: int
        number of background mutated branches WITHOUT a mutation event
    a_unmutated_branches: int
        number of background UNmutated branches with a mutation event
    b_unmutated_branches: int
        number of background UNmutated branches WITHOUT a mutation event
    PSEUDOCOUNT_VALUE: int, optional (default: 1)
        because beta distribution parameters must be greater than 0, pseudocount
        value to introduce
    verbose: bool, optional (default: False)
        if true, print debugging statements

    Note that "a" and "b" here refer to the input parameters of the beta distribution, alpha and beta

    Returns
    -------
    float
        CDF of Beta(a_mutated, b_mutated) evaluated at
        a_unmutated / (a_unmutated + b_unmutated), or np.nan when
        a_mutated_branches is 0.
    """
    # No mutation events on the mutated branches: the test is not evaluated.
    if a_mutated_branches == 0:
        return np.nan

    # pseudocounting
    # (this guard used to re-test a_mutated_branches, which is never 0 here, so
    #  b_mutated_branches == 0 went straight into stats.beta as an invalid
    #  shape parameter)
    if b_mutated_branches == 0:
        b_mutated_branches += PSEUDOCOUNT_VALUE
    if a_unmutated_branches == 0:
        a_unmutated_branches += PSEUDOCOUNT_VALUE
    if b_unmutated_branches == 0:
        b_unmutated_branches += PSEUDOCOUNT_VALUE

    beta_posterior = stats.beta(a=a_mutated_branches, b=b_mutated_branches)

    # Point estimate of the rate from the unmutated branches only.
    prior_mean = a_unmutated_branches/(a_unmutated_branches+b_unmutated_branches)

    pval = beta_posterior.cdf(prior_mean)
    if verbose:
        print(f"a_unmutated_branches: {a_unmutated_branches}")
        print(f"b_unmutated_branches: {b_unmutated_branches}")
        print(f"a_mutated_branches: {a_mutated_branches}")
        print(f"b_mutated_branches: {b_mutated_branches}")
        print(f"pvalue:{pval}")

    return pval

def apply_testing(df, total_branches=None, verbose=False, run_hypergeometric=False, run_betabin=False):
    """
    Run the co-occurrence significance tests on every row of ``df``.

    Every test is evaluated in both directions (mutation i given mutation j,
    and mutation j given mutation i). Derived count columns and the resulting
    ``pval_*`` columns are written onto ``df`` itself, and that same frame is
    returned.

    Parameters
    ----------
    df: pandas.DataFrame
        one row per mutation pair; must carry the count columns read below
        (n_branch_i_and_j, n_branch_with_i_no_j, n_branch_with_j_no_i,
        n_mutations_i, n_mutations_j, n_muts_i_on_branch_j,
        n_muts_j_on_branch_i, n_muts_same_branch)
    total_branches: int, optional
        total number of branches; when None it is the length of the array
        stored in 00.mutation_data_by_lineage/00.combined_data/all_branches.npy
    verbose: bool, optional (default: False)
        forwarded to the individual test functions
    run_hypergeometric: bool, optional (default: False)
        also compute the hypergeometric p-values
    run_betabin: bool, optional (default: False)
        also compute the beta-binomial p-values
    """
    if total_branches is None:
        all_branches = np.load("00.mutation_data_by_lineage/00.combined_data/all_branches.npy")
        total_branches = len(all_branches)

    # Derived counts, stored on the caller's frame so the row functions below can read them.
    df["total_branches_applicable"] = total_branches - df["n_branch_i_and_j"]
    df["n_unmutated_branches_i"] = df["total_branches_applicable"] - df["n_branch_with_i_no_j"]
    df["n_unmutated_branches_j"] = df["total_branches_applicable"] - df["n_branch_with_j_no_i"]
    df["n_mutations_i_on_unmutated"] = df["n_mutations_i"] - df["n_muts_i_on_branch_j"]
    df["n_mutations_j_on_unmutated"] = df["n_mutations_j"] - df["n_muts_j_on_branch_i"]

    # ---- per-row test wrappers (one per test and direction) ----
    def _hypergeo_i_on_j(rec):
        population = rec['total_branches_applicable'] - rec["n_branch_with_i_no_j"]
        hits = rec['n_muts_i_on_branch_j'] + rec['n_muts_same_branch']
        return hypergeometric_p(population, rec['n_branch_with_j_no_i'], rec['n_mutations_i'],
                                hits, verbose=verbose)

    def _hypergeo_j_on_i(rec):
        population = rec['total_branches_applicable'] - rec["n_branch_with_j_no_i"]
        hits = rec['n_muts_j_on_branch_i'] + rec['n_muts_same_branch']
        return hypergeometric_p(population, rec['n_branch_with_i_no_j'], rec['n_mutations_j'],
                                hits, verbose=verbose)

    def _betabin_i_on_j(rec):
        a_mut = rec['n_muts_i_on_branch_j'] + rec['n_muts_same_branch']
        b_mut = rec['n_branch_with_j_no_i'] - rec['n_muts_i_on_branch_j']
        a_unmut = rec['n_mutations_i_on_unmutated']
        b_unmut = rec["n_unmutated_branches_i"] - rec['n_mutations_i_on_unmutated']
        return beta_binomial_p(a_mut, b_mut, a_unmut, b_unmut, verbose=verbose)

    def _betabin_j_on_i(rec):
        a_mut = rec['n_muts_j_on_branch_i'] + rec['n_muts_same_branch']
        b_mut = rec['n_branch_with_i_no_j'] - rec['n_muts_j_on_branch_i']
        a_unmut = rec['n_mutations_j_on_unmutated']
        b_unmut = rec["n_unmutated_branches_j"] - rec['n_mutations_j_on_unmutated']
        return beta_binomial_p(a_mut, b_mut, a_unmut, b_unmut, verbose=verbose)

    def _beta_i_on_j(rec):
        a_mut = rec['n_muts_i_on_branch_j'] + rec['n_muts_same_branch']
        b_mut = rec['n_branch_with_j_no_i'] - rec['n_muts_i_on_branch_j']
        a_unmut = rec['n_mutations_i_on_unmutated']
        # unlike the beta-binomial variant, the j-only branches are removed as well
        b_unmut = (rec["n_unmutated_branches_i"] - rec['n_mutations_i_on_unmutated']
                   - rec['n_branch_with_j_no_i'])
        return beta_p(a_mut, b_mut, a_unmut, b_unmut, verbose=verbose)

    def _beta_j_on_i(rec):
        a_mut = rec['n_muts_j_on_branch_i'] + rec['n_muts_same_branch']
        b_mut = rec['n_branch_with_i_no_j'] - rec['n_muts_j_on_branch_i']
        a_unmut = rec['n_mutations_j_on_unmutated']
        # unlike the beta-binomial variant, the i-only branches are removed as well
        b_unmut = (rec["n_unmutated_branches_j"] - rec['n_mutations_j_on_unmutated']
                   - rec['n_branch_with_i_no_j'])
        return beta_p(a_mut, b_mut, a_unmut, b_unmut, verbose=verbose)

    # ---- optional tests ----
    if run_hypergeometric:
        print('running hypergeometric i on j')
        df['pval_hypergeo_i_on_j'] = df.apply(_hypergeo_i_on_j, axis=1)
        print('running hypergeometric j on i')
        df['pval_hypergeo_j_on_i'] = df.apply(_hypergeo_j_on_i, axis=1)

    if run_betabin:
        print('running beta-binomial i on j')
        df['pval_betabin_i_on_j'] = df.apply(_betabin_i_on_j, axis=1)
        print('running beta-binomial j on i')
        df['pval_betabin_j_on_i'] = df.apply(_betabin_j_on_i, axis=1)

    # ---- main test, always run ----
    print('running beta i on j')
    df['pval_beta_i_on_j'] = df.apply(_beta_i_on_j, axis=1)
    print('running beta j on i')
    df['pval_beta_j_on_i'] = df.apply(_beta_j_on_i, axis=1)

    return df

### Compute FDR control using Benjamini-Hochberg

def benjamini_hochberg(df, pval_column, threshold = 0.001):
    """
    Flag significant rows with the Benjamini-Hochberg step-up procedure.

    The rows are sorted by ascending p-value and ranked 1..M, where M is the
    number of rows. The critical value of rank r is ``threshold * r / M``.
    With k the largest rank whose p-value is at or below its critical value,
    every row of rank <= k is flagged as significant.

    Parameters
    ----------
    df: pandas.DataFrame
        table holding the p-values; the caller's frame is not modified
    pval_column: str
        name of the column holding the p-values
    threshold: float, optional (default: 0.001)
        target false discovery rate

    Returns
    -------
    pandas.DataFrame
        sorted copy of ``df`` with two extra columns: ``BH_critical_value``
        and ``<pval_column>_BH_sig`` (bool).
    """
    df = df.sort_values(pval_column)
    M = len(df)

    pvals = df[pval_column].to_numpy(dtype=float)
    # Ranks are 1-based: a 0-based rank gives the smallest p-value a critical
    # value of 0, so it could only be significant if it were exactly 0.
    ranks = np.arange(1, M + 1)
    critical_values = threshold * ranks / M

    # Step-up rule: locate the largest passing rank, then reject everything up
    # to it. NaN p-values sort last and never pass the comparison.
    passing = np.flatnonzero(pvals <= critical_values)
    n_rejected = passing[-1] + 1 if passing.size else 0

    df["BH_critical_value"] = critical_values
    df[pval_column+"_BH_sig"] = ranks <= n_rejected
    return df
def holms(df, pval_column, threshold = 0.001):
    """
    Flag significant rows with the Holm-Sidak step-down procedure.

    The rows are sorted by ascending p-value. With m rows, the k-th smallest
    p-value (k = 1..m) is compared against ``1 - (1 - threshold) ** (1 / (m - k + 1))``.
    Rows are flagged until the first p-value that fails its comparison; that
    row and every later one are left unflagged.

    This is computed directly with numpy, because the ``multipletests`` helper
    the function previously called is not imported anywhere in this file.

    Parameters
    ----------
    df: pandas.DataFrame
        table holding the p-values; the caller's frame is not modified
    pval_column: str
        name of the column holding the p-values
    threshold: float, optional (default: 0.001)
        family-wise error rate

    Returns
    -------
    pandas.DataFrame
        sorted copy of ``df`` with a bool column ``<pval_column>_BH_sig``.
        The ``_BH_sig`` suffix is kept so the output has the same column name
        as ``benjamini_hochberg``.
    """
    df = df.sort_values(pval_column)

    pvals = df[pval_column].to_numpy(dtype=float)
    n_tests = len(pvals)

    # Sidak-adjusted level per rank: exponent 1/m for the smallest p-value
    # down to 1/1 for the largest.
    step_levels = 1 - np.power(1.0 - threshold, 1.0 / np.arange(n_tests, 0, -1))

    # NaN p-values sort last and compare False, so they are never flagged.
    passed = pvals <= step_levels
    failures = np.flatnonzero(~passed)
    first_failure = failures[0] if failures.size else n_tests

    df[pval_column+"_BH_sig"] = np.arange(n_tests) < first_failure
    return df


def unravel_ij(df):
    """
    Make every mutation pair directional.

    A mirrored copy of ``df`` is built in which each "i" column trades names
    with its "j" counterpart. The original rows and the mirrored rows are
    stacked, so the result has twice as many rows: one for mutation A ->
    mutation B and one for mutation B -> mutation A. Only a fixed set of
    columns is kept in the returned frame, and the original index labels are
    repeated for the mirrored rows.
    """
    # Column names that trade places in the mirrored copy.
    swapped_pairs = [
        ("position_i", "position_j"),
        ("n_mutations_i", "n_mutations_j"),
        ("n_branches_with_i", "n_branches_with_j"),
        ("n_muts_i_on_branch_j", "n_muts_j_on_branch_i"),
        ("n_branch_with_i_no_j", "n_branch_with_j_no_i"),
        ("n_unmutated_branches_i", "n_unmutated_branches_j"),
        ("n_mutations_i_on_unmutated", "n_mutations_j_on_unmutated"),
        ("pval_betabin_j_on_i", "pval_betabin_i_on_j"),
        ("pval_beta_j_on_i", "pval_beta_i_on_j"),
    ]
    swap = {}
    for first, second in swapped_pairs:
        swap[first] = second
        swap[second] = first

    # Names in the mapping that are absent from the frame are simply ignored by rename.
    mirrored = deepcopy(df).rename(columns=swap)
    stacked = pd.concat([df, mirrored])

    kept_columns = [
        'position_i', 'position_j',
        'n_mutations_i', 'n_mutations_j',
        'n_branches_with_i', 'n_branches_with_j',
        'n_muts_i_on_branch_j', 'n_muts_j_on_branch_i',
        "n_muts_same_branch",
        'n_branch_with_i_no_j', 'n_branch_with_j_no_i', 'n_branch_i_and_j',
        'pval_beta_i_on_j',
    ]
    return stacked[kept_columns]

## Apply Testing

In [None]:
# Run the calculation for all the points in the dataset
# Load the per-pair mutation counts; the first CSV column becomes the index.
df = pd.read_csv("01.mutation_and_comutation_data/combined_position.csv", index_col=0)
# apply_testing writes its columns onto the frame it receives and returns that
# same frame, so `subset` and `df` refer to the same object after this line.
# With total_branches left as None it also reads all_branches.npy from disk.
subset=apply_testing(df)

# One row per direction: `results` has twice as many rows as `subset`.
results = unravel_ij(subset)
print(len(results), len(subset))
# Checkpoint the unfiltered directional table, then reload it from disk.
results.to_csv("02.comutation_pvalues/intermediate_results_unfiltered.csv")
results = pd.read_csv("02.comutation_pvalues/intermediate_results_unfiltered.csv", index_col=0)
# Same column list that unravel_ij already returns; repeated here so the cell
# also works when started from the reloaded CSV.
results = results[[
             'position_i', 'position_j', 
             'n_mutations_i', 'n_mutations_j', 
             'n_branches_with_i', 'n_branches_with_j', 
             'n_muts_i_on_branch_j', 'n_muts_j_on_branch_i',
             "n_muts_same_branch",
             'n_branch_with_i_no_j', 'n_branch_with_j_no_i', 'n_branch_i_and_j',
             'pval_beta_i_on_j'
            ]]


# Multiple-testing correction on the directional beta p-values, using
# threshold 0.01 instead of the function's default of 0.001. Adds the
# BH_critical_value and pval_beta_i_on_j_BH_sig columns.
results = benjamini_hochberg(results, "pval_beta_i_on_j", threshold=0.01)

# Final table for all directional pairs.
results.to_csv("02.comutation_pvalues/results_file_allpairs.csv")

running beta i on j
running beta j on i
