## Difference in distance between methods

To understand whether SmrtUMI and SGA produce sufficiently similar outputs, we compare the pairwise distance matrices derived from their trees. Our null hypothesis is that the pairwise distances from SGA and SmrtUMI do not differ significantly. We expect only a small difference in median distance between the two methods.

By the naming convention used in these files, column/index labels starting with 'K' are SGA sequences; all other labels are SmrtUMI sequences.

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import mannwhitneyu
import os
import matplotlib.pyplot as plt

import scipy

# Use NumPy 1.25-style printing so numeric scalars are shown as plain values
# (e.g. 3.0) instead of with their type wrapper (e.g. np.float64(3.0)).
np.set_printoptions(legacy='1.25')

In [None]:
# Path of a single distance-matrix CSV to inspect (edit to point at another file)
filepath = "./data/1HB3_d0_NT_matrix.csv"

# Load the matrix, using the first CSV column as the row labels
distance_matrix = pd.read_csv(filepath_or_buffer=filepath, index_col=0)

In [None]:
def compare_distances(filepath):
    """Compare within-method pairwise distances of SGA and SmrtUMI sequences.

    The CSV at ``filepath`` is read as a square distance matrix whose first
    column holds the row labels. Labels starting with ``'K'`` are treated as
    SGA; every other label is treated as SmrtUMI. For each method the
    distances between distinct sequences of that method (strict upper
    triangle of the method's sub-matrix) are collected, and the two
    collections are compared with a Mann-Whitney U test.

    Distances equal to zero are kept: a zero between two distinct sequences
    is a real observation and must contribute to the medians and the test.

    Parameters
    ----------
    filepath : str or path-like
        Location of the distance-matrix CSV.

    Returns
    -------
    tuple
        ``(stat, p, med_smrt, med_sga)``: the U statistic, its p-value, the
        median SmrtUMI distance and the median SGA distance.
    """
    # reading in distance matrix
    distance_matrix = pd.read_csv(filepath, index_col=0)

    # Separate SGA and SmrtUMI names
    is_sga = distance_matrix.columns.str.startswith('K')
    sga_names = distance_matrix.columns[is_sga]
    smrt_names = distance_matrix.columns[~is_sga]

    # Within-method sub-matrices -> one value per unordered pair of sequences
    sga_vals = _upper_triangle_values(distance_matrix.loc[sga_names, sga_names])
    smrt_vals = _upper_triangle_values(distance_matrix.loc[smrt_names, smrt_names])

    # performing Mann Whitney U test
    stat, p = mannwhitneyu(sga_vals, smrt_vals, method="auto")
    med_smrt = np.median(smrt_vals)
    med_sga = np.median(sga_vals)

    return (stat, p, med_smrt, med_sga)


def _upper_triangle_values(square):
    """Return the entries strictly above the diagonal of a square matrix.

    Selecting by index (rather than zeroing the lower triangle and dropping
    zeros) preserves genuine zero distances between distinct sequences.
    """
    values = np.asarray(square)
    return values[np.triu_indices_from(values, k=1)]

In [None]:
# Retrieve all CSV distance matrices from the directory containing bootstrapped
# tree distances.
# os.listdir returns entries in arbitrary order and also lists anything else that
# lives in the directory (hidden files, checkpoint folders, ...), so keep only the
# '.csv' files and sort them to make the per-file results reproducible.
matrix_dir = './09_24_panmixia_matrices/'
all_files = [
    matrix_dir + name
    for name in sorted(os.listdir(matrix_dir))
    if name.endswith('.csv')
]

In [None]:
# Mann-Whitney U p-value for every distance-matrix file, in the order of all_files
ps = []
for file in all_files:
    # compare_distances returns (statistic, p-value, SmrtUMI median, SGA median)
    stat, p, med_smrt, med_sga = compare_distances(file)
    ps += [p]

In [None]:
# Repeat the comparison on all files, using an independent two-sample t-test
# (scipy.stats.ttest_ind) instead of the Mann-Whitney U test.
# p_ttest collects one p-value per file, in the order of all_files.
p_ttest = []

for filepath in all_files:
    # reading in distance matrix
    distance_matrix = pd.read_csv(filepath, index_col=0)

    # Separate SGA ('K'-prefixed) and SmrtUMI (everything else) names
    is_sga = distance_matrix.columns.str.startswith('K')
    sga_names = distance_matrix.columns[is_sga]
    smrt_names = distance_matrix.columns[~is_sga]

    # getting the within-method sub-matrices
    sga = distance_matrix.loc[sga_names, sga_names]
    smrt = distance_matrix.loc[smrt_names, smrt_names]

    # extracting distances: index the strict upper triangle directly so that
    # genuine zero distances between distinct sequences are kept (filtering
    # out zeros after np.triu would silently drop them and bias the test)
    sga_arr = sga.to_numpy()
    sga_vals = sga_arr[np.triu_indices_from(sga_arr, k=1)]

    smrt_arr = smrt.to_numpy()
    smrt_vals = smrt_arr[np.triu_indices_from(smrt_arr, k=1)]

    med_smrt = np.median(smrt_vals)
    med_sga = np.median(sga_vals)

    print("SGA:", pd.DataFrame(sga_vals).describe())
    print('----------------')
    print("SMRT:", pd.DataFrame(smrt_vals).describe())

    # run the test once, then both report it and keep its p-value
    ttest_result = stats.ttest_ind(sga_vals, smrt_vals)
    print(ttest_result)

    stat, p = ttest_result
    p_ttest.append(p)

In [None]:
# Performing multipletest adjustments
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

# Benjamini-Hochberg FDR correction of the t-test p-values; the adjusted
# p-values are the second element of the tuple returned by multipletests
p_adj = multipletests(pvals=p_ttest, method='fdr_bh')[1]

# show the adjusted p-values that remain above 0.05
p_adj[0.05 < p_adj]

In [None]:
# Sample name = file name without its directory part and without '.csv'
sample_names = [
    path.rsplit('/', 1)[-1].replace('.csv', '')
    for path in all_files
]

# One row per sample with its multiple-testing-adjusted p-value
ttest_df = pd.DataFrame({'Sample': sample_names, 'p.adj': p_adj})

# Flag samples whose adjusted p-value exceeds each threshold
adjusted = ttest_df['p.adj']
ttest_df['intermingled(p>0.05)'] = adjusted > 0.05
ttest_df['intermingled(p>1E-4)'] = adjusted > 0.0001

In [None]:
# Display the per-sample results table as the cell output
ttest_df

In [None]:
# Save file as CSV
#ttest_df.to_csv('09_24_panmixia-ttest.csv', index=False)