In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy.core.defchararray as npchar
import pickle

In [2]:
# set initial properties for data location
res = 40000
data_location = "matrices/"
true_location = "trueTADs/"
armatus_location = "./armatus"
out_location = "yielded/"
noise_values = [4, 8, 12, 16, 20]
sim_values = list(range(1, 6))
TAD_callers = ("lava_modularity", "lava_armatus")

In [3]:
# set range of gammas for methods
armatus_gamma = [i / 2 for i in range(11)]
modularity_gamma = list(range(101))

In [4]:
# custom functions
vector_str = np.vectorize(str)

def TAD_bins(arr1, arr2):
    vector_str = np.vectorize(str)
    return npchar.add(vector_str(arr1), npchar.add(",", vector_str(arr2)))

def TAD_boundaries(arr1, arr2):
    return np.unique(np.append(arr1, arr2))

In [5]:
# load true TADs coordinates
true_TADs = pd.DataFrame(index=noise_values, columns=sim_values)
for noise in noise_values:
    for sim in sim_values:
        print(f"\rnoise={noise} sim={sim}", end="")
        true_TADs.loc[noise, sim] = np.loadtxt(f"{true_location}simHiC_TADintervals_coords_noise{noise}_sim{sim}.txt")
print("\nfinished")

noise=20 sim=5
finished


In [6]:
# combine true TADs coordinates into strings
true_TADs_bins = pd.DataFrame(index=noise_values, columns=sim_values)
for noise in noise_values:
    for sim in sim_values:
        true_TADs_bins.loc[noise, sim] = npchar.add(vector_str(true_TADs.loc[noise, sim][:, 0]), npchar.add(",", vector_str(true_TADs.loc[noise, sim][:, 1])))

In [7]:
# make unique true TADs boundaries
true_TADs_boundaries = pd.DataFrame(index=noise_values, columns=sim_values)
for noise in noise_values:
    for sim in sim_values:
        true_TADs_boundaries.loc[noise, sim] = np.unique(np.append(true_TADs.loc[noise, sim][:, 0], true_TADs.loc[noise, sim][:, 1]))

In [8]:
#TAD_stats = dict.fromkeys(("TPR_bins", "TPR_boundaries", "FDR_bins", "FDR_boundaries"))
stats = ("TPR_bins", "TPR_boundaries", "FDR_bins", "FDR_boundaries")
method_ranges = {"armatus":armatus_gamma, "lava_modularity":modularity_gamma, "lava_armatus":armatus_gamma}
TAD_stats = {stat:{method:{gamma:pd.DataFrame(index=noise_values, columns=sim_values) for gamma in method_ranges[method]} for method in TAD_callers} for stat in stats}

In [21]:
# count TPR and FDR for TADs
for method in TAD_callers:
    for noise in noise_values:
        for sim in sim_values:
            for gamma in method_ranges[method]:
                print(f"\r method={method} noise={noise} sim={sim} gamma={gamma}", end="")
                TADs = np.loadtxt(f"{out_location}{method}_noise{noise}_sim{sim}_gamma{gamma}.txt", ndmin=2)
                bins = TAD_bins(TADs[:, 0], TADs[:, 1])
                boundaries = TAD_boundaries(TADs[:, 0], TADs[:, 1])
                TAD_stats["TPR_bins"][method][gamma].loc[noise, sim] = sum(np.isin(bins, true_TADs_bins.loc[noise, sim])) / bins.shape[0]
                TAD_stats["FDR_bins"][method][gamma].loc[noise, sim] = sum(~np.isin(bins, true_TADs_bins.loc[noise, sim])) / bins.shape[0]
                TAD_stats["TPR_boundaries"][method][gamma].loc[noise, sim] = sum(np.isin(boundaries, true_TADs_boundaries.loc[noise, sim])) / boundaries.shape[0]
                TAD_stats["FDR_boundaries"][method][gamma].loc[noise, sim] = sum(~np.isin(boundaries, true_TADs_boundaries.loc[noise, sim])) / boundaries.shape[0]

 method=lava_armatus noise=20 sim=5 gamma=5.0100

In [24]:
# Initialize dataframe for plotting lava armatus stats
lava_armatus_stats = pd.DataFrame(index=range(len(noise_values) * len(sim_values) * len(armatus_gamma)), columns=list(stats) + list(("noise", "sim", "gamma")))

In [31]:
# Fill lava_armatus_stats with values
# Lazy assignment, make from list of lists
item = 0   
for noise in noise_values:
    for sim in sim_values:
        for gamma in method_ranges["lava_armatus"]:
            print(f"\ritem={item} noise={noise} sim={sim} gamma={gamma}", end="")
            lava_armatus_stats.loc[item, "TPR_bins"] = TAD_stats["TPR_bins"]["lava_armatus"][gamma].loc[noise, sim]
            lava_armatus_stats.loc[item, "FDR_bins"] = TAD_stats["FDR_bins"]["lava_armatus"][gamma].loc[noise, sim]
            lava_armatus_stats.loc[item, "TPR_boundaries"] = TAD_stats["TPR_boundaries"]["lava_armatus"][gamma].loc[noise, sim]
            lava_armatus_stats.loc[item, "FDR_boundaries"] = TAD_stats["FDR_boundaries"]["lava_armatus"][gamma].loc[noise, sim]
            lava_armatus_stats.loc[item, "noise":"gamma"] = [noise, sim, gamma]
            item += 1

item=274 noise=20 sim=5 gamma=5.0

In [34]:
zero_gamma = lava_armatus_stats["gamma"] == 0
lava_armatus_stats.loc[zero_gamma]

Unnamed: 0,TPR_bins,TPR_boundaries,FDR_bins,FDR_boundaries,noise,sim,gamma
0,0.601695,0.725738,0.398305,0.274262,4,1,0.0
11,0.609649,0.737991,0.390351,0.262009,4,2,0.0
22,0.627273,0.769231,0.372727,0.230769,4,3,0.0
33,0.663636,0.773756,0.336364,0.226244,4,4,0.0
44,0.663551,0.795349,0.336449,0.204651,4,5,0.0
55,0.568465,0.698347,0.431535,0.301653,8,1,0.0
66,0.679426,0.809524,0.320574,0.190476,8,2,0.0
77,0.584746,0.71308,0.415254,0.28692,8,3,0.0
88,0.635193,0.735043,0.364807,0.264957,8,4,0.0
99,0.606061,0.732759,0.393939,0.267241,8,5,0.0
