In [1]:
import os
import numpy as np
import pandas as pd
import itertools
import json

In [2]:
path = './logs/'
files = np.array(os.listdir(path))

In [3]:
pval = np.array([np.genfromtxt(path + file) for file in files])
pval = np.stack(pval) # convert np array of np arrays to 2d np ndarray

M = pval >= 0.01 # pass-fail matrix (, 100000)

In [4]:
# defining constants
tests = np.array([file[:-len('.txt')] for file in files])
alpha = 0.01
ntests = M.shape[0]
nseqs = M.shape[1]
exp_coverage = np.array([1 - (1 - alpha)**k for k in range(ntests+1)])

In [5]:
F = np.zeros([ntests, ntests], dtype=float)
for i in range(ntests-1):
    for j in range(i+1, ntests):
        F[i][j] = F[j][i] = np.count_nonzero(~(M[i] | M[j])) # failed both (De Morgan's Law)
        
for i in range(ntests):
    F[i] = F[i] / (np.size(M[i]) - np.count_nonzero(M[i]))

In [6]:
# Fail-fail ratio table
FFR = pd.DataFrame(F, index = tests, columns = tests, dtype = float)
FFR.to_csv('fail_fail_ratio.csv')
FFR

Unnamed: 0,NonOverlappingTemplate84,NonOverlappingTemplate125,NonOverlappingTemplate113,NonOverlappingTemplate35,NonOverlappingTemplate127,NonOverlappingTemplate7,NonOverlappingTemplate52,OverlappingTemplate,NonOverlappingTemplate134,NonOverlappingTemplate12,...,NonOverlappingTemplate11,NonOverlappingTemplate138,NonOverlappingTemplate129,RandomExcursionsVariant11,NonOverlappingTemplate68,RandomExcursions7,NonOverlappingTemplate70,RandomExcursionsVariant8,NonOverlappingTemplate79,NonOverlappingTemplate118
NonOverlappingTemplate84,0.000000,0.013477,0.006289,0.010782,0.010782,0.010782,0.011680,0.010782,0.008086,0.014376,...,0.009883,0.014376,0.015274,0.412399,0.017071,0.408805,0.010782,0.408805,0.015274,0.015274
NonOverlappingTemplate125,0.013345,0.000000,0.007117,0.014235,0.011566,0.008007,0.014235,0.010676,0.008897,0.005338,...,0.006228,0.014235,0.008897,0.392349,0.006228,0.397687,0.010676,0.394128,0.016904,0.008007
NonOverlappingTemplate113,0.006278,0.007175,0.000000,0.007175,0.007175,0.017040,0.010762,0.016143,0.008072,0.014350,...,0.012556,0.004484,0.010762,0.391031,0.017040,0.393722,0.010762,0.392825,0.018834,0.010762
NonOverlappingTemplate35,0.010840,0.014453,0.007227,0.000000,0.009937,0.016260,0.013550,0.015357,0.011743,0.010840,...,0.015357,0.012647,0.013550,0.397471,0.008130,0.397471,0.009937,0.398374,0.013550,0.009033
NonOverlappingTemplate127,0.010870,0.011775,0.007246,0.009964,0.000000,0.012681,0.015399,0.015399,0.011775,0.009964,...,0.008152,0.013587,0.012681,0.395833,0.011775,0.390399,0.010870,0.392210,0.006341,0.009058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RandomExcursions7,0.011514,0.011312,0.011109,0.011135,0.010907,0.011869,0.011034,0.012856,0.011666,0.011995,...,0.011287,0.010527,0.010983,0.984614,0.010603,0.000000,0.011160,0.982969,0.010755,0.010806
NonOverlappingTemplate70,0.010480,0.010480,0.010480,0.009607,0.010480,0.010480,0.018341,0.013100,0.008734,0.018341,...,0.010480,0.008734,0.009607,0.383406,0.010480,0.385153,0.000000,0.384279,0.020087,0.009607
RandomExcursionsVariant8,0.011541,0.011236,0.011109,0.011186,0.010983,0.011820,0.010805,0.012809,0.011566,0.011946,...,0.011160,0.010754,0.011059,0.985314,0.010602,0.985213,0.011160,0.000000,0.010881,0.010754
NonOverlappingTemplate79,0.015413,0.017226,0.019039,0.013599,0.006346,0.009973,0.010879,0.013599,0.009066,0.014506,...,0.009066,0.004533,0.010879,0.391659,0.010879,0.385313,0.020852,0.388939,0.000000,0.013599


In [7]:
def cov(subset):
    # sequences that pass all tests
    pass_all = np.ones((nseqs), dtype=bool)
    for test in subset:
        pass_all = pass_all & M[test]

    # calculate coverage
    coverage = np.count_nonzero(~pass_all)/nseqs
    return coverage

In [8]:
rows_list = []

# considering all tests
total_coverage = cov(range(ntests))

# excluding one test at a time
for subset in itertools.combinations(range(ntests), ntests-1): # excluding one test at a time
    test_idx = sum(range(ntests)) - sum(subset)
    test = tests[test_idx]
    
    coverage = cov(subset)
    benefit = total_coverage - coverage
    
    # add row
    rows_list.append({'Test': test, 'MarginalBenefit': benefit})

In [9]:
# Marginal Benefits
MB = pd.DataFrame(rows_list, columns=['Test', 'MarginalBenefit'])
MB = MB.sort_values(by=['MarginalBenefit'], ascending=False)
MB = MB.set_index('Test')
MB.to_csv('marginal_benefits.csv')
MB

Unnamed: 0_level_0,MarginalBenefit
Test,Unnamed: 1_level_1
Serial1,0.01261
Serial2,0.00296
ApproximateEntropy,0.00128
NonOverlappingTemplate5,0.00108
OverlappingTemplate,0.00104
...,...
NonOverlappingTemplate74,0.00001
Frequency,0.00001
NonOverlappingTemplate75,0.00000
NonOverlappingTemplate1,0.00000


In [10]:
rows_list = []

# change accordingly
MAX_SUBSET_SIZE = 5

for k in range(2, MAX_SUBSET_SIZE):
    max_coverage = 0
    for subset in itertools.combinations(range(ntests), k): # subsets of k tests
        coverage = cov(subset)
        # update max_coverage and max_subset
        if coverage > max_coverage:
            max_coverage = coverage
            max_subset = subset
    
    # update max_efficiency
    max_efficiency = max_coverage/exp_coverage[k]
    
    # add row
    rows_list.append({'k': k, 'Subsuite': tests[list(max_subset)], 'Coverage': max_coverage, 'Efficiency': max_efficiency})
    print(k, "done!")

2 done!
3 done!


KeyboardInterrupt: 

In [None]:
# Most efficient subsuites
MES = pd.DataFrame(rows_list, columns=['k', 'Subsuite', 'Coverage', 'Efficiency'])
MES.to_json('most_efficienct_subsuites.json', orient='records')
MES = MES.set_index('k')
MES