# Parameters for fast search

Mouse vs fly reference genomes

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the tab-separated parameter sweep results and preview the first rows.
df = pd.read_csv("parameter_tuning.time", delimiter="\t")
df.head()

Unnamed: 0,time,mem,error-code,command,bins,k,query-seg,min-len,er,thresh,cmin,cmax,ibf-size,matches,repeats
0,101.64,27338368,0,valik-search,1024,19,30000,150,0.025,20,0,40,14G,19185,503
1,56.28,23905852,0,valik-search,1024,19,30000,150,0.025,30,0,40,14G,15074,112
2,46.06,21506668,0,valik-search,1024,19,30000,150,0.025,40,0,40,14G,13260,45
3,41.1,21896776,0,valik-search,1024,19,30000,150,0.025,50,0,40,14G,13035,22
4,37.81,21582872,0,valik-search,1024,19,30000,150,0.025,60,0,40,14G,12871,16


In [6]:
# Raw "ibf-size" values are strings such as "14G" or "3,3G": strip the unit
# suffix, switch the decimal comma to a point, and store the result as a
# numeric column. pd.to_numeric returns a new Series, so it has to be assigned
# back; otherwise the column silently stays as strings.
df["ibf-size"] = pd.to_numeric(
    df["ibf-size"].str.rstrip("G").str.replace(",", ".", regex=False)
)
# Drop the columns that are not used in the analysis below.
df = df.drop(columns=["error-code", "command", "bins", "er", "min-len"])
df.head()

Unnamed: 0,time,mem,k,query-seg,thresh,cmin,cmax,ibf-size,matches,repeats
0,101.64,27338368,19,30000,20,0,40,14,19185,503
1,56.28,23905852,19,30000,30,0,40,14,15074,112
2,46.06,21506668,19,30000,40,0,40,14,13260,45
3,41.1,21896776,19,30000,50,0,40,14,13035,22
4,37.81,21582872,19,30000,60,0,40,14,12871,16


In [7]:
# Row(s) whose match count equals the maximum over all runs.
df.loc[df["matches"].eq(df["matches"].max())]

Unnamed: 0,time,mem,k,query-seg,thresh,cmin,cmax,ibf-size,matches,repeats
276,77.89,10856688,21,20000,30,1,250,3.3,41496,138


In [16]:
# Mean match count over the runs with cmax above 200.
df.loc[df["cmax"] > 200, "matches"].mean()

30705.05

In [18]:
# Runs with thresh == 50, cmin == 0 and query-seg == 20000.
df.loc[df["thresh"].eq(50) & df["cmin"].eq(0) & df["query-seg"].eq(20000)]

Unnamed: 0,time,mem,k,query-seg,thresh,cmin,cmax,ibf-size,matches,repeats
88,41.5,21363624,19,20000,50,0,40,14,13378,21
98,42.86,21346332,19,20000,50,0,50,14,14292,43
108,47.19,21456940,19,20000,50,0,60,14,19533,51
118,47.81,21329980,19,20000,50,0,70,14,23416,82
128,64.82,21700008,19,20000,50,0,250,14,29251,101
188,40.57,21536576,21,20000,50,0,40,14,10794,10
198,42.75,21228928,21,20000,50,0,50,14,14963,30
208,42.39,21176384,21,20000,50,0,60,14,17910,68
218,44.27,21202580,21,20000,50,0,70,14,20245,69
228,61.69,21699796,21,20000,50,0,250,14,27720,82


In [6]:
def find_corr_for(corr_base, data=None):
    """Print the correlation of one column with every other column.

    Parameters
    ----------
    corr_base : str
        Name of the column that all remaining columns are correlated with.
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is looked
        up at call time, so existing ``find_corr_for("time")`` calls keep
        working unchanged.

    Prints a header line followed by one tab-indented line per remaining
    column, with the correlation rounded to three decimals. Returns None.
    """
    # Fall back to the notebook-level frame only when no frame is passed in.
    frame = df if data is None else data
    print("Correlation of " + corr_base + " with")
    base = frame[corr_base]
    # Iterate over the column labels directly instead of copying the frame.
    for col in frame.columns.drop([corr_base]):
        print("\t" + col + "\t" + str(round(base.corr(frame[col]), 3)))

In [7]:
# Correlation of the "time" column with every other column of df.
find_corr_for("time")

Correlation of time with
	mem	0.624
	k	-0.085
	query-seg	-0.192
	thresh	-0.726
	cmin	-0.304
	cmax	0.362
	ibf-size	0.304
	matches	0.531
	repeats	0.909


In [8]:
# Correlation of the "matches" column with every other column of df.
find_corr_for("matches")

Correlation of matches with
	time	0.531
	mem	0.197
	k	0.044
	query-seg	-0.016
	thresh	-0.378
	cmin	-0.112
	cmax	0.608
	ibf-size	0.112
	repeats	0.4


In [9]:
# Correlation of the "repeats" column with every other column of df.
find_corr_for("repeats")

Correlation of repeats with
	time	0.909
	mem	0.544
	k	-0.157
	query-seg	-0.01
	thresh	-0.656
	cmin	-0.212
	cmax	0.238
	ibf-size	0.211
	matches	0.4


In [10]:
# Mean run time over the runs with cmin == 0 and thresh >= 50.
# NOTE: df_all_kmers is bound again in the following cell without the
# thresh restriction.
df_all_kmers = df.loc[df["cmin"].eq(0) & df["thresh"].ge(50)]
df_all_kmers["time"].mean()

45.84767857142857

In [11]:
# Mean run time over every run with cmin == 0.
df_all_kmers = df.loc[df["cmin"].eq(0)]
df_all_kmers["time"].mean()

66.83271428571429

In [15]:
# Mean match count of the subset currently held in df_all_kmers.
df_all_kmers["matches"].mean()

22643.22142857143

In [12]:
# Mean run time over the runs with cmin == 1 and thresh < 50.
# NOTE: df_abundant_kmers is bound again in the following cell without the
# thresh restriction.
df_abundant_kmers = df.loc[df["cmin"].eq(1) & df["thresh"].lt(50)]
df_abundant_kmers["time"].mean()

59.39678571428571

In [13]:
# Mean run time over every run with cmin == 1.
df_abundant_kmers = df.loc[df["cmin"].eq(1)]
df_abundant_kmers["time"].mean()

51.03807142857143

In [14]:
# Mean match count of the subset currently held in df_abundant_kmers.
df_abundant_kmers["matches"].mean()

21107.285714285714

In [13]:
# Row(s) whose match count equals the maximum over all runs.
# NOTE(review): the stored output of this cell still lists the columns that
# the clean-up cell drops and the raw "3,3G" ibf-size, so it was produced
# before that cell ran; re-run the notebook top to bottom to refresh it.
df.loc[df["matches"].eq(df["matches"].max())]

Unnamed: 0,time,mem,error-code,command,bins,k,query-seg,min-len,er,thresh,cmin,cmax,ibf-size,matches,repeats
276,77.89,10856688,0,valik-search,1024,21,20000,150,0.025,30,1,250,"3,3G",41496,138


In [14]:
# Row(s) whose run time equals the minimum over all runs.
# NOTE(review): the stored output of this cell still lists the columns that
# the clean-up cell drops, so it was produced before that cell ran.
df.loc[df["time"].eq(df["time"].min())]

Unnamed: 0,time,mem,error-code,command,bins,k,query-seg,min-len,er,thresh,cmin,cmax,ibf-size,matches,repeats
23,31.16,10111328,0,valik-search,1024,19,30000,150,0.025,50,1,40,"3,2G",9365,7


In [44]:
# Correlation between thresh and matches within df_abundant_kmers.
# NOTE(review): df_abundant_kmers is bound twice above with different filters
# (cmin == 1 with thresh < 50, and cmin == 1 alone); the execution count does
# not show which binding produced the stored value, so confirm on a fresh run.
df_abundant_kmers["thresh"].corr(df_abundant_kmers["matches"])

0.06979869579846797

## Query segment count

In [17]:
# One sub-frame per query-seg value present in the sweep.
df_10000, df_20000, df_30000 = (
    df[df["query-seg"] == seg_value] for seg_value in (10000, 20000, 30000)
)

In [20]:
def get_means(df):
    """Print the mean of the time, matches and repeats columns of ``df``.

    One line is printed per statistic in the form ``<label>\\t<mean>``.
    Returns None.
    """
    labelled_columns = (
        ("mean runtime", "time"),
        ("mean match count", "matches"),
        ("mean repeats", "repeats"),
    )
    for label, column in labelled_columns:
        print("\t".join((label, str(np.mean(df[column])))))

In [21]:
# Mean time, matches and repeats for the query-seg == 10000 subset.
get_means(df_10000)

mean runtime	63.96189999999999
mean match count	21200.05
mean repeats	119.26


In [23]:
# Mean time, matches and repeats for the query-seg == 20000 subset.
get_means(df_20000)

mean runtime	59.9971
mean match count	23473.01
mean repeats	130.86


In [25]:
# Mean time, matches and repeats for the query-seg == 30000 subset.
get_means(df_30000)

mean runtime	51.325125
mean match count	20722.0625
mean repeats	114.3625


## Dealing with repeats

In [29]:
# Correlation between cmax and matches within the query-seg == 30000 subset.
df_30000["cmax"].corr(df_30000["matches"])

0.8168837549175402

In [16]:
# Correlation between cmax and time across all runs in df.
df["cmax"].corr(df["time"])

0.3622211807726669