In [227]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [228]:
def make_timedelta(df):
    """Convert the numeric ``time`` column of ``df`` to timedeltas, in place.

    The original numeric values (interpreted as seconds) are kept in a new
    ``seconds`` column, and ``time`` is replaced by the corresponding
    ``timedelta64`` values.

    The frame is modified in place and also returned, so the function can be
    used either as a statement or as an expression.

    Calling the function again on an already converted frame (e.g. when a
    notebook cell is re-run) is safe: ``seconds`` is recomputed from the
    timedeltas instead of being overwritten with timedelta values.
    """
    if pd.api.types.is_timedelta64_dtype(df['time']):
        # Already converted: keep `time` and restore the numeric seconds.
        df['seconds'] = df['time'].dt.total_seconds()
        return df

    df['seconds'] = df['time']
    df['time'] = pd.to_timedelta(df['time'], unit='s')
    return df

def get_highest_accuracy(df):
    """Return, for every error rate, the run with the most matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"error-rate"`` and ``"matches"``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (non-missing) error rate, ordered by ascending
        error rate, with the same columns and dtypes as ``df`` and a fresh
        ``0..n-1`` index. On ties the first such run in ``df`` is returned.
    """
    best_positions = []
    for er in np.unique(df["error-rate"].dropna()):
        in_group = (df["error-rate"] == er).to_numpy()
        group_max = df.loc[in_group, "matches"].max()
        # The maximum must be looked up *within* the error-rate group; a run
        # from another error rate may have the same number of matches.
        is_best = in_group & (df["matches"] == group_max).to_numpy()
        candidates = np.flatnonzero(is_best)
        if candidates.size == 0:
            # No comparable match count in this group (e.g. all missing).
            continue
        best_positions.append(candidates[0])

    # Positional selection keeps the column dtypes of the input frame.
    return df.iloc[best_positions].reset_index(drop=True)

# Mouse vs fly reference genomes

min_len = 150

## Stellar

In [305]:
# Load the Stellar timing table and convert its `time` column to timedeltas
# (make_timedelta modifies the frame in place and returns it).
stellar_time = make_timedelta(pd.read_csv("stellar_mouse.time", sep="\t"))
stellar_time

Unnamed: 0,time,mem,error-code,command,min-len,error-rate,repeat-period,repeat-length,seconds
0,0 days 02:08:42,10005920,0,stellar-search,150,0.0267,1,1000,7722.00
1,0 days 00:17:04.210000,11743712,0,stellar-search,150,0.0330,4,25,1024.21
2,0 days 01:33:19.200000,11760372,0,stellar-search,150,0.0267,4,50,5599.20
3,0 days 15:31:19.830000,11901144,0,stellar-search,150,0.0400,2,100,55879.83
4,0 days 15:40:35.630000,10417456,0,stellar-search,150,0.0400,1,50,56435.63
...,...,...,...,...,...,...,...,...,...
68,0 days 15:35:34.640000,11913552,0,stellar-search,150,0.0400,4,1000,56134.64
69,0 days 04:37:33.210000,11822848,0,stellar-search,150,0.0330,2,50,16653.21
70,0 days 12:04:42.180000,11909232,0,stellar-search,150,0.0400,2,50,43482.18
71,0 days 00:01:41.250000,11518604,11,stellar-search,150,0.0130,4,100,101.25


In [306]:
# Match counts per Stellar parameter combination (tab-separated).
stellar_matches = pd.read_csv(
    "stellar_match_count.tsv",
    sep="\t",
)

In [307]:
# Attach the match count to every timed run; runs without a match-count entry
# are kept (left join) and get a missing `matches` value.
stellar_join_keys = ["error-rate", "repeat-period", "repeat-length", "min-len"]
stellar_mouse = stellar_time.merge(stellar_matches, how="left", on=stellar_join_keys)

In [308]:
# Runs that exited with a non-zero error code but still have a positive match
# count are removed from the table.
is_failed_run = (stellar_mouse["error-code"] != 0) & (stellar_mouse["matches"] > 0)
failed_runs = stellar_mouse.index[is_failed_run]
stellar_mouse.drop(index=failed_runs, inplace=True)

In [309]:
# Runs that report success (error code 0) but have exactly zero matches are
# removed from the table as well.
is_wrong_error_code = (stellar_mouse["error-code"] == 0) & (stellar_mouse["matches"] == 0)
wrong_error_code = stellar_mouse.index[is_wrong_error_code]
stellar_mouse.drop(index=wrong_error_code, inplace=True)

In [234]:
# Keep only the runs with error code 0 and collect the distinct error rates
# (sorted ascending by np.unique) that remain.
succeeded = stellar_mouse["error-code"].eq(0)
stellar_mouse = stellar_mouse[succeeded]
error_rates = np.unique(stellar_mouse["error-rate"])
stellar_mouse.head()

Unnamed: 0,time,mem,error-code,command,min-len,error-rate,repeat-period,repeat-length,seconds,matches
0,0 days 02:08:42,10005920,0,stellar-search,150,0.0267,1,1000,7722.0,103208
1,0 days 00:17:04.210000,11743712,0,stellar-search,150,0.033,4,25,1024.21,91899
2,0 days 01:33:19.200000,11760372,0,stellar-search,150,0.0267,4,50,5599.2,88661
4,0 days 15:40:35.630000,10417456,0,stellar-search,150,0.04,1,50,56435.63,160137
5,0 days 00:15:07.040000,11697420,0,stellar-search,150,0.0267,4,25,907.04,65347


In [235]:
# For every error rate, print the correlation of the match count with the two
# repeat parameters and with the run time.
for er in error_rates:
    runs = stellar_mouse.loc[stellar_mouse["error-rate"] == er]
    match_counts = runs["matches"]
    corr_period = runs["repeat-period"].corr(match_counts)
    corr_length = runs["repeat-length"].corr(match_counts)
    corr_time = runs["time"].corr(match_counts)
    print(er)
    print("Corr repeat-period\t" + str(corr_period))
    print("Corr repeat-length\t" + str(corr_length))
    print("Corr time\t" + str(corr_time))

0.0267
Corr repeat-period	-0.5749652444761867
Corr repeat-length	0.2869991179202832
Corr time	0.6783328354499112
0.033
Corr repeat-period	-0.5188064722046153
Corr repeat-length	0.26764405714678297
Corr time	0.7731811961995414
0.04
Corr repeat-period	-0.4415139176838639
Corr repeat-length	0.23558846650590795
Corr time	0.6670797876432318


In [236]:
# Runs with repeat-period 1 and repeat-length 1000
# (presumably the default repeat settings, going by the variable name).
uses_default_repeats = stellar_mouse["repeat-period"].eq(1) & stellar_mouse["repeat-length"].eq(1000)
stellar_default = stellar_mouse[uses_default_repeats]
stellar_default

Unnamed: 0,time,mem,error-code,command,min-len,error-rate,repeat-period,repeat-length,seconds,matches
0,0 days 02:08:42,10005920,0,stellar-search,150,0.0267,1,1000,7722.0,103208
6,0 days 06:07:46.030000,10307376,0,stellar-search,150,0.033,1,1000,22066.03,119991
43,0 days 15:30:48.120000,10419560,0,stellar-search,150,0.04,1,1000,55848.12,160137


In [237]:
# Show the runs whose error rate is below 0.04.
stellar_mouse.loc[stellar_mouse["error-rate"].lt(0.04)]

Unnamed: 0,time,mem,error-code,command,min-len,error-rate,repeat-period,repeat-length,seconds,matches
0,0 days 02:08:42,10005920,0,stellar-search,150,0.0267,1,1000,7722.0,103208
1,0 days 00:17:04.210000,11743712,0,stellar-search,150,0.033,4,25,1024.21,91899
2,0 days 01:33:19.200000,11760372,0,stellar-search,150,0.0267,4,50,5599.2,88661
5,0 days 00:15:07.040000,11697420,0,stellar-search,150,0.0267,4,25,907.04,65347
6,0 days 06:07:46.030000,10307376,0,stellar-search,150,0.033,1,1000,22066.03,119991
8,0 days 02:07:01.320000,11734356,0,stellar-search,150,0.0267,2,1000,7621.32,103208
10,0 days 06:07:39.110000,11836940,0,stellar-search,150,0.033,2,100,22059.11,119991
11,0 days 01:33:22.640000,11764224,0,stellar-search,150,0.0267,4,50,5602.64,88661
12,0 days 06:07:32.750000,10295500,0,stellar-search,150,0.033,1,100,22052.75,119991
13,0 days 02:09:45.440000,11791284,0,stellar-search,150,0.0267,4,100,7785.44,103207


In [238]:
# Reference table: match count of the `stellar_default` runs per error rate,
# with the count column renamed to `stellar-matches` for later joins.
stellar_true_matches = (
    stellar_default[["error-rate", "matches"]]
    .rename(columns={"matches": "stellar-matches"})
)
stellar_true_matches

Unnamed: 0,error-rate,stellar-matches
0,0.0267,103208
6,0.033,119991
43,0.04,160137


## Blast

In [239]:
# Error rates for which a run remained in `stellar_mouse` after filtering.
print(error_rates)

[0.0267 0.033  0.04  ]


In [240]:
# For each error rate, pick the largest k in 51..7 that satisfies
#     min_len - k + 1 - errors * k > 2
# where `errors` is the error rate scaled to `min_len` and rounded to an
# integer. Error rates for which no k qualifies contribute no entry.
min_len = 150
error_list = [0.0067, 0.013, 0.02, 0.0267, 0.033, 0.04, 0.0467, 0.053]

kmer_list = []
for er in error_list:
    errors = int(round(er * min_len))
    largest_k = next(
        (k for k in range(51, 6, -1) if (min_len - k + 1 - errors * k) > 2),
        None,
    )
    if largest_k is not None:
        kmer_list.append(largest_k)

error_df = pd.DataFrame(data={'k': kmer_list, 'error-rate': error_list})
print(error_df)

    k  error-rate
0  51      0.0067
1  49      0.0130
2  37      0.0200
3  29      0.0267
4  24      0.0330
5  21      0.0400
6  18      0.0467
7  16      0.0530


In [241]:
# Load the BLAST timing table and keep only the `blast-search` rows.
# The whole preparation is one chain that ends in a fresh frame, so the
# column updates below never write into a boolean-filtered slice (which
# raises pandas' SettingWithCopyWarning).
blast_time = (
    pd.read_csv("blast_mouse.time", sep="\t")
    .loc[lambda runs: runs["command"] == "blast-search"]
    # Match counts are joined in from the separate match-count table later.
    .drop(columns="matches")
    .astype({"threads": int, "k": int})
)
# Adds the numeric `seconds` column and converts `time` to timedeltas in place.
make_timedelta(blast_time)
blast_time.head()

Unnamed: 0,time,mem,error-code,command,threads,evalue,k,seconds
1,0 days 00:01:05.580000,1032700,0,blast-search,8,10.0,49,65.58
2,0 days 00:02:56.420000,1761172,0,blast-search,8,0.1,21,176.42
3,0 days 00:01:20.390000,1126004,0,blast-search,8,0.01,37,80.39
4,0 days 00:04:49.190000,2274744,0,blast-search,8,10.0,18,289.19
5,0 days 00:10:43.640000,2055048,0,blast-search,8,0.01,16,643.64


In [242]:
# Match counts per BLAST parameter combination (tab-separated).
blast_matches = pd.read_csv(
    "blast_mouse_match_count.tsv",
    sep="\t",
)
blast_matches.head()

Unnamed: 0,evalue,k,matches
0,10.0,16,280930
1,0.1,51,690
2,0.01,21,12096
3,1.0,21,43121
4,1.0,28,13536


In [243]:
# Enrich every timed BLAST run with (1) its match count, (2) the error rate
# associated with its k, and (3) the Stellar reference match count for that
# error rate. All joins are left joins, so no run is dropped.
blast_mouse = (
    blast_time
    .merge(blast_matches, how="left", on=["evalue", "k"])
    .merge(error_df, how="left", on=["k"])
    .merge(stellar_true_matches, how="left", on=["error-rate"])
)
# Error rates without a Stellar reference get 0 instead of a missing value.
stellar_reference = blast_mouse['stellar-matches'].fillna(0)
blast_mouse['stellar-matches'] = stellar_reference.astype(int)
blast_mouse.head()

Unnamed: 0,time,mem,error-code,command,threads,evalue,k,seconds,matches,error-rate,stellar-matches
0,0 days 00:01:05.580000,1032700,0,blast-search,8,10.0,49,65.58,809,0.013,0
1,0 days 00:02:56.420000,1761172,0,blast-search,8,0.1,21,176.42,17607,0.04,160137
2,0 days 00:01:20.390000,1126004,0,blast-search,8,0.01,37,80.39,1779,0.02,0
3,0 days 00:04:49.190000,2274744,0,blast-search,8,10.0,18,289.19,185203,0.0467,0
4,0 days 00:10:43.640000,2055048,0,blast-search,8,0.01,16,643.64,27491,0.053,0


In [244]:
# Sanity check: True when no run with a non-zero error code has a positive match count.
not ((blast_mouse["error-code"] != 0) & (blast_mouse["matches"] > 0)).any()

True

In [245]:
# Sanity check: True when no run with error code 0 has exactly zero matches.
not ((blast_mouse["error-code"] == 0) & (blast_mouse["matches"] == 0)).any()

True

In [246]:
# Show the BLAST runs that used an e-value of 10.
blast_mouse.loc[blast_mouse["evalue"].eq(10)]

Unnamed: 0,time,mem,error-code,command,threads,evalue,k,seconds,matches,error-rate,stellar-matches
0,0 days 00:01:05.580000,1032700,0,blast-search,8,10.0,49,65.58,809,0.013,0
3,0 days 00:04:49.190000,2274744,0,blast-search,8,10.0,18,289.19,185203,0.0467,0
7,0 days 00:01:20.070000,1101496,0,blast-search,8,10.0,37,80.07,1779,0.02,0
10,0 days 00:11:06.390000,2360136,0,blast-search,8,10.0,16,666.39,280930,0.053,0
15,0 days 00:02:23.540000,2112456,0,blast-search,8,10.0,24,143.54,44886,0.033,119991
16,0 days 00:03:03.470000,2185612,0,blast-search,8,10.0,21,183.47,101422,0.04,160137
25,0 days 00:01:43.830000,1421192,0,blast-search,8,10.0,29,103.83,14515,0.0267,103208
29,0 days 00:00:59.500000,1027924,0,blast-search,8,10.0,51,59.5,690,0.0067,0
32,0 days 00:01:15.670000,1128268,0,blast-search,16,10.0,49,75.67,809,0.013,0
34,0 days 00:03:34.620000,3231520,0,blast-search,16,10.0,18,214.62,185203,0.0467,0


In [247]:
# Display the combined BLAST table (timing, match counts, error rate, Stellar reference).
blast_mouse

Unnamed: 0,time,mem,error-code,command,threads,evalue,k,seconds,matches,error-rate,stellar-matches
0,0 days 00:01:05.580000,1032700,0,blast-search,8,10.00,49,65.58,809,0.0130,0
1,0 days 00:02:56.420000,1761172,0,blast-search,8,0.10,21,176.42,17607,0.0400,160137
2,0 days 00:01:20.390000,1126004,0,blast-search,8,0.01,37,80.39,1779,0.0200,0
3,0 days 00:04:49.190000,2274744,0,blast-search,8,10.00,18,289.19,185203,0.0467,0
4,0 days 00:10:43.640000,2055048,0,blast-search,8,0.01,16,643.64,27491,0.0530,0
...,...,...,...,...,...,...,...,...,...,...,...
59,0 days 00:02:39.610000,2854188,0,blast-search,16,1.00,21,159.61,43121,0.0400,160137
60,0 days 00:00:56.590000,1092388,0,blast-search,16,1.00,51,56.59,690,0.0067,0
61,0 days 00:00:55.340000,1081980,0,blast-search,16,10.00,51,55.34,690,0.0067,0
62,0 days 00:03:12.790000,3029936,0,blast-search,16,1.00,18,192.79,72101,0.0467,0


## DREAM-Stellar

In [291]:
# Timing table from `search_valik_max_er.time`, tagged with the `entropy`
# repeat mask.
dream_entropy_max_er_time = (
    pd.read_csv("error_rates/entropy/search_valik_max_er.time", sep="\t")
    .assign(**{"repeat-mask": "entropy"})
)

In [292]:
# Timing table for the entropy runs; its own `matches` column is dropped here.
dream_entropy_time = (
    pd.read_csv("error_rates/entropy/search_valik.time", sep="\t")
    .drop(columns="matches")
)

In [293]:
# Check for repeated parameter combinations: drop the measured columns and
# compare the row count before and after removing duplicate rows.
dream_same = dream_entropy_time.drop(["time", "mem", "repeats"], axis = 1)
print(len(dream_same))
# drop_duplicates() returns a new frame; measure that result rather than the
# unchanged `dream_same`, otherwise both numbers are always identical.
print(len(dream_same.drop_duplicates()))

86
86


In [294]:
# Match counts for the entropy runs, restricted to repeat-period 1 and
# repeat-length 1000.
dream_entropy_matches = pd.read_csv("error_rates/entropy/valik_mouse_match_count.tsv", sep="\t")
keep_rows = dream_entropy_matches["repeat-period"].eq(1) & dream_entropy_matches["repeat-length"].eq(1000)
dream_entropy_matches = dream_entropy_matches[keep_rows]

In [295]:
# Attach match counts to the entropy timing runs (left join on the search
# parameters) and tag every row with the `entropy` repeat mask.
dream_join_keys = ["bins", "cmax", "error-rate", "bin-entropy-cutoff", "max-carts"]
dream_entropy_mouse = (
    dream_entropy_time
    .merge(dream_entropy_matches, how="left", on=dream_join_keys)
    .assign(**{"repeat-mask": "entropy"})
)

In [296]:
# Timing table for the pattern-count runs, tagged with its repeat mask.
dream_pattern_count_mouse = (
    pd.read_csv("error_rates/pattern_count/search_valik.time", sep="\t")
    .assign(**{"repeat-mask": "pattern-count"})
)

In [298]:
# Stack the three DREAM-Stellar tables, drop the columns that are not used
# further, and replace missing `repeats` / `matches` values by 0 so both
# columns can be stored as integers.
unused_columns = ["fpr", "minimiser", "repeat-flag", "repeat-period", "repeat-length", "cmin"]
dream_mouse = (
    pd.concat([dream_entropy_mouse, dream_pattern_count_mouse, dream_entropy_max_er_time])
    .drop(columns=unused_columns)
    .fillna({"repeats": 0, "matches": 0})
    .astype({"repeats": int, "matches": int})
)
# Adds the numeric `seconds` column and converts `time` to timedeltas in place.
make_timedelta(dream_mouse)
dream_mouse.head()

Unnamed: 0,time,mem,error-code,command,bins,max-er,min-len,threads,cmax,error-rate,bin-entropy-cutoff,cart-max-cap,max-carts,repeats,matches,repeat-mask,kmer-size,seconds
0,0 days 00:02:08.300000,17004320,0,valik-search,1024,0.053,150,32,254,0.0267,0.75,20000,1024,236,72657,entropy,,128.3
1,0 days 00:00:50.710000,13476048,0,valik-search,1024,0.053,150,32,254,0.013,0.75,20000,1024,97,30809,entropy,,50.71
2,0 days 00:02:51.960000,16882712,0,valik-search,1024,0.053,150,32,254,0.033,0.75,20000,1024,236,87220,entropy,,171.96
3,0 days 00:00:43.770000,13307064,0,valik-search,1024,0.053,150,32,254,0.0067,0.75,20000,1024,97,21851,entropy,,43.77
4,0 days 00:07:45.850000,318329224,6,valik-search,1024,0.053,150,32,254,0.053,0.75,20000,1024,0,0,entropy,,465.85


In [299]:
# Show the runs at error rate 0.053 that ended with a non-zero error code.
dream_mouse.loc[dream_mouse["error-rate"].eq(0.053) & dream_mouse["error-code"].ne(0)]

Unnamed: 0,time,mem,error-code,command,bins,max-er,min-len,threads,cmax,error-rate,bin-entropy-cutoff,cart-max-cap,max-carts,repeats,matches,repeat-mask,kmer-size,seconds
4,0 days 00:07:45.850000,318329224,6,valik-search,1024,0.053,150,32,254,0.053,0.75,20000,1024,0,0,entropy,,465.85
9,0 days 00:40:31.760000,273042760,6,valik-search,2048,0.053,150,32,254,0.053,0.25,20000,1024,0,0,entropy,,2431.76
12,0 days 00:08:21.510000,236233252,6,valik-search,1024,0.053,150,32,254,0.053,0.1,20000,4096,0,38307,entropy,,501.51
19,0 days 00:17:50.990000,233642624,6,valik-search,1024,0.053,150,32,150,0.053,0.1,20000,4096,0,38316,entropy,,1070.99
74,0 days 00:10:21.630000,236061552,6,valik-search,1024,0.053,150,32,254,0.053,0.1,20000,2048,0,0,entropy,,621.63
83,0 days 00:54:11.540000,319431016,6,valik-search,1024,0.053,150,32,254,0.053,0.75,20000,1024,0,0,entropy,,3251.54
7,0 days 00:06:44.940000,220438820,6,valik-search,1024,0.053,150,64,254,0.053,,20000,1024,0,0,pattern-count,16.0,404.94
13,0 days 00:52:34.510000,216128944,6,valik-search,2048,0.053,150,32,254,0.053,,20000,2048,0,0,pattern-count,15.0,3154.51
0,0 days 00:08:54.040000,317342152,6,valik-search,1024,0.053,150,32,254,0.053,0.75,20000,2048,0,0,entropy,,534.04


In [300]:
# Add the Stellar reference match count for each run's error rate; error
# rates without a reference get 0 instead of a missing value.
dream_mouse = dream_mouse.merge(stellar_true_matches, how="left", on=["error-rate"])
stellar_reference = dream_mouse['stellar-matches'].fillna(0)
dream_mouse["stellar-matches"] = stellar_reference.astype(int)

In [311]:
# Mean match count of the runs with error rate 0.02, 2048 bins and 32 threads.
selected_runs = (
    (dream_mouse["error-rate"] == 0.02)
    & (dream_mouse["bins"] == 2048)
    & (dream_mouse["threads"] == 32)
)
np.mean(dream_mouse.loc[selected_runs, "matches"])

26114.75

In [325]:
# One row per error rate, chosen by get_highest_accuracy from the match counts.
get_highest_accuracy(dream_mouse)

Unnamed: 0,time,mem,error-code,command,bins,max-er,min-len,threads,cmax,error-rate,bin-entropy-cutoff,cart-max-cap,max-carts,repeats,matches,repeat-mask,kmer-size,seconds,stellar-matches
0,0 days 00:00:31.180000,11811916,0,valik-search,1024,0.0067,150,32,254,0.0067,0.75,20000,2048,45,22927,entropy,,31.18,0
1,0 days 00:00:35.650000,11869628,0,valik-search,1024,0.013,150,32,254,0.013,0.75,20000,2048,45,32001,entropy,,35.65,0
2,0 days 00:00:52.140000,12687384,0,valik-search,1024,0.02,150,32,254,0.02,0.75,20000,2048,54,53067,entropy,,52.14,0
3,0 days 00:01:25.360000,13771276,0,valik-search,1024,0.0267,150,32,254,0.0267,0.75,20000,2048,76,74376,entropy,,85.36,103208
4,0 days 00:01:36.760000,13682624,0,valik-search,1024,0.033,150,32,254,0.033,0.75,20000,2048,76,88682,entropy,,96.76,119991
5,0 days 00:02:48.900000,15561524,0,valik-search,1024,0.04,150,32,254,0.04,0.75,20000,2048,139,122960,entropy,,168.9,160137
6,0 days 01:21:27.780000,318939592,0,valik-search,1024,0.0467,150,32,254,0.0467,0.75,20000,2048,12151,144958,entropy,,4887.78,0
7,0 days 00:28:21.250000,185189236,0,valik-search,4096,0.053,150,32,150,0.053,0.1,20000,4096,5791,48505,entropy,,1701.25,0


# Human vs mouse reference genomes

## Blast

In [None]:
# Load the BLAST timing table for the human data set and keep only the
# `blast-search` rows that finished with error code 0.
# The integer conversion is part of the same chain, so no column is assigned
# on a boolean-filtered slice (which raises pandas' SettingWithCopyWarning).
blast_human = (
    pd.read_csv("blast_human.time", sep="\t")
    .loc[lambda runs: (runs["error-code"] == 0) & (runs["command"] == "blast-search")]
    .astype({"k": int, "matches": int})
)
blast_human.head()

In [None]:
# Sanity check for the human data set: no run with a non-zero error code may
# report matches. This cell sits in the human section, so it checks
# `blast_human` rather than `blast_mouse`.
len(blast_human[(blast_human["error-code"] != 0) & (blast_human["matches"] > 0)]) == 0

In [None]:
# Sanity check for the human data set: no run with error code 0 may have
# exactly zero matches. This cell sits in the human section, so it checks
# `blast_human` rather than `blast_mouse`.
len(blast_human[(blast_human["error-code"] == 0) & (blast_human["matches"] == 0)]) == 0

## DREAM-Stellar

In [None]:
# Timing table for the human data set, restricted to runs with error code 0.
dream_time = pd.read_csv("dream_human.time", sep="\t")
dream_time = dream_time[dream_time["error-code"].eq(0)]
dream_time.head()

In [None]:
# Inspect the error rates present in the remaining runs.
dream_time["error-rate"]

In [None]:
# Load the timing table from `dream_e0.013_human.time` and show matches per
# unit of `time` for every run.
# (`time` is the raw column from the file here; presumably seconds, as in the
# other timing tables; confirm.)
# The former mid-cell `.head()` call was removed: only the last expression of
# a cell is displayed, so its result was discarded.
dream_low_err_time = pd.read_csv("dream_e0.013_human.time", sep = '\t')
dream_low_err_time["matches"] / dream_low_err_time["time"]

In [None]:
# Display the full timing table loaded from `dream_e0.013_human.time`.
dream_low_err_time