## Experiment results postprocessing and filtering

For each database (db), record volume (record_volume) and query type (query), we remove the iteration with the minimum and maximum execution time (time_in_seconds). We also process some inconsistencies and include additional labels to the data.

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('results.csv')

df["old_time_in_seconds"] = df["time_in_seconds"].copy()

df

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
0,sqlite,1000,1-1,0,0.001000,0.001000
1,sqlite,1000,1-1,1,0.001000,0.001000
2,sqlite,1000,1-1,2,0.001000,0.001000
3,sqlite,1000,1-1,3,0.001000,0.001000
4,sqlite,1000,1-1,4,0.001000,0.001000
...,...,...,...,...,...,...
12880,cassandra,1024000,9-2,16,0.001820,0.001820
12881,cassandra,1024000,9-2,17,0.001939,0.001939
12882,cassandra,1024000,9-2,18,0.003948,0.003948
12883,cassandra,1024000,9-2,19,0.001888,0.001888


In [2]:
df.drop(df[(df["db"] == "cassandra") & (df["record_volume"] != 1000) & (df["query"] == "3-1")].index, inplace=True)
df[(df["db"] == "cassandra") & (df["record_volume"] != 1000)]

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
11224,cassandra,4000,10,1,0.001860,0.001860
11225,cassandra,4000,10,2,0.002380,0.002380
11226,cassandra,4000,10,3,0.002416,0.002416
11227,cassandra,4000,10,4,0.002605,0.002605
11228,cassandra,4000,10,5,0.002413,0.002413
...,...,...,...,...,...,...
12880,cassandra,1024000,9-2,16,0.001820,0.001820
12881,cassandra,1024000,9-2,17,0.001939,0.001939
12882,cassandra,1024000,9-2,18,0.003948,0.003948
12883,cassandra,1024000,9-2,19,0.001888,0.001888


In [3]:
df.loc[df["db"].isin(["mysql", "sqlite"]) & (df["query"] == "4-2"), "time_in_seconds"] = -1
df.loc[df["db"].isin(["mysql", "sqlite"]) & (df["query"] == "4-2")]

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
260,sqlite,1000,4-2,0,-1.0,-1.0
261,sqlite,1000,4-2,1,-1.0,-1.0
262,sqlite,1000,4-2,2,-1.0,-1.0
263,sqlite,1000,4-2,3,-1.0,-1.0
264,sqlite,1000,4-2,4,-1.0,-1.0
...,...,...,...,...,...,...
3915,mysql,128000,4-2,17,-1.0,-1.0
3916,mysql,128000,4-2,18,-1.0,-1.0
3917,mysql,128000,4-2,19,-1.0,-1.0
3918,mysql,128000,4-2,20,-1.0,-1.0


In [4]:
dnf = df[df["time_in_seconds"] < 0]
dnf

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
260,sqlite,1000,4-2,0,-1.0,-1.0
261,sqlite,1000,4-2,1,-1.0,-1.0
262,sqlite,1000,4-2,2,-1.0,-1.0
263,sqlite,1000,4-2,3,-1.0,-1.0
264,sqlite,1000,4-2,4,-1.0,-1.0
...,...,...,...,...,...,...
10679,mongodb,1024000,3-1,-1,-1.0,-1.0
10700,mongodb,1024000,3-3,-1,-1.0,-1.0
10721,mongodb,1024000,4-1,-1,-1.0,-1.0
10722,mongodb,1024000,4-2,-1,-1.0,-1.0


In [5]:
# df.drop(df[df["time_in_seconds"] < 0].index, inplace=True)
dropped_duplicate_dnf = df[df["time_in_seconds"] < 0].drop_duplicates(subset=["db", "record_volume", "query", "time_in_seconds"])
dropped_duplicate_dnf = dropped_duplicate_dnf.replace({"time_in_seconds": -1}, "DNF")

df = pd.concat([df.drop(dnf.index), dropped_duplicate_dnf]).sort_index()
df

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
0,sqlite,1000,1-1,0,0.001,0.001000
1,sqlite,1000,1-1,1,0.001,0.001000
2,sqlite,1000,1-1,2,0.001,0.001000
3,sqlite,1000,1-1,3,0.001,0.001000
4,sqlite,1000,1-1,4,0.001,0.001000
...,...,...,...,...,...,...
12880,cassandra,1024000,9-2,16,0.00182,0.001820
12881,cassandra,1024000,9-2,17,0.001939,0.001939
12882,cassandra,1024000,9-2,18,0.003948,0.003948
12883,cassandra,1024000,9-2,19,0.001888,0.001888


In [6]:
df[df["time_in_seconds"] == "DNF"]

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
260,sqlite,1000,4-2,0,DNF,-1.0
660,sqlite,4000,4-2,1,DNF,-1.0
960,sqlite,256000,3-1,1,DNF,-1.0
981,sqlite,256000,3-3,1,DNF,-1.0
1002,sqlite,256000,4-1,1,DNF,-1.0
...,...,...,...,...,...,...
10679,mongodb,1024000,3-1,-1,DNF,-1.0
10700,mongodb,1024000,3-3,-1,DNF,-1.0
10721,mongodb,1024000,4-1,-1,DNF,-1.0
10722,mongodb,1024000,4-2,-1,DNF,-1.0


In [7]:
gb = df.groupby(["db", "record_volume", "query"])["time_in_seconds"]
gb.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,unique,top,freq
db,record_volume,query,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arangodb,1000,1-1,20.0,20.0,0.001766,1.0
arangodb,1000,1-2,20.0,20.0,0.001571,1.0
arangodb,1000,1-3,20.0,20.0,0.00049,1.0
arangodb,1000,1-4,20.0,20.0,0.005032,1.0
arangodb,1000,10,20.0,20.0,0.030094,1.0
...,...,...,...,...,...,...
sqlite,1024000,6,20.0,20.0,28.437,1.0
sqlite,1024000,7,20.0,20.0,25.457,1.0
sqlite,1024000,8,20.0,18.0,2.34,2.0
sqlite,1024000,9-1,20.0,19.0,2.64,2.0


In [8]:
idxmin = gb.idxmin()
df.loc[idxmin]

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
6513,arangodb,1000,1-1,17,0.000414,0.000414
6529,arangodb,1000,1-2,13,0.001397,0.001397
6541,arangodb,1000,1-3,5,0.00025,0.000250
6574,arangodb,1000,1-4,18,0.00142,0.001420
6874,arangodb,1000,10,18,0.029474,0.029474
...,...,...,...,...,...,...
2060,sqlite,1024000,6,8,19.329,19.329000
2087,sqlite,1024000,7,15,21.819,21.819000
2093,sqlite,1024000,8,1,2.141,2.141000
2132,sqlite,1024000,9-1,20,2.612,2.612000


In [9]:
idxmax = gb.idxmax()
df.loc[idxmax]

Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds
6496,arangodb,1000,1-1,0,0.001766,0.001766
6531,arangodb,1000,1-2,15,0.002755,0.002755
6543,arangodb,1000,1-3,7,0.000558,0.000558
6556,arangodb,1000,1-4,0,0.005032,0.005032
6865,arangodb,1000,10,9,0.036892,0.036892
...,...,...,...,...,...,...
2053,sqlite,1024000,6,1,28.437,28.437000
2073,sqlite,1024000,7,1,25.457,25.457000
2108,sqlite,1024000,8,16,2.369,2.369000
2117,sqlite,1024000,9-1,5,3.71,3.710000


In [10]:
df["time_in_seconds"] = df["time_in_seconds"].loc[(~df.index.isin(idxmin) & ~df.index.isin(idxmax)) | (df["time_in_seconds"] == "DNF")]
df["extra_label"] = df["time_in_seconds"][df["time_in_seconds"] == "DNF"].copy()
df.replace({"time_in_seconds": "DNF"}, 300, inplace=True)
df

  df.replace({"time_in_seconds": "DNF"}, 300, inplace=True)


Unnamed: 0,db,record_volume,query,iteration,time_in_seconds,old_time_in_seconds,extra_label
0,sqlite,1000,1-1,0,,0.001000,
1,sqlite,1000,1-1,1,0.001000,0.001000,
2,sqlite,1000,1-1,2,0.001000,0.001000,
3,sqlite,1000,1-1,3,0.001000,0.001000,
4,sqlite,1000,1-1,4,0.001000,0.001000,
...,...,...,...,...,...,...,...
12880,cassandra,1024000,9-2,16,0.001820,0.001820,
12881,cassandra,1024000,9-2,17,0.001939,0.001939,
12882,cassandra,1024000,9-2,18,,0.003948,
12883,cassandra,1024000,9-2,19,0.001888,0.001888,


In [11]:
df.loc[(df["db"] == "mongodb") & (df["record_volume"] >= 256000) & (df["query"] == "3-1"), "extra_label"] = "DNF,#1"
df.loc[(df["db"] == "mongodb") & (df["record_volume"] >= 256000) & (df["query"] == "4-2"), "extra_label"] = "DNF,#2"

In [12]:
df.to_csv('results_filtered.csv', index=False)