In [1]:
import pandas as pd
from pandas import DataFrame, Series
#false positive warnings all the time:
pd.options.mode.chained_assignment = None

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np
import statistics

import os.path

In [2]:
# Plot styling. Reference for the rcParams keys used below:
# http://matplotlib.org/1.4.0/users/customizing.html

plt.style.use('ggplot')

# Overrides applied on top of the ggplot style, in the order listed.
# Every entry is (rcParams key, value); the values are assigned unchanged.
rc_overrides = [
    ('font.size', 18),

    ### LINES
    ('lines.linewidth', '2.0'),      # line width in points
    ('lines.marker', 'None'),
    ('lines.markersize', '6'),       # markersize, in points

    ### LEGEND
    ('legend.fontsize', 'medium'),

    ### GRIDS
    ('grid.color', 'black'),         # grid color
    ('grid.linestyle', ':'),         # dotted
    ('grid.linewidth', '0.5'),       # in points
    ('grid.alpha', '1.0'),           # transparency, between 0.0 and 1.0

    ### TICKS
    ('xtick.labelsize', "small"),    # fontsize of the tick labels
    ('xtick.direction', "out"),      # direction: in, out, or inout
    ('ytick.labelsize', "small"),    # fontsize of the tick labels
    ('ytick.direction', "out"),      # direction: in, out, or inout

    ### AXES
    ('axes.linewidth', '1.0'),       # edge linewidth
    ('axes.grid', 'True'),           # display grid or not
    ('axes.titlesize', 'large'),     # fontsize of the axes title
    ('axes.labelsize', 'large'),     # fontsize of the x and y labels
    ('axes.facecolor', 'white'),     # axes background color

    ### FIGURE
    ('figure.facecolor', '1.0'),     # figure facecolor; 0.75 is scalar gray
]

# Item assignment (rather than a bulk update) keeps the per-key assignment
# path of the original one-line-per-setting form.
for rc_key, rc_value in rc_overrides:
    mpl.rcParams[rc_key] = rc_value

# Get the results of queries with only 1 result (typically count queries)

* Preprocessing: the results directory contains gigabytes of data, so we first select only the result files that contain exactly one line.

### Tryout

In [3]:
# Try-out on a single simulation: scan every file in one results directory.
path = './query_results_analysis/SparqlResults_VirtuosoN1_64_Ontoforce_Optimized_VWall/'
sim_id = 'Vir_N1_64_Ont_Opt_VWall'

# One record per file; turned into a DataFrame once the scan is finished.
info_per_file = []

for filename in os.listdir(path):
    # Drop the last four characters of the name (presumably the file
    # extension - TODO confirm) and split the rest on "_": the first part is
    # the query hash, the second the query timestamp.
    spl = filename[:-4].split("_")
    query_hash, query_timestamp = spl[0], spl[1]

    # Stream the file line by line: remember the first line and count all
    # lines without holding the whole file in memory.
    first = None
    with open(path + filename, 'r') as infile:
        c = 0
        for c, line in enumerate(infile, 1):
            if c == 1:
                first = line

    info_per_file.append({
        'sim_id': sim_id,
        'query_hash': query_hash,
        'query_timestamp': query_timestamp,
        'num_results': c,
        'first_result': first,
    })

df = pd.DataFrame(info_per_file)



In [4]:
# Restrict the try-out frame to the files that contain exactly one line.
df = df.loc[df['num_results'] == 1]

# Per query hash: the number of files, and the number of distinct first
# lines and distinct line counts found among those files.
grouped = df.groupby(['query_hash'], as_index=False)
df_consistent = grouped.agg({
    'query_timestamp': np.size,
    'first_result': pd.Series.nunique,
    'num_results': pd.Series.nunique,
})

print(df_consistent.shape)

# A hash is inconsistent when its files disagree on the first line or on
# the line count.
differs = (df_consistent['first_result'] > 1) | (df_consistent['num_results'] > 1)
print(df_consistent[differs].shape)

(829, 4)
(0, 4)


## 1. Inter-thread consistency

In [5]:
def getQueryHashDataframe(path, sim_id):
    """Summarise every result file in a directory as one DataFrame row.

    File names are expected to have the form
    ``<query_hash>_<query_timestamp><extension>``.

    Parameters
    ----------
    path : str
        Directory holding the result files, with or without a trailing
        path separator.
    sim_id : str
        Label stored in the ``sim_id`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per regular file in ``path`` with the columns ``sim_id``,
        ``query_hash``, ``query_timestamp``, ``num_results`` (number of
        lines in the file) and ``first_result`` (the first line, line ending
        included, or ``None`` for an empty file). The columns are present
        even when the directory holds no files.

    Raises
    ------
    IndexError
        If a file name contains no ``_`` separator.
    """
    columns = ['sim_id', 'query_hash', 'query_timestamp', 'num_results', 'first_result']
    info_per_file = []

    for filename in os.listdir(path):
        # os.path.join works whether or not `path` ends with a separator.
        full_path = os.path.join(path, filename)

        # os.listdir also returns sub-directories, which cannot be opened
        # as result files.
        if not os.path.isfile(full_path):
            continue

        # Strip the extension whatever its length, then split the stem into
        # hash and timestamp.
        spl = os.path.splitext(filename)[0].split("_")
        query_hash, query_timestamp = spl[0], spl[1]

        # Stream the file: keep the first line and count the remaining ones
        # without loading the whole file in memory.
        with open(full_path, 'r') as infile:
            first = next(infile, None)
            num_results = 0 if first is None else 1 + sum(1 for _ in infile)

        info_per_file.append({
            'sim_id': sim_id,
            'query_hash': query_hash,
            'query_timestamp': query_timestamp,
            'num_results': num_results,
            'first_result': first,
        })

    # Explicit columns so that an empty directory still yields a frame that
    # can be filtered on 'num_results'.
    return pd.DataFrame(info_per_file, columns=columns)

def getInconsitenciesBetweenThreads(df):
    """Return the query hashes whose single-line result files disagree.

    Only rows with ``num_results == 1`` are considered. They are grouped by
    ``query_hash`` and reduced to the number of rows (``query_timestamp``),
    the number of distinct first lines (``first_result``) and the number of
    distinct line counts (``num_results``). The groups for which either
    distinct-count exceeds one are returned; an empty frame means that no
    disagreement was found.
    """
    single_line = df[df['num_results'] == 1]

    summary = single_line.groupby(['query_hash'], as_index=False).agg({
        'query_timestamp': np.size,
        'first_result': pd.Series.nunique,
        'num_results': pd.Series.nunique,
    })

    differing_first = summary['first_result'] > 1
    differing_count = summary['num_results'] > 1
    return summary[differing_first | differing_count]

### Query results are consistent between threads

* Print the DataFrames holding the inconsistent results (there are none).

In [6]:
# Result directories, one per simulation run.
paths = [
    './query_results_analysis/SparqlResults_VirtuosoN1_64_Ontoforce_Optimized_VWall/',
    './query_results_analysis/SparqlResults_VirtuosoN3_Ontoforce_AWS1/',
    './query_results_analysis/SparqlResults_VirtuosoN3_Ontoforce_AWS2/',
    './query_results_analysis/SparqlResults_Virtuoso_N1_32_Ontoforce_Optimized_VWall/',
    './query_results_analysis/SparqlResults_Virtuoso_N3_64_Ontoforce_Optimized_AWS3/',
]

# Simulation identifiers, in the same order as `paths`.
sim_ids = [
    'Vir_N1_64_Ont_Opt_VWall',
    'Vir_N3_64_Ont_Opt_AWS1',
    'Vir_N3_64_Ont_Opt_AWS2',
    'Vir_N1_32_Ont_Opt_VWall',
    'Vir_N3_64_Ont_Opt_AWS3',
]

# Load each simulation, keep its frame in `query_hash_dfs`, and print the
# query hashes whose files disagree (an empty frame means none were found).
query_hash_dfs = []
for p, sim in zip(paths, sim_ids):
    df = getQueryHashDataframe(p, sim)
    query_hash_dfs.append(df)
    print(getInconsitenciesBetweenThreads(df))

Empty DataFrame
Columns: [query_hash, query_timestamp, first_result, num_results]
Index: []
Empty DataFrame
Columns: [query_hash, query_timestamp, first_result, num_results]
Index: []
Empty DataFrame
Columns: [query_hash, query_timestamp, first_result, num_results]
Index: []
Empty DataFrame
Columns: [query_hash, query_timestamp, first_result, num_results]
Index: []
Empty DataFrame
Columns: [query_hash, query_timestamp, first_result, num_results]
Index: []


## 2. Results per simulation => dataframe

In [7]:
def cleanupQueryHashDF(df):
    """Reduce a query-hash frame to one cleaned row per query hash.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``sim_id``, ``query_hash``,
        ``num_results`` and ``first_result``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``sim_id``, ``query_hash`` and ``first_result``. There is one
        row for every query hash that has rows with ``num_results == 1``,
        holding the first value per column among those rows, with
        surrounding whitespace removed from ``first_result``.
    """
    single_line = df[df['num_results'] == 1]
    per_hash = single_line.groupby('query_hash', as_index=False).first()

    # Vectorised strip: unlike calling .strip() on every element, a missing
    # first_result stays missing instead of raising AttributeError.
    per_hash['first_result'] = per_hash['first_result'].str.strip()

    return per_hash[['sim_id', 'query_hash', 'first_result']]

In [18]:
# Reduce each simulation's frame to one cleaned row per query hash.
dfs = list(map(cleanupQueryHashDF, query_hash_dfs))

# Stack all simulations; each frame keeps its own row labels.
df_all = pd.concat(dfs)

# Order by query hash and save as tab-separated text without the index column.
df_hash_sorted = df_all.sort_values(by=['query_hash'])
df_hash_sorted.to_csv(
    './queryhashanalysis/CountQueryConsistency.csv',
    sep='\t',
    index=False,
)
df_hash_sorted

Unnamed: 0,sim_id,query_hash,first_result
0,Vir_N1_64_Ont_Opt_VWall,100037504,"http://purl.obolibrary.org/obo/ERO_0000229,"
0,Vir_N1_32_Ont_Opt_VWall,100037504,"http://purl.obolibrary.org/obo/ERO_0000229,"
1,Vir_N1_32_Ont_Opt_VWall,1001476692,1656^^http://www.w3.org/2001/XMLSchema#integer...
1,Vir_N1_64_Ont_Opt_VWall,1001476692,1656^^http://www.w3.org/2001/XMLSchema#integer...
2,Vir_N1_32_Ont_Opt_VWall,1003408745,"278^^http://www.w3.org/2001/XMLSchema#integer,"
0,Vir_N3_64_Ont_Opt_AWS3,1003408745,"278^^http://www.w3.org/2001/XMLSchema#integer,"
2,Vir_N1_64_Ont_Opt_VWall,1003408745,"278^^http://www.w3.org/2001/XMLSchema#integer,"
3,Vir_N1_32_Ont_Opt_VWall,1004308409,"0^^http://www.w3.org/2001/XMLSchema#integer,"
3,Vir_N1_64_Ont_Opt_VWall,1004308409,"0^^http://www.w3.org/2001/XMLSchema#integer,"
4,Vir_N1_64_Ont_Opt_VWall,100604759,"0^^http://www.w3.org/2001/XMLSchema#integer,"


Unnamed: 0,sim_id,query_hash,first_result
0,Vir_N1_64_Ont_Opt_VWall,100037504,"http://purl.obolibrary.org/obo/ERO_0000229,"
0,Vir_N1_32_Ont_Opt_VWall,100037504,"http://purl.obolibrary.org/obo/ERO_0000229,"
1,Vir_N1_32_Ont_Opt_VWall,1001476692,1656^^http://www.w3.org/2001/XMLSchema#integer...
1,Vir_N1_64_Ont_Opt_VWall,1001476692,1656^^http://www.w3.org/2001/XMLSchema#integer...
2,Vir_N1_32_Ont_Opt_VWall,1003408745,"278^^http://www.w3.org/2001/XMLSchema#integer,"
0,Vir_N3_64_Ont_Opt_AWS3,1003408745,"278^^http://www.w3.org/2001/XMLSchema#integer,"
2,Vir_N1_64_Ont_Opt_VWall,1003408745,"278^^http://www.w3.org/2001/XMLSchema#integer,"
3,Vir_N1_32_Ont_Opt_VWall,1004308409,"0^^http://www.w3.org/2001/XMLSchema#integer,"
3,Vir_N1_64_Ont_Opt_VWall,1004308409,"0^^http://www.w3.org/2001/XMLSchema#integer,"
4,Vir_N1_64_Ont_Opt_VWall,100604759,"0^^http://www.w3.org/2001/XMLSchema#integer,"


## 3. Check for inconsistencies

* Only one inconsistency was found, and it was confirmed that the query involved is not a count query.

In [None]:
# Per query hash over all simulations: the number of rows and the number of
# distinct first results.
dfconsistent = (
    df_all
    .groupby('query_hash', as_index=False)
    .agg({'sim_id': np.size, 'first_result': pd.Series.nunique})
)



In [None]:
# Query hashes that have more than one row in the combined frame.
num_shared = len(dfconsistent[dfconsistent['sim_id'] > 1])
print(str(num_shared) + " queries occur in 2 or more sims")

# Query hashes with more than one distinct first result.
dfconsistent[dfconsistent['first_result'] > 1]