In [None]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [37]:
# results_path = "/Users/wvw/git/n3/sparql2n3/SPARQL-to-N3/gmark_50_new/mix"
results_path = "/Users/wvw/git/n3/sparql2n3/SPARQL-to-N3/SPIN-to-N3/property-paths/zika"

## Load results

In [38]:
def load_sparql_times(system, data_size):
    sparql_times = f"{results_path}/results/{system}/{data_size}/times_{system}.csv"
    df_sparql = pd.read_csv(sparql_times)
    df_sparql['total_time'] = df_sparql['load_time'] + df_sparql['exec_time']
    
    return df_sparql


def load_sparql_times_agg(system, data_size):
    df_sparql = load_sparql_times(system, data_size)
    df_sparql_agg = df_sparql.groupby('query')[['load_time', 'exec_time', 'total_time']].mean().reset_index()
    df_sparql_agg['nr'] = df_sparql_agg['query'].str.slice(len("query-"), -len(".sparql"))
    
    return df_sparql_agg

In [39]:
def load_n3_times(type, data_size):
    n3_times = f"{results_path}/n3/results/{data_size}/times_n3.csv"
    df_n3 = pd.read_csv(n3_times)
    
    df_n3 = df_n3[df_n3['type']==type]
    
    df_n3['id'] = df_n3.apply(lambda x: f"{x['query']},{x['type']}", axis=1)
    df_n3_failed = df_n3[df_n3['reas_time']==-1]['id']
    df_n3_filt = df_n3[~df_n3['id'].isin(df_n3_failed)]
    
    df_n3_total = df_n3_filt[df_n3_filt['phase'].isna() | (df_n3_filt['phase']=='total')]
    df_n3_total['total_time'] = df_n3_total['netw_time'] + df_n3_total['reas_time']
    
    return df_n3_total


def load_n3_times_agg(type, data_size):
    df_n3_total = load_n3_times(type, data_size)    
    df_n3_agg = df_n3_total.groupby('query')[['netw_time', 'reas_time', 'total_time']].mean().reset_index()
    df_n3_agg['nr'] = df_n3_agg['query'].str.slice(len("query-"), -len(".n3"))

    return df_n3_agg

## Check outliers

In [None]:
def check_query_stdev(df, time_col, limit=10):
    for query in df['query'].unique():
        df_query = df[df['query']==query]
        stdev = df_query[time_col].describe()['std']
        if stdev > limit:
            print(query, stdev)
            print(df_query)
            print()

In [None]:
check_query_stdev(load_sparql_times('jena', 50), time_col='exec_time', limit=50)
check_query_stdev(load_sparql_times('jena', 100),time_col='exec_time',  limit=50)

## - jena, data_size 100:
# outliers (exec time): query-11 (100637 vs. avg. 153), query-2 (309138 vs. avg. 39551)
# (replaced with averages)

In [None]:
check_query_stdev(load_sparql_times('rdflib', 50), time_col='exec_time', limit=50)

In [None]:
check_query_stdev(load_sparql_times('rdf4j', 50), time_col='exec_time', limit=100)
check_query_stdev(load_sparql_times('rdf4j', 100), time_col='exec_time', limit=100)

In [None]:
check_query_stdev(load_n3_times('fwd', 50), time_col='reas_time', limit=50)
check_query_stdev(load_n3_times('fwd', 100), time_col='reas_time', limit=50)

## Check statistics

### Per query

In [50]:
load_sparql_times_agg('jena', '1_000_pt2')

Unnamed: 0,query,load_time,exec_time,total_time,nr
0,pp1.sparql,220.0,106.0,326.0,
1,pp2-2.sparql,212.0,86.0,298.0,
2,pp2.sparql,228.0,119.0,347.0,
3,pp3.sparql,218.0,146.0,364.0,
4,pp4-2.sparql,214.0,84.0,298.0,
5,pp4.sparql,218.0,117.0,335.0,


In [51]:
load_n3_times_agg('fwd', '1_000_pt2')

Unnamed: 0,query,netw_time,reas_time,total_time,nr
0,pp1.n3,3682.0,69.8,3751.8,
1,pp2-2.n3,3667.6,58.4,3726.0,
2,pp2.n3,3651.0,480.4,4131.4,
3,pp3.n3,3643.6,39516.8,43160.4,
4,pp4-2.n3,3654.6,92.2,3746.8,
5,pp4.n3,3639.2,570.8,4210.0,


In [52]:
load_n3_times_agg('bwd', '1_000_pt2')

Unnamed: 0,query,netw_time,reas_time,total_time,nr
0,pp1.n3,3692.6,50.6,3743.2,
1,pp2-2.n3,3666.6,15.2,3681.8,
2,pp2.n3,3663.6,311.2,3974.8,
3,pp3.n3,3644.8,346.8,3991.6,
4,pp4-2.n3,3649.8,22.2,3672.0,
5,pp4.n3,3642.0,505.6,4147.6,


In [53]:
load_n3_times_agg('direct', '1_000_pt2')

Unnamed: 0,query,netw_time,reas_time,total_time,nr
0,pp1.n3,3649.8,40.0,3689.8,
1,pp2-2.n3,3674.4,2.4,3676.8,
2,pp2.n3,3624.0,618.0,4242.0,
3,pp3.n3,3654.8,372.0,4026.8,
4,pp4-2.n3,3637.4,2.0,3639.4,
5,pp4.n3,3650.4,4774.0,8424.4,


### Overall

In [None]:
# desc_path = "/Users/wvw/git/n3/sparql2n3/SPARQL-to-N3/SPIN-to-N3/property-paths/results/gmark"
desc_path = "/Users/wvw/git/n3/sparql2n3/SPARQL-to-N3/SPIN-to-N3/property-paths/results/zika"

def describe_sparql(system, data_size, to_file=True):
    df = load_sparql_times_agg(system, data_size).describe()
    if not to_file:
        print(system, data_size, "\n", df)
    else:
        df.to_csv(os.path.join(desc_path, f"{system}-{data_size}.csv"))
    
def describe_n3(type, data_size, to_file=True):
    df = load_n3_times_agg(type, data_size).describe()
    if not to_file:
        print("n3", type, data_size, "\n", df)
    else:
        df.to_csv(os.path.join(desc_path, f"n3_{type}-{data_size}.csv"))

In [42]:
describe_sparql('jena', "1_000_pt2")
describe_sparql('jena', "10_000_pt2")
describe_sparql('jena', "100_000_pt2")

In [43]:
describe_n3('direct', "1_000_pt2")
describe_n3('bwd', "1_000_pt2")
describe_n3('fwd', "1_000_pt2")

In [None]:
describe_sparql('rdflib', 50)
describe_sparql('jena', 50)
describe_sparql('jena', 100)
describe_sparql('rdf4j', 50)
describe_sparql('rdf4j', 100)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/wvw/git/n3/sparql2n3/SPARQL-to-N3/SPIN-to-N3/property-paths/zika/results/jena/50/times_jena.csv'

In [None]:
describe_n3('direct', 50)
describe_n3('bwd', 50)
describe_n3('fwd', 50)
describe_n3('fwd', 100)

In [None]:
import matplotlib.pyplot as plt

def plot_systems(data_size, lim=None, incl_only=['n3_dir', 'n3_bwd', 'n3_fwd','jena', 'rdflib', 'rdf4j']):
    fig, ax = plt.subplots()
    if lim is not None:
        ax.set_ylim(0, lim)

    if 'n3_fwd' in incl_only:
        load_n3_times_agg('fwd', data_size).plot(x='nr', y='reas_time', ax=ax, label='fwd')
    if 'n3_bwd' in incl_only:
        load_n3_times_agg('bwd', data_size).plot(x='nr', y='reas_time', ax=ax, label='bwd')
    if 'n3_dir' in incl_only:
        load_n3_times_agg('direct', data_size).plot(x='nr', y='reas_time', ax=ax, label='direct')

    if 'jena' in incl_only:
        load_sparql_times_agg('jena', data_size).plot(x='nr', y='exec_time', ax=ax, label='jena')
    if 'rdflib' in incl_only:
        load_sparql_times_agg('rdflib', data_size).plot(x='nr', y='exec_time', ax=ax, label='rdflib')
    if 'rdf4j' in incl_only:
        load_sparql_times_agg('rdf4j', data_size).plot(x='nr', y='exec_time', ax=ax, label='rdf4j')

In [None]:
plot_systems(50, lim=4000)

In [None]:
plot_systems(50, lim=500, incl_only=['n3_fwd', 'jena', 'rdf4j'])

In [None]:
plot_systems(100, lim=1000, incl_only=['n3_fwd', 'jena', 'rdf4j'])