In [1]:
import helper

In [2]:
import pandas as pd

# Automatic Partitioner Selection

In [3]:
# Columns kept from the partitioning run-time dataset.
cols = [
    'graph', 'partitioner', 'partitioning_time', 'num_partitions',
    '1dd', '1ds', '2d', '2ps', 'crvc', 'dbh', 'hdrf', 'hep1', 'hep10', 'hep100', 'ne',
    'density', 'num_vertices', 'num_edges', 'num_connected_triples',
    'pearson_moment_degrees', 'pearson_median_degrees', 'pearson_mode_degrees',
    'pearson_moment_degrees_in', 'pearson_median_degrees_in', 'pearson_mode_degrees_in',
    'pearson_moment_degrees_out', 'pearson_median_degrees_out', 'pearson_mode_degrees_out',
    'max_degree', 'max_degree_in', 'max_degree_out',
    'mean_degree', 'mean_degree_in', 'mean_degree_out',
    'pearson_moment_triangles', 'pearson_median_triangles', 'pearson_mode_triangles',
    'num_triangles', 'max_triangles', 'mean_triangles',
    'pearson_moment_lcc', 'pearson_median_lcc', 'pearson_mode_lcc',
    'max_lcc', 'average_lcc', 'global_cc',
    'num_edges_undirected_representation',
    'average_lcc_triangle_based', 'max_lcc_triangle_based',
]

# Partitioning measurements, restricted to the columns listed above.
partitioning_df = pd.read_csv("../datasets/graph-partitioning-run-time_test.csv")[cols]

# Processing measurements: identifying columns followed by the partitioning quality metrics.
processing_df = pd.read_csv("../datasets/graph-processing-run-time_test.csv")[[
    "graph", "processing_time", "partitioner", "algorithm",
    'edge_balance', 'source_balance', 'vertex_balance', 'replication_factor', 'destination_balance',
]]

# Inner join of both datasets on the (graph, partitioner) pair.
test = partitioning_df.merge(processing_df, on=["graph", "partitioner"])

In [4]:
# Partitioner names handed to the evaluation step.
all_partitioners = [
    '1dd', '1ds', '2d', '2ps', 'crvc', 'dbh',
    'hdrf', 'hep1', 'hep10', 'hep100', 'ne',
]

# Predict and evaluate one algorithm at a time, pooling every result record.
overall_results = []
for algorithm in test["algorithm"].unique():
    predicted_data = helper.glearner.predict(
        test[test["algorithm"] == algorithm],
        algorithm=algorithm,
    )
    results = helper.glearner.evaluate(
        data=predicted_data,
        partitioners=all_partitioners,
    )
    overall_results.extend(results)

# One row per evaluation record, ordered descending by algorithm, goal and selected/best.
final_evaluation_all = pd.DataFrame(overall_results).sort_values(
    by=["algorithm", "goal", "selected/best"],
    ascending=False,
)


## Comparison of Selection Strategies (Table 8a)

In [5]:
# Average the selection-quality ratios per (goal, algorithm).
# The metric columns are selected *before* calling mean() so that non-numeric
# columns of the evaluation frame are never aggregated; pandas >= 2.0 raises a
# TypeError when mean() is applied to string columns.
overview_by_algorithm = (
    final_evaluation_all
    .groupby(["goal", "algorithm"], as_index=False)[[
        "selected/best",
        "selected/mean",
        "selected/worst",
        "lowest_rep_time/best",
        "selected/lowest_rep_time",
    ]]
    .mean()
)
overview_by_algorithm = overview_by_algorithm.sort_values(by=["goal", "selected/best"])

# Replace internal identifiers with the labels used in the table.
overview_by_algorithm = overview_by_algorithm.replace({"goal": {'end2end':'E2E', 'processing': 'Pro.' } })
overview_by_algorithm = overview_by_algorithm.replace({"algorithm": {
    'sssp1':'SSSP',
    'cc':'CC',
    'pr':'PR',
    'kcoreavg':'K-Cores',
    'synthetic1c0':'Syn-Low',
    'synthetic10c0':'Syn-High',
     }
     })
overview_by_algorithm = overview_by_algorithm.round(2)

# Column headers are LaTeX math snippets; they are emitted unescaped below.
overview_by_algorithm = overview_by_algorithm.rename(columns={
    "algorithm": "Algorithm",
    "goal": "Goal",
    "selected/best":"$S_{PS}/S_B$",
    "selected/mean":"$S_{PS}/S_R$",
    "selected/worst":"$S_{PS}/S_W$",
    "selected/lowest_rep_time":"$S_{PS}/S_{LRF}$",
    "lowest_rep_time/best": "$S_{LRF}/S_{O}$",
    })

# Print the LaTeX source of the table. float_format pins two decimals so the
# rendering does not depend on the pandas version's default float precision.
print(
    overview_by_algorithm[
        ["Goal", "Algorithm", "$S_{PS}/S_B$", "$S_{PS}/S_{LRF}$", "$S_{PS}/S_R$", "$S_{PS}/S_W$", "$S_{LRF}/S_{O}$"]
    ].to_latex(index=False, escape=False, float_format="%.2f")
)
overview_by_algorithm


\begin{tabular}{llrrrrr}
\toprule
 Goal & Algorithm &  $S_{PS}/S_B$ &  $S_{PS}/S_{LRF}$ &  $S_{PS}/S_R$ &  $S_{PS}/S_W$ &  $S_{LRF}/S_{O}$ \\
\midrule
  E2E &      SSSP &          1.02 &              0.69 &          0.84 &          0.67 &             1.52 \\
  E2E &        CC &          1.03 &              0.58 &          0.76 &          0.57 &             1.84 \\
  E2E &        PR &          1.10 &              0.96 &          0.96 &          0.79 &             1.19 \\
  E2E &   K-Cores &          1.11 &              0.76 &          0.91 &          0.73 &             1.52 \\
  E2E &  Syn-High &          1.13 &              0.98 &          0.92 &          0.73 &             1.19 \\
  E2E &   Syn-Low &          1.17 &              0.99 &          0.95 &          0.76 &             1.21 \\
 Pro. &      SSSP &          1.06 &              0.93 &          0.94 &          0.80 &             1.17 \\
 Pro. &        CC &          1.07 &              0.92 &          0.91 &          0.74 &      

Unnamed: 0,Goal,Algorithm,$S_{PS}/S_B$,$S_{PS}/S_R$,$S_{PS}/S_W$,$S_{LRF}/S_{O}$,$S_{PS}/S_{LRF}$
3,E2E,SSSP,1.02,0.84,0.67,1.52,0.69
0,E2E,CC,1.03,0.76,0.57,1.84,0.58
2,E2E,PR,1.1,0.96,0.79,1.19,0.96
1,E2E,K-Cores,1.11,0.91,0.73,1.52,0.76
4,E2E,Syn-High,1.13,0.92,0.73,1.19,0.98
5,E2E,Syn-Low,1.17,0.95,0.76,1.21,0.99
9,Pro.,SSSP,1.06,0.94,0.8,1.17,0.93
6,Pro.,CC,1.07,0.91,0.74,1.21,0.92
8,Pro.,PR,1.11,0.96,0.79,1.16,0.99
7,Pro.,K-Cores,1.13,0.97,0.8,1.23,0.94


## Enrichment (Table 8b)

In [6]:
# Partitioner names handed to both evaluation runs.
all_partitioners = [
    '1dd', '1ds', '2d', '2ps', 'crvc', 'dbh',
    'hdrf', 'hep1', 'hep10', 'hep100', 'ne',
]
# Subset of partitioner names; not referenced anywhere else in this cell.
spark_partitioner = ['1ds', '2d', 'crvc']

overall_results_no_enrichment = []
overall_results_enrichment = []

# For every algorithm: predict with both learners first, then evaluate both,
# and pool the resulting records separately for each learner.
for algorithm in test["algorithm"].unique():
    predicted_data_no_enrichment = helper.glearner_no_enrichment.predict(
        test[test["algorithm"] == algorithm],
        algorithm=algorithm,
    )
    predicted_data_enrichment = helper.glearner_enrichment.predict(
        test[test["algorithm"] == algorithm],
        algorithm=algorithm,
    )

    results_no_enrichment = helper.glearner_no_enrichment.evaluate(
        data=predicted_data_no_enrichment,
        partitioners=all_partitioners,
    )
    results_enrichment = helper.glearner_enrichment.evaluate(
        data=predicted_data_enrichment,
        partitioners=all_partitioners,
    )

    overall_results_no_enrichment.extend(results_no_enrichment)
    overall_results_enrichment.extend(results_enrichment)

# One frame per learner, ordered descending by algorithm, goal and selected/best.
final_evaluation_all_no_enrichment = pd.DataFrame(overall_results_no_enrichment).sort_values(
    by=["algorithm", "goal", "selected/best"],
    ascending=False,
)
final_evaluation_all_enrichment = pd.DataFrame(overall_results_enrichment).sort_values(
    by=["algorithm", "goal", "selected/best"],
    ascending=False,
)


In [7]:
# Graph identifier (a storage path) used to restrict results to the enwiki-2021 graph.
wiki_webgraphs = [
    "/home/ubuntu/cephstorage/graphs/ml-processing-real-world/enwiki-2021",
]
# Every distinct graph present in the enrichment evaluation results.
all_graphs = final_evaluation_all_enrichment["graph"].unique()

In [8]:
# Mean selection-quality ratios per goal for the no-enrichment learner,
# restricted to the graphs listed in `wiki_webgraphs`.
# Metric columns are selected before mean() so that non-numeric columns are
# never aggregated (pandas >= 2.0 raises a TypeError on string columns).
(
    final_evaluation_all_no_enrichment[final_evaluation_all_no_enrichment.graph.isin(wiki_webgraphs)]
    .groupby(["goal"], as_index=False)[[
        "selected/best",
        "selected/mean",
        "selected/worst",
    ]]
    .mean()
)

Unnamed: 0,goal,selected/best,selected/mean,selected/worst
0,end2end,1.134438,0.940241,0.777155
1,processing,1.108375,0.91063,0.734759


In [9]:
# Mean selection-quality ratios per goal for the enrichment learner,
# restricted to the graphs listed in `wiki_webgraphs`.
# Metric columns are selected before mean() so that non-numeric columns are
# never aggregated (pandas >= 2.0 raises a TypeError on string columns).
(
    final_evaluation_all_enrichment[final_evaluation_all_enrichment.graph.isin(wiki_webgraphs)]
    .groupby(["goal"], as_index=False)[[
        "selected/best",
        "selected/mean",
        "selected/worst",
    ]]
    .mean()
)

Unnamed: 0,goal,selected/best,selected/mean,selected/worst
0,end2end,1.073898,0.89449,0.742288
1,processing,1.063402,0.876932,0.70979


In [10]:
# Mean selection-quality ratios per goal for the no-enrichment learner,
# restricted to the graphs contained in `all_graphs`.
# Metric columns are selected before mean() so that non-numeric columns are
# never aggregated (pandas >= 2.0 raises a TypeError on string columns).
(
    final_evaluation_all_no_enrichment[final_evaluation_all_no_enrichment.graph.isin(all_graphs)]
    .groupby(["goal"], as_index=False)[[
        "selected/best",
        "selected/mean",
        "selected/worst",
    ]]
    .mean()
)

Unnamed: 0,goal,selected/best,selected/mean,selected/worst
0,end2end,1.098238,0.892433,0.711068
1,processing,1.136633,0.95688,0.778421


In [11]:
# Mean selection-quality ratios per goal for the enrichment learner,
# restricted to the graphs contained in `all_graphs`.
# Metric columns are selected before mean() so that non-numeric columns are
# never aggregated (pandas >= 2.0 raises a TypeError on string columns).
(
    final_evaluation_all_enrichment[final_evaluation_all_enrichment.graph.isin(all_graphs)]
    .groupby(["goal"], as_index=False)[[
        "selected/best",
        "selected/mean",
        "selected/worst",
    ]]
    .mean()
)

Unnamed: 0,goal,selected/best,selected/mean,selected/worst
0,end2end,1.118633,0.908561,0.724282
1,processing,1.165756,0.982449,0.800527


## Feature Importance (Table 7)

In [12]:
# Feature-importance report for `helper.glearner_importance`, produced by the project helper.
# NOTE(review): the helper appears to print a LaTeX table itself — confirm in helper.py.
helper.get_importance(helper.glearner_importance)

\begin{tabular}{lrrrrr}
\toprule
       feature &  replication\_factor &  vertex\_balance &  destination\_balance &  source\_balance &  edge\_balance \\
\midrule
   Partitioner &               0.299 &           0.268 &                0.245 &           0.542 &         0.244 \\
   Mean Degree &               0.274 &           0.065 &                0.059 &           0.058 &         0.036 \\
   \#Partitions &               0.256 &           0.271 &                0.293 &           0.177 &         0.472 \\
 Degree Distr. &               0.165 &           0.368 &                0.372 &           0.214 &         0.214 \\
       Density &               0.007 &           0.028 &                0.031 &           0.009 &         0.034 \\
\bottomrule
\end{tabular}

