In [1]:
# Standard-library helpers first; numpy and pandas are imported further down
import sys
import os
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import networkx as nx

import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import cPickle as pickle
import uuid

warnings.filterwarnings('ignore')

# matplotlib for plotting (seaborn is imported above); render figures inline
import matplotlib.pyplot as plt
%matplotlib inline

# Absolute locations of the project checkout on the author's machine.
# ROOT keeps its trailing slash so code that concatenates onto it keeps working.
ROOT = '/Users/bede01/Documents/work/phd-papers/ml-workload-optimization/'
# Derived from ROOT instead of repeating the literal; the resulting string is unchanged.
ROOT_PACKAGE_DIRECTORY = os.path.join(ROOT, 'code', 'collaborative-optimizer')
# os.path.join avoids the doubled separator ('...//data') that ROOT + '/data' produced.
root_data = os.path.join(ROOT, 'data')
import hashlib

# Make the experiment_graph package importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if ROOT_PACKAGE_DIRECTORY not in sys.path:
    sys.path.append(ROOT_PACKAGE_DIRECTORY)
from experiment_graph.execution_environment import ExecutionEnvironment
from experiment_graph.executor import CollaborativeExecutor
from experiment_graph.workload import Workload
from experiment_graph.data_storage import StorageManagerFactory
from experiment_graph.graph.node import *
from experiment_graph.optimizations.Reuse import LinearTimeReuse
from experiment_graph.optimizations.Reuse import HelixReuse

import random

# Location below root_data; not referenced anywhere in the visible part of this notebook.
DATABASE_PATH = root_data + '/experiment_graphs/home-credit-default-risk/materialized-no-groupby'
# Not referenced anywhere in the visible part of this notebook.
N_ESTIMATOR = 100
# NOTE(review): these two imports repeat the ones a few lines earlier in this cell.
from experiment_graph.optimizations.Reuse import HelixReuse
from experiment_graph.optimizations.Reuse import LinearTimeReuse
# NOTE(review): run_end_to_end passes reuse_type='linear' as a literal and does not read this variable.
reuse_type=HelixReuse.NAME

def plot_graph(graph, figsize=(20,10),prog='dot', vertex_labels=True, edge_labels = True):
    """Draw ``graph`` on a new matplotlib figure using a Graphviz layout.

    :param graph: networkx graph to draw; node ids are sliced with ``[:10]`` for labels
    :param figsize: figure size handed to ``plt.figure``
    :param prog: Graphviz layout program handed to ``graphviz_layout``
    :param vertex_labels: when truthy, label every node with its shortened id and
        the values of its attribute dict
    :param edge_labels: when truthy, label every edge with its attribute dict
    """
    from networkx.drawing.nx_agraph import graphviz_layout

    figure = plt.figure(figsize=figsize)
    axes = figure.add_subplot(1, 1, 1)
    layout = graphviz_layout(graph, prog=prog, args='')

    # Nodes are drawn as unlabeled squares; labels are added separately below.
    nx.draw_networkx(
        graph,
        pos=layout,
        ax=axes,
        cmap=plt.get_cmap('gist_rainbow'),
        node_shape='s',
        with_labels=False)

    if vertex_labels:
        node_text = {}
        for n_id, attrs in graph.nodes(data=True):
            node_text[n_id] = '{},{}'.format(n_id[:10], attrs.values())
        nx.draw_networkx_labels(graph, pos=layout, labels=node_text)

    if edge_labels:
        edge_text = {}
        for source, target, attrs in graph.edges(data=True):
            edge_text[(source, target)] = attrs
        nx.draw_networkx_edge_labels(graph, pos=layout, edge_labels=edge_text)

In [2]:
# Base directory; joined with graph_loc when the pickle is opened in the next cell.
DEFAULT_ROOT = '/Users/bede01/Documents/work/phd-papers/ml-workload-optimization/data/experiment_graphs/kaggle_home_credit'
# Path, relative to DEFAULT_ROOT, of the graph file that the next cell unpickles.
graph_loc = 'introduction_to_manual_feature_engineering/sa_16/graph'

In [3]:
import cPickle as pickle

# Unpickle the stored graph from DEFAULT_ROOT/graph_loc.
# pickle can execute arbitrary code on load, so only point this at trusted files.
graph_path = '/'.join([DEFAULT_ROOT, graph_loc])
with open(graph_path, 'rb') as graph_file:
    graph = pickle.load(graph_file)

In [37]:
# Parameters of the synthetic workload generator.
NUMBER_OF_WORKLOADS = 100              # iterations performed by run_end_to_end
WORKLOAD_SIZE_DIST = range(500, 5000)  # candidate expansion-step counts per workload
MAT_CHANCE = 0.05                      # threshold compared against a uniform(0, 1) draw for 'mat'
COMPUTE_COST_DIST = range(1, 100)      # candidate 'compute_cost' values
LOAD_COST_DIST = range(1, 40)          # candidate 'load_cost' values

# Per-node in/out degree of the loaded graph; extend_graph samples its
# fan-out and fan-in from these two columns.
in_degree = pd.DataFrame(list(graph.in_degree()), columns=['node', 'in_degree'])
out_degree = pd.DataFrame(list(graph.out_degree()), columns=['node', 'out_degree'])
GRAPH_STATS = pd.merge(in_degree, out_degree, on='node')


class Mock_object:
    """Placeholder stored under the 'data' key of generated graph nodes.

    Carries a single ``computed`` flag, which starts out as False.
    """

    def __init__(self):
        self.computed = False
    
random.seed(42)
def extend_graph(n, ee, leaves):
    """Grow the workload DAG of ``ee`` by ``n`` random expansion steps.

    Every step removes a randomly chosen entry from ``leaves`` and attaches a
    number of new child nodes to it, the count being sampled from the
    'out_degree' column of GRAPH_STATS (a sampled 0 is bumped to 1). It then
    samples a value from the 'in_degree' column; when that value is greater
    than 1 and no larger than the current number of leaves, that many random
    leaves are removed and connected to one new join node.

    :param n: number of expansion steps to perform
    :param ee: object whose ``workload_dag.graph`` receives the new nodes and edges
    :param leaves: list of node ids eligible for expansion; modified in place
    """

    def add_mock_node():
        # Draw order matters for reproducibility under the seeded RNG:
        # compute cost, then load cost, then (only if needed) the 'mat' draw.
        compute_cost = random.choice(COMPUTE_COST_DIST)
        load_cost = random.choice(LOAD_COST_DIST)
        # Short-circuit: the uniform draw only happens when loading is cheaper than computing.
        mat = load_cost < compute_cost and random.uniform(0,1) < MAT_CHANCE
        new_id = uuid.uuid4().hex.upper()[0:16]
        ee.workload_dag.graph.add_node(
            new_id,
            type='Dataset',
            mat=mat,
            compute_cost=compute_cost,
            load_cost=load_cost,
            data=Mock_object())
        return new_id

    for _step in range(n):
        parent = leaves.pop(random.randint(0, len(leaves) - 1))

        fan_out = random.choice(GRAPH_STATS['out_degree'])
        if fan_out == 0:
            # every expanded leaf gets at least one child
            fan_out += 1

        for _child in range(fan_out):
            child_id = add_mock_node()
            ee.workload_dag.graph.add_edge(parent, child_id)
            leaves.append(child_id)

        fan_in = random.choice(GRAPH_STATS['in_degree'])
        if not (1 < fan_in <= len(leaves)):
            continue

        # The join node is created before its inputs are drawn from the leaves.
        join_id = add_mock_node()
        for _input in range(fan_in):
            joined_leaf = leaves.pop(random.randint(0, len(leaves) - 1))
            ee.workload_dag.graph.add_edge(joined_leaf, join_id)
        leaves.append(join_id)


In [44]:
def run_end_to_end(result_file):
    """Time LinearTimeReuse against HelixReuse on randomly generated workloads.

    Performs NUMBER_OF_WORKLOADS iterations. Each one adds a 'ROOT' node to the
    workload DAG, grows the DAG with extend_graph, picks a random leaf as the
    target vertex, and times both reuse algorithms on it. After the last
    iteration one CSV row per workload is appended to ``result_file``:
    experiment id, workload number, workload size, generation seconds,
    linear-reuse seconds, helix-reuse seconds.

    :param result_file: path of the file the CSV rows are appended to
    :return: the ExecutionEnvironment used for all iterations
    :raises AssertionError: if the first value returned by the two algorithms differs
    """
    storage_manager = StorageManagerFactory.get_storage('dedup')
    execution_environment = ExecutionEnvironment(storage_manager, reuse_type='linear')
    # NOTE(review): root_node is never read below; kept in case constructing the
    # Dataset has side effects on the environment -- confirm before removing.
    root_node = Dataset('ROOT', execution_environment)

    experiment_id = uuid.uuid4().hex.upper()[0:8]
    times = []
    for n in range(1, NUMBER_OF_WORKLOADS+1):
        start_generation = datetime.now()

        execution_environment.workload_dag.add_node('ROOT', **{'root': True, 'mat': True, 'type': 'Dataset', 'compute_cost':2, 'load_cost':1, 'data':Mock_object()})
        workload_size = random.choice(WORKLOAD_SIZE_DIST)
        leaves = ['ROOT']
        extend_graph(workload_size, execution_environment, leaves)
        vertex = random.choice(leaves)
        workload = execution_environment.workload_dag
        history = execution_environment.experiment_graph
        total_generation = (datetime.now() - start_generation).total_seconds()
        # The history update runs outside both timed sections.
        execution_environment.mock_update_history()

        start_linear = datetime.now()
        linear_mat ,_,_ = LinearTimeReuse().run(vertex=vertex,workload=workload,history=history, verbose=0)
        total_linear = (datetime.now() - start_linear).total_seconds()

        start_helix = datetime.now()
        helix_mat ,_,_ = HelixReuse().run(vertex=vertex,workload=workload,history=history, verbose=0)
        total_helix = (datetime.now() - start_helix).total_seconds()

        times.append((experiment_id, n, workload_size, total_generation, total_linear, total_helix))
        execution_environment.new_workload()
        # Rows are only written after the loop, so a failing assertion here
        # discards the timings collected so far.
        assert linear_mat == helix_mat
        # print() with one argument produces the same output as the Python 2
        # print statement and matches the print(...) call used elsewhere in this notebook.
        print(linear_mat)

    with open(result_file, 'a') as the_file:
        the_file.write('\n'.join('{},{},{},{},{},{}'.format(*row) for row in times))
        the_file.write('\n')

    return execution_environment
    
        

In [45]:
# errno is needed by the EEXIST check below; without this import the
# race-condition guard fails with a NameError instead of handling the error.
import errno

# Results go into a file named after the current date and minute: <date>/<HH-MM>.csv
location = datetime.now().strftime("%Y-%m-%d/%H-%M")
result_file = '/Users/bede01/Documents/work/phd-papers/ml-workload-optimization/experiment_results/local/reuse_benchmarking/kaggle_home_credit/{}.csv'.format(location)
# Number of times the experiment driver cell calls run_end_to_end.
REP = 1

result_dir = os.path.dirname(result_file)
if not os.path.exists(result_dir):
    try:
        os.makedirs(result_dir)
    except OSError as exc:  # Guard against race condition
        # Another process may have created the directory between the check and
        # makedirs; any other failure is re-raised.
        if exc.errno != errno.EEXIST:
            raise
            

In [46]:
# Run the benchmark REP times and report the wall-clock duration of all repetitions.
start_experiment = datetime.now()
for rep_index in range(REP):
    # ee keeps the environment returned by the most recent repetition.
    ee = run_end_to_end(result_file=result_file)
end_experiment = datetime.now()
elapsed_seconds = (end_experiment - start_experiment).total_seconds()
print('total experiment time: {}'.format(elapsed_seconds))

set(['26DA8FA76D844EB6', '1B1F01B0F2194D71', 'F0D3E01D9EB5419F', '4107A16DC8D2400A'])
set(['07895120361B4929', 'ROOT', '05BEF1F54DB34E7E'])
set(['C5E19A2D0806401A'])
set(['A7DEBDA8C97C4398'])
set(['05B4319FCFF6430F', 'ROOT'])
set(['5194E58095E34E21'])
set(['F3CB45F14B8A42D8', 'E8F3A2D79B2A4383'])
set(['926BA78380AF4DDC', '75C89D60E77A45F8'])
set(['594989AEFB6649EA'])
set(['A298583ACD694D7B', '25896F592BAC4B7D', '24B9B3FD253E4F3F'])
set(['DB971FED2D004F43'])
set(['00A553FCF8334C29', '6A4C62CA178A4E05', '4EEE9D2C6FDB44B2', '467890B78596440D', '6AB7669FAAC04146', '24CBA953DFE44930', '7787323A8B33443D', '9D548289BCC44334', 'D09C5C4B19884C0B'])
set(['24F1B035D8534E63', 'B81F8B676F4746F4', '2D20E52766684047'])
set(['3606B2B0BB514D43', '842047DFD6F847FF', '884861385740476F', 'C5D632DAC0ED41FA', 'DCEB2CAC5FC94428', '713F1BB4841E47BF', '9BAE1AD2C2344571', 'AFA997FB10954385'])
set(['FFAC6816D7C84D8E', 'ROOT'])
set(['1AD9F98DC2A94558', '9756662E602E415F', '688A4E05B70D4B43'])
set(['1447AF6B2D754D

In [None]:
# Plot the experiment graph of the environment returned by the last run_end_to_end call;
# an empty list is passed for labels_for_edges.
ee.experiment_graph.plot_graph(plt, labels_for_edges=[])

In [None]:
execu