In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to the project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.dont_write_bytecode = True
import json
import time
from datetime import datetime
import single_node_profiles_cpp as snp
import profiler_new
import numpy as np
from optimizer_new import BruteForceOptimizer, GreedyOptimizer
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
# Load the single-node profiles; later cells index the result by model name.
profs = snp.load_single_node_profiles()

In [4]:
# Show which model names have a single-node profile loaded.
profs.keys()

[u'tf-resnet-feats',
 u'tf-log-reg',
 u'tf-kernel-svm',
 u'res50',
 u'inception',
 u'res152',
 u'alexnet']

In [5]:
# Logical pipeline definition registered under the name "pipeline_one".
dag = profiler_new.get_logical_pipeline("pipeline_one")
# Load one recorded end-to-end run. The path is relative, so it is resolved
# against the kernel's current working directory.
with open(os.path.abspath("../results/e2e_profs_new_metrics/incep_1-logreg_1-ksvm_1-resnet_1-180312_012713.json")) as f:
    sample_run = json.load(f)
print(dag.reference_node)
# Scale factors and node configurations are both derived from the recorded run;
# presumably the scale factors are relative to the DAG's reference node — TODO confirm.
scale_factors = profiler_new.get_node_scale_factors(sample_run, dag.reference_node)
node_configs = profiler_new.get_node_configs_from_experiment(sample_run)
def which_stage(model_name):
    """Return the stage label for a model name.

    "tf-kernel-svm" and "tf-log-reg" map to "latency_stage"; every other
    model name maps to "thru_stage".
    """
    latency_models = ("tf-kernel-svm", "tf-log-reg")
    if model_name in latency_models:
        return "latency_stage"
    return "thru_stage"
# One NodeProfile per node found in the recorded run, each built from that
# model's single-node profile and tagged with the stage from which_stage().
node_profs = {name : profiler_new.NodeProfile(name, profs[name], which_stage(name)) for name, _ in node_configs.items()}

inception


In [6]:
# Build a NodeProfile for the inception model on its own and display it.
inception = profiler_new.NodeProfile("inception", profs["inception"], "thru_stage")
inception

<profiler_new.NodeProfile at 0x111d2fbd0>

In [27]:
from optimizer_new import BruteForceOptimizer, GreedyOptimizer

# Greedy search over the pipeline defined by `dag`, using the scale factors and
# per-node profiles prepared in the earlier cells.
opt = GreedyOptimizer(dag, scale_factors, node_profs)
cloud = "aws"

# Every node starts from the same minimal configuration (1 CPU, batch size 1,
# 1 replica); only the GPU type differs per node.
initial_gpu_types = {
    "tf-resnet-feats": "v100",
    "inception": "v100",
    "tf-log-reg": "none",
    "tf-kernel-svm": "none",
}
initial_config = {
    name: profiler_new.NodeConfig(name=name,
                                  num_cpus=1,
                                  gpu_type=gpu_type,
                                  batch_size=1,
                                  num_replicas=1,
                                  cloud=cloud)
    for name, gpu_type in initial_gpu_types.items()
}

# Fixed seed and a dedicated generator so the synthetic arrival trace (and hence
# the optimizer's decisions) are identical on every Restart & Run All, without
# touching NumPy's global random state.
ARRIVAL_SEED = 0
arrival_rng = np.random.RandomState(ARRIVAL_SEED)

# 24 ms mean inter-arrival time; the cumulative sum turns the 40000 sampled
# gaps into monotonically increasing arrival timestamps.
arrival_cached = np.cumsum(arrival_rng.exponential(24, size=40000))

# NOTE(review): 0.5 and 100 are presumably the latency and cost constraints —
# confirm against GreedyOptimizer.select_optimal_config. This call was
# interrupted manually in the saved run, so expect it to be long-running.
opt.select_optimal_config(cloud, 0.5, 100, initial_config, arrival_cached)

('tf-resnet-feats', 10.697808138501022, {'inception': NodeConfig(inception, 1, v100, 1, 1, aws), 'tf-kernel-svm': NodeConfig(tf-kernel-svm, 1, none, 1, 1, aws), 'tf-resnet-feats': NodeConfig(tf-resnet-feats, 1, v100, 1, 1, aws), 'tf-log-reg': NodeConfig(tf-log-reg, 1, none, 1, 1, aws)})
('tf-resnet-feats', 21.395616277002045, {'inception': NodeConfig(inception, 1, v100, 1, 1, aws), 'tf-kernel-svm': NodeConfig(tf-kernel-svm, 1, none, 1, 1, aws), 'tf-resnet-feats': NodeConfig(tf-resnet-feats, 1, v100, 1, 2, aws), 'tf-log-reg': NodeConfig(tf-log-reg, 1, none, 1, 1, aws)})
Service throughput lower than arrival rate!
('tf-resnet-feats', 21.201722055179918, {'inception': NodeConfig(inception, 1, v100, 1, 1, aws), 'tf-kernel-svm': NodeConfig(tf-kernel-svm, 1, none, 1, 1, aws), 'tf-resnet-feats': NodeConfig(tf-resnet-feats, 1, v100, 2.0, 1, aws), 'tf-log-reg': NodeConfig(tf-log-reg, 1, none, 1, 1, aws)})
Service throughput lower than arrival rate!
Upgrading bottleneck node tf-resnet-feats to N

KeyboardInterrupt: 

In [None]:
import itertools
def brute_force_optimizer(dag, scale_factors, node_profs, cost_constraint, latency_constraint):
    """
    Exhaustively search pipeline configurations for the best throughput.

    Each node in ``dag.nodes()`` contributes the configurations returned by
    its profile's ``enumerate_configs(max_replication_factor=3)``; the
    Cartesian product of those lists is scanned in order. Combinations
    rejected by ``profiler.is_valid_pipeline_config`` are skipped. Among the
    rest, only those whose estimated "latency" and "cost" are within the
    given constraints are eligible, and the first one with the strictly
    highest estimated "throughput" wins.

    Progress is printed every 500 combinations, and every time the current
    best configuration is set or replaced.

    Returns a ``(best_config, best_config_perf)`` tuple, where ``best_config``
    maps node name to node config and ``best_config_perf`` is the matching
    performance estimate. Both are ``None`` if nothing satisfies the
    constraints.

    NOTE(review): ``profiler`` is not imported in the visible notebook cells
    (only ``profiler_new`` is) — confirm it is in scope before running.
    """
    per_node_options = [
        node_profs[node].enumerate_configs(max_replication_factor=3)
        for node in dag.nodes()
    ]
    winner, winner_perf = None, None

    for count, combo in enumerate(itertools.product(*per_node_options), 1):
        if count % 500 == 0:
            print("Processed {}".format(count))

        candidate = {cfg.name: cfg for cfg in combo}
        if not profiler.is_valid_pipeline_config(candidate):
            continue

        perf = profiler.estimate_pipeline_performance_for_config(
            dag, scale_factors, candidate, node_profs)
        within_budget = (perf["latency"] <= latency_constraint
                         and perf["cost"] <= cost_constraint)
        if not within_budget:
            continue

        if winner is None:
            # First feasible configuration seen so far.
            winner, winner_perf = candidate, perf
            print("Initializing config to {} ({})".format(winner, winner_perf))
        elif perf["throughput"] > winner_perf["throughput"]:
            # Strict improvement only, so ties keep the earlier configuration.
            winner, winner_perf = candidate, perf
            print("Updating config to {} ({})".format(winner, winner_perf))

    return winner, winner_perf

In [None]:
# Time one brute-force search with cost_constraint=7.0 and latency_constraint=0.8.
start = datetime.now()
# NOTE(review): the returned (best_config, best_config_perf) tuple is discarded
# here; only the function's printed progress is visible.
brute_force_optimizer(dag, scale_factors, node_profs, 7.0, 0.8)
end = datetime.now()
# Elapsed wall-clock time in seconds.
print("{}".format((end-start).total_seconds()))

In [None]:
# Estimate pipeline performance for the node configurations extracted from the recorded run.
# NOTE(review): `profiler` is not imported in the visible cells (only
# `profiler_new` is) — confirm it is in scope before running.
profiler.estimate_pipeline_performance_for_config(dag, scale_factors, node_configs, node_profs)

In [None]:
# Group the alexnet profile by cloud, GPU type and CPUs per replica.
groups = profs["alexnet"].groupby(["cloud","gpu_type","num_cpus_per_replica"])
groups

In [None]:
# Dump every group: its key followed by the rows that belong to it.
for name, df in groups:
    print(name, df)