In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell executes, without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
import os
import sys
import json
import time
from datetime import datetime
import single_node_profiles_cpp as snp
import profiler
import end_to_end_profiles as e2e_profs
import numpy as np
from optimizer import BruteForceOptimizer, GreedyOptimizer
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline


In [19]:
# Load the single-node profiles. Later cells index the result by model name
# (profs[name]) to build one NodeProfile per pipeline node.
profs = snp.load_single_node_profiles()

In [None]:
# Show which names have a loaded single-node profile.
profs.keys()

In [24]:
# Logical pipeline that the optimizers below operate on.
dag = profiler.get_logical_pipeline("pipeline_one")

# A single recorded end-to-end run provides both the per-node scale factors
# and the set of node configurations used in that experiment.
sample_run_path = os.path.abspath("../results_python_benchmarker/e2e_profs/systemx/image_driver_1/500ms/incep_1-logreg_1-ksvm_1-resnet_1-171221_091209.json")
with open(sample_run_path) as f:
    sample_run = json.load(f)
scale_factors = profiler.get_node_scale_factors(sample_run, dag.reference_node)
node_configs = profiler.get_node_configs_from_experiment(sample_run)

# Build one NodeProfile per node in the experiment. The two lightweight
# models are profiled with "latency_stage"; every other node uses "thru_stage".
node_profs = {}
for name, _ in node_configs.items():
    stage = "latency_stage" if name in ["tf-log-reg", "tf-kernel-svm"] else "thru_stage"
    node_profs[name] = profiler.NodeProfile(name, profs[name], stage)


# node_profs = {name : profiler.NodeProfile(name, profs[name]) for name, _ in node_configs.items()}

In [13]:
# Show which pipeline nodes received a NodeProfile.
node_profs.keys()

dict_keys(['inception', 'tf-log-reg', 'tf-kernel-svm', 'tf-resnet-feats'])

In [26]:
opt = GreedyOptimizer(dag, scale_factors, node_profs)
cloud = "aws"

# Starting point for the greedy search: every node begins with one CPU,
# batch size 1 and a single replica. Only the GPU type differs per node.
initial_node_gpus = [
    ("inception", "k80"),
    ("tf-resnet-feats", "k80"),
    ("tf-log-reg", "none"),
    ("tf-kernel-svm", "none"),
]
initial_config = {
    node_name: profiler.NodeConfig(name=node_name,
                                   num_cpus=1,
                                   gpu_type=gpu,
                                   batch_size=1,
                                   num_replicas=1,
                                   cloud=cloud)
    for node_name, gpu in initial_node_gpus
}

# NOTE(review): 0.5 and 2 are presumably the latency and cost limits (the
# recorded result has latency < 0.5 and cost < 2) -- confirm against
# GreedyOptimizer.select_optimal_config.
opt.select_optimal_config(cloud, 0.5, 2, initial_config)

Upgrading bottleneck node tf-resnet-feats to NodeConfig(tf-resnet-feats, 1, k80, 2.0, 1, aws)
Upgrading bottleneck node inception to NodeConfig(inception, 1, k80, 2.0, 1, aws)
Upgrading bottleneck node tf-resnet-feats to NodeConfig(tf-resnet-feats, 1, k80, 3.0, 1, aws)
Upgrading bottleneck node tf-resnet-feats to NodeConfig(tf-resnet-feats, 1, k80, 4.0, 1, aws)
Upgrading bottleneck node tf-resnet-feats to NodeConfig(tf-resnet-feats, 1, k80, 8.0, 1, aws)
Upgrading bottleneck node inception to NodeConfig(inception, 1, k80, 3.0, 1, aws)
Upgrading bottleneck node tf-resnet-feats to NodeConfig(tf-resnet-feats, 1, k80, 12.0, 1, aws)
Upgrading bottleneck node inception to NodeConfig(inception, 1, k80, 4.0, 1, aws)
Upgrading bottleneck node tf-resnet-feats to NodeConfig(tf-resnet-feats, 1, k80, 16.0, 1, aws)
Upgrading bottleneck node inception to NodeConfig(inception, 1, k80, 8.0, 1, aws)


({'inception': NodeConfig(inception, 1, k80, 8.0, 1, aws),
  'tf-kernel-svm': NodeConfig(tf-kernel-svm, 1, none, 1, 1, aws),
  'tf-log-reg': NodeConfig(tf-log-reg, 1, none, 1, 1, aws),
  'tf-resnet-feats': NodeConfig(tf-resnet-feats, 1, k80, 16.0, 1, aws)},
 {'cost': 1.534, 'latency': 0.39866122, 'throughput': 44.72536696343619})

In [None]:
# NOTE(review): "res152" is not one of the node_profs keys listed earlier in
# this notebook (inception, tf-log-reg, tf-kernel-svm, tf-resnet-feats), so
# this lookup appears to belong to a different pipeline -- confirm before running.
node_profs["res152"].plot_profile()

In [None]:
# Inspect only the first profile: the unconditional break ends the loop after
# a single iteration. The plot_profile() result is kept in `r`, which a later
# cell iterates over.
for n, p in node_profs.items():
    p.check_monotonicity()
    r = p.plot_profile()
    break

In [None]:
# Scratch check of enumerate() over a list of booleans.
# Note: the loop rebinds the notebook-level name `p` used by other cells.
b = [True, True, False]
for i, p in enumerate(b):
    print(i,p)

In [None]:
# NOTE(review): "alexnet" is not among the node_profs keys listed earlier in
# this notebook -- confirm which pipeline this cell was written for.
p = node_profs["alexnet"]
# Read the mean_batch_size value from the row at position 7 of the profile table.
p.profile.iloc[7]["mean_batch_size"]

In [None]:
# Print every bundle in `r` as a dash-separated string; the second element of
# each pair is ignored.
for bundle, _ in r:
    label = "-".join(map(str, bundle))
    print(label)

In [None]:
opt = GreedyOptimizer(dag, scale_factors, node_profs)
cloud = "gcp"

# Every node starts from the same minimal configuration (1 CPU, k80 GPU,
# batch size 1, one replica). Entries are keyed by the node's own name so the
# dictionary key always agrees with NodeConfig.name; the alexnet entry was
# previously stored under the mismatched key "tf".
initial_config = {
    node_name: profiler.NodeConfig(name=node_name,
                                   num_cpus=1,
                                   gpu_type="k80",
                                   batch_size=1,
                                   num_replicas=1,
                                   cloud=cloud)
    for node_name in ("alexnet", "res50", "res152")
}

# NOTE(review): 0.7 and 50 are presumably the latency and cost limits --
# confirm against GreedyOptimizer.select_optimal_config.
opt.select_optimal_config(cloud, 0.7, 50, initial_config)

In [None]:
import itertools
def brute_force_optimizer(dag, scale_factors, node_profs, cost_constraint, latency_constraint,
                          max_replication_factor=3):
    """
    Exhaustively search for the highest-throughput pipeline configuration
    that satisfies both the cost and the latency constraint.

    Every combination of per-node configurations (the Cartesian product of
    each node profile's ``enumerate_configs`` result) is evaluated, so the
    running time grows multiplicatively with the number of nodes and of
    configurations per node.

    Parameters
    ----------
    dag : object exposing ``nodes()``
        Logical pipeline; each node is used as a key into ``node_profs``.
    scale_factors :
        Passed through unchanged to
        ``profiler.estimate_pipeline_performance_for_config``.
    node_profs : dict
        Maps each node to a profile object providing
        ``enumerate_configs(max_replication_factor=...)``.
    cost_constraint : number
        Upper bound (inclusive) on the estimated ``"cost"``.
    latency_constraint : number
        Upper bound (inclusive) on the estimated ``"latency"``.
    max_replication_factor : int, optional
        Forwarded to ``enumerate_configs``; defaults to 3, the value that was
        previously hard-coded.

    Returns
    -------
    (best_config, best_config_perf)
        ``best_config`` maps node name to the chosen node configuration and
        ``best_config_perf`` is the performance estimate for it. Both are
        ``None`` when no valid configuration meets the constraints. On a
        throughput tie the configuration found first is kept.
    """
    all_node_configs = [
        node_profs[node].enumerate_configs(max_replication_factor=max_replication_factor)
        for node in dag.nodes()
    ]
    best_config = None
    best_config_perf = None
    for cur_index, p_config in enumerate(itertools.product(*all_node_configs), start=1):
        # Lightweight progress report; the product can be very large.
        if cur_index % 500 == 0:
            print("Processed {}".format(cur_index))
        cur_node_configs = {n.name: n for n in p_config}
        if not profiler.is_valid_pipeline_config(cur_node_configs):
            continue
        cur_config_perf = profiler.estimate_pipeline_performance_for_config(
            dag, scale_factors, cur_node_configs, node_profs)
        meets_constraints = (cur_config_perf["latency"] <= latency_constraint
                             and cur_config_perf["cost"] <= cost_constraint)
        if not meets_constraints:
            continue
        if best_config is None:
            best_config = cur_node_configs
            best_config_perf = cur_config_perf
            print("Initializing config to {} ({})".format(best_config, best_config_perf))
        elif cur_config_perf["throughput"] > best_config_perf["throughput"]:
            best_config = cur_node_configs
            best_config_perf = cur_config_perf
            print("Updating config to {} ({})".format(best_config, best_config_perf))

    return best_config, best_config_perf

In [None]:
# Time the exhaustive search (cost limit 7.0, latency limit 0.8).
# time.perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way a datetime.now() difference can.
start = time.perf_counter()
best_config, best_config_perf = brute_force_optimizer(
    dag, scale_factors, node_profs, cost_constraint=7.0, latency_constraint=0.8)
end = time.perf_counter()
print("{}".format(end - start))
# Keep the search result and show it as the cell output instead of discarding it.
best_config, best_config_perf

In [None]:
# Estimate pipeline performance for the node configurations that were
# extracted from the recorded sample run (node_configs).
profiler.estimate_pipeline_performance_for_config(dag, scale_factors, node_configs, node_profs)

In [None]:
# Group the "alexnet" profile by cloud, GPU type and CPUs per replica.
# NOTE(review): presumably profs["alexnet"] is a pandas DataFrame with these
# three columns -- confirm against load_single_node_profiles().
groups = profs["alexnet"].groupby(["cloud","gpu_type","num_cpus_per_replica"])
groups

In [None]:
# Dump each group key together with its full contents.
# Note: this rebinds the notebook-level name `name` and prints every group in
# full, which can produce very long output.
for name, df in groups:
    print(name, df)