In [4]:
# Data manipulation
import pandas as pd
import numpy as np
# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.size'] = 18
%matplotlib inline

MAX_EVALS = 5
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer



In [5]:
import json
import shlex
import subprocess
import logging
from pprint import pprint
import re
import os


In [6]:
def runthebenchmark(hyperparameters):
    os.chdir('/home/meghaagr/project/progress/active/../')
    storeinfile(hyperparameters)
    out=subprocess.Popen(["python3","read_config_general.py","-n 8","-c 400 400 800 4 4 4 1"], shell=False, stdout=subprocess.PIPE)
    logging.basicConfig(level=logging.DEBUG)
    output=out.stdout.read().decode('utf-8')
    print(output)
    if len(output.split(" ")) > 5:
        value = output.split(" ")[5]
        value = float(value)
        printt(value)
        return float(1/value)
    return 0;
    #logging.basicConfig(filename='app.log', filemode='a', format='%(name)s - %(levelname)s - %(message)s',level=logging.DEBUG)


In [7]:
def storeinfile(hyperparameters):
    data={"mpi": {"romio_ds_read": "enable", "romio_ds_write": "disable"}, "lfs": {"setstripe": {"size": "16777216", "count": 4}}}
    data["lfs"]["setstripe"]["size"] = int(hyperparameters["setstripe-size"])
    data["lfs"]["setstripe"]["count"] = int(hyperparameters["setstripe-count"])

    data["mpi"]["romio_ds_read"] = hyperparameters["romio_ds_read"]
    data["mpi"]["romio_ds_write"] = hyperparameters["romio_ds_write"]
    data["mpi"]["romio_cb_read"] = hyperparameters["romio_cb_read"]
    data["mpi"]["romio_cb_write"] = hyperparameters["romio_cb_write"]
    data["mpi"]["cb_buffer_size"] = str(int(hyperparameters["cb_buffer_size"]))
    with open("confex.json","w") as fp:
        json.dump(data,fp)
    print(data)


In [8]:
def objective(hyperparameters):
    global ITERATION
    ITERATION += 1
    start = timer()
    result=0
    while(result == 0):
        result = runthebenchmark(hyperparameters)
    run_time = timer() - start
    print({'loss': result, 'hyperparameters': hyperparameters, 'iteration': ITERATION, 'iteration_time': run_time, 'status': STATUS_OK})
    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([float(result), hyperparameters, ITERATION, run_time])
    return {'loss': float(result), 'hyperparameters': hyperparameters, 'iteration': ITERATION, 'iteration_time': run_time, 'status': STATUS_OK}


In [9]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

space = {
    'romio_ds_read' : hp.choice('romio_ds_read',['enable','disable']),
    'romio_ds_write' : hp.choice('romio_ds_write',['enable','disable']),
    'romio_cb_read' : hp.choice('romio_cb_read',['enable','disable']),
    'romio_cb_write' : hp.choice('romio_cb_write',['enable','disable']),
    'cb_buffer_size' : 1048576*hp.quniform('cb_buffer_size',1,512,1),
    'setstripe-size' : 65536*(hp.qloguniform('setstripe-size',0,3,1)),
    'setstripe-count' : hp.quniform('setstripe-count',0,20,1)
    
}

In [10]:
x = sample(space)
params = x
print(x)
cb_buffer_size_dist = []
for _ in range(30):
    cb_buffer_size_dist.append(sample(space)['cb_buffer_size'])

{'cb_buffer_size': 445644800.0, 'romio_cb_read': 'disable', 'romio_cb_write': 'disable', 'romio_ds_read': 'disable', 'romio_ds_write': 'disable', 'setstripe-count': 14.0, 'setstripe-size': 196608.0}


In [11]:
from hyperopt import tpe

# Create the algorithm
tpe_algorithm = tpe.suggest


from hyperopt import Trials

# Record results
bayes_trials = Trials()

In [12]:
# File to save first results
print(os.getcwd())
out_file = '/home/meghaagr/project/progress/active/result/gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
writer.writerow(['loss', 'params', 'iteration', 'train_time'])
of_connection.close()


/home/meghaagr/project/progress/active


In [13]:
from hyperopt import fmin


ITERATION = 0

best = fmin(fn = objective, space = space, algo = tpe.suggest, trials = bayes_trials, max_evals = MAX_EVALS)

{'mpi': {'romio_ds_read': 'disable', 'romio_ds_write': 'enable', 'romio_cb_read': 'disable', 'romio_cb_write': 'disable', 'cb_buffer_size': '447741952'}, 'lfs': {'setstripe': {'size': 65536, 'count': 17}}}
S3D-IO -400-400-800-4-4-4-1 691.11 15.26 22.61 1178.27 45.78 39.78 0.07 0.60 65536 17 disable enable disable disable 447741952 

  0%|          | 0/5 [01:29<?, ?it/s, best loss: ?]

INFO:hyperopt.fmin:job exception: name 'printt' is not defined





NameError: name 'printt' is not defined

In [None]:
print(best)
d=  best
print(bayes_trials.results)
bayes_trials_results = sorted(bayes_trials.results, key = lambda x: x['loss'])
bayes_trials_results[:1]

In [None]:
results = pd.read_csv(out_file)

# Sort with best scores on top and reset index for slicing
results.sort_values('train_time', ascending = True, inplace = True)
results.reset_index(inplace = True, drop = True)
results.head()


In [None]:
import ast

# Convert from a string to a dictionary
ast.literal_eval(results.loc[0, 'params'])
best_bayes_params = ast.literal_eval(results.loc[0, 'params']).copy()
print(best_bayes_params)

In [None]:
bayes_params = pd.DataFrame(columns = list(ast.literal_eval(results.loc[0, 'params']).keys()),
                            index = list(range(len(results))))

# Add the results with each parameter a different column
for i, params in enumerate(results['params']):
    bayes_params.loc[i, :] = list(ast.literal_eval(params).values())
bayes_params['train_time'] = results['train_time']    
bayes_params['loss'] = results['loss']
bayes_params['iteration'] = results['iteration']

bayes_params.head()

In [None]:
plt.figure(figsize = (20, 8))
plt.rcParams['font.size'] = 18

# Density plots of the learning rate distributions 
sns.kdeplot(bayes_params['loss'], label = 'Loss Variation', linewidth = 2)
plt.legend()
plt.xlabel('Loss'); plt.ylabel('Density'); plt.title('Loss Distribution');


In [None]:
plt.figure(figsize = (20, 8))
plt.rcParams['font.size'] = 18
cb_buffer_size_dist_now = []
for _ in range(30):
    cb_buffer_size_dist_now.append(sample(space)['cb_buffer_size'])
# Density plots of the learning rate distributions 
sns.kdeplot(bayes_params['cb_buffer_size'], label = 'cb_buffer_size', linewidth = 2)
sns.kdeplot(cb_buffer_size_dist,color="red",linewidth=2)
sns.kdeplot(cb_buffer_size_dist_now,color="green",linewidth=2)
plt.legend()
plt.xlabel('cb_buffer_size'); plt.ylabel('Density'); plt.title('cb_buffer_size Distribution');


In [None]:
import random
import numpy as np

def random_objective(hyperparameters):
    global ITERATION
    ITERATION += 1
    start = timer()
    result = runthebenchmark(hyperparameters)
    run_time = timer() - start
    print({'loss': result, 'hyperparameters': hyperparameters, 'iteration': ITERATION, 'iteration_time': run_time, 'status': STATUS_OK})
    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([float(result), hyperparameters, ITERATION, run_time])
    return [ float(result), hyperparameters, ITERATION, run_time]


In [None]:

random_space = {
    'romio_ds_read' : ['enable','disable'],
    'romio_ds_write' : ['enable','disable'],
    'romio_cb_read' : ['enable','disable'],
    'romio_cb_write' : ['enable','disable'],
    'cb_buffer_size' : [1048576*x for x in list(range(0,1024,4))],
    'setstripe-size' : [65536*x for x in list(range(0,3,1))],
    'setstripe-count' : list(range(0,20,1))
    
}
random.seed(50)

random_results = pd.DataFrame(columns = ['loss', 'params', 'iteration', 'time'],
                       index = list(range(MAX_EVALS)))

# Iterate through the specified number of evaluations
for i in range(MAX_EVALS):
    
    # Randomly sample parameters for gbm
    params = {key: random.sample(value, 1)[0] for key, value in random_space.items()}
    print(params)
    results_list = random_objective(params)
    random_results.loc[i, :] = results_list


In [None]:
random_results.sort_values('loss', ascending = True, inplace = True)
random_results.reset_index(inplace = True, drop = True)
random_results.head()


In [None]:
best_random_params = random_results.loc[0, 'params'].copy()
random_params = pd.DataFrame(columns = list(random_results.loc[0, 'params'].keys()),
                            index = list(range(len(random_results))))

# Add the results with each parameter a different column
random_results = random_results[:20]
print(len(random_results))
for i, params in enumerate(random_results['params']):
    random_params.loc[i, :] = list(params.values())
random_params['loss'] = random_results['loss']
random_params['iteration'] = random_results['iteration']
random_params['time'] = random_results['time']

random_params.head()

In [None]:
plt.figure(figsize = (20, 8))
plt.rcParams['font.size'] = 18

# Density plots of the learning rate distributions 
sns.distplot(random_params['time'], label = 'Random Search')#, linewidth = 2)
#sns.distplot(bayes_params['train_time'], label = 'Bayes Optimization')#, linewidth = 2
plt.legend()
plt.xlabel('Loss'); plt.ylabel('Density'); plt.title('Learning Rate Distribution');