Benchmark repeat parameter and results pushed to s3 #406

Merged
10 commits merged on Aug 23, 2019
107 changes: 69 additions & 38 deletions benchmarking/benchmark.py
@@ -13,7 +13,8 @@
- TIMEOUT : this timeout defines the time to wait for the result of a run in seconds. Default is 1200 (20min).

"""

import boto3
import copy
import json
import logging
import time
@@ -49,6 +50,10 @@ def load_experiments(filepath):
    with open(filepath, 'rt') as f:
        experiments = json.load(f)

    for experiment in experiments:
        if 'repetition' not in experiment:
            experiment['repetition'] = 1

    jsonschema.validate(experiments, experiment_schema)

    return experiments
@@ -230,46 +235,71 @@ def run_experiments(config):

    results = {'experiments': []}
    for experiment in config['experiments']:
        try:
            threshold = experiment['threshold']
            logger.info('running experiment: {}'.format(experiment))
            size_a, size_b = get_exp_sizes(experiment)
            # create project
            credentials = rest_client.project_create(server, config['schema'], 'mapping', "benchy_{}".format(experiment))
            try:
                # upload clks
                upload_binary_clks(config, size_a, size_b, credentials)
                # create run
                run = rest_client.run_create(server, credentials['project_id'], credentials['result_token'],
                                             threshold,
                                             "{}_{}".format(experiment, threshold))
                # wait for result
                run_id = run['run_id']
                logger.info(f'waiting for run {run_id} to finish')
                status = rest_client.wait_for_run(server, credentials['project_id'], run['run_id'],
                                                  credentials['result_token'], timeout=config['timeout'])
                if status['state'] != 'completed':
                    raise RuntimeError('run did not finish!\n{}'.format(status))
                logger.info('experiment successful. Evaluating results now...')
                mapping = rest_client.run_get_result_text(server, credentials['project_id'], run['run_id'],
                                                          credentials['result_token'])
                mapping = json.loads(mapping)['mapping']
                mapping = {int(k): int(v) for k, v in mapping.items()}
                tt = score_mapping(mapping, *load_truth(config, size_a, size_b))
                result = compose_result(status, tt, experiment, (size_a, size_b), threshold)
                results['experiments'].append(result)
                logger.info('cleaning up...')
                delete_resources(config, credentials, run)
            except Exception as e:
                delete_resources(config, credentials, run)
                raise e
        except Exception as e:
            e_trace = format_exc()
            logger.warning("experiment '{}' failed: {}".format(experiment, e_trace))
            results['experiments'].append({'name': experiment, 'status': 'ERROR', 'description': e_trace})
        repetition = experiment['repetition']
        threshold = experiment['threshold']
        size_a, size_b = get_exp_sizes(experiment)

        for rep in range(repetition):
            current_experiment = copy.deepcopy(experiment)
            current_experiment['rep'] = rep + 1
            logger.info('running experiment: {}'.format(current_experiment))
            if repetition != 1:
                logger.info('\trepetition {} out of {}'.format(rep + 1, repetition))
            result = run_single_experiment(server, config, threshold, size_a, size_b, current_experiment)
            results['experiments'].append(result)

    return results


def run_single_experiment(server, config, threshold, size_a, size_b, experiment):
    result = {}
    credentials = {}
    run = {}
    try:
        credentials = rest_client.project_create(server, config['schema'], 'mapping',
                                                 "benchy_{}".format(experiment))
        # upload clks
        upload_binary_clks(config, size_a, size_b, credentials)
        # create run
        run = rest_client.run_create(server, credentials['project_id'], credentials['result_token'],
                                     threshold,
                                     "{}_{}".format(experiment, threshold))
        # wait for result
        run_id = run['run_id']
        result['run_id'] = run_id
        logger.info(f'waiting for run {run_id} to finish')
        status = rest_client.wait_for_run(server, credentials['project_id'], run['run_id'],
                                          credentials['result_token'], timeout=config['timeout'])
        if status['state'] != 'completed':
            raise RuntimeError('run did not finish!\n{}'.format(status))
        logger.info('experiment successful. Evaluating results now...')
        mapping = rest_client.run_get_result_text(server, credentials['project_id'], run['run_id'],
                                                  credentials['result_token'])
        mapping = json.loads(mapping)['mapping']
        mapping = {int(k): int(v) for k, v in mapping.items()}
        tt = score_mapping(mapping, *load_truth(config, size_a, size_b))
        result.update(compose_result(status, tt, experiment, (size_a, size_b), threshold))
    except Exception as e:
        e_trace = format_exc()
        logger.warning("experiment '{}' failed: {}".format(experiment, e_trace))
        result.update({'name': experiment, 'status': 'ERROR', 'description': e_trace})
    finally:
        logger.info('cleaning up...')
        delete_resources(config, credentials, run)

    return result


def push_result_s3(experiment_file):
    client = boto3.client(
        's3',
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
Collaborator:

My hunch is we should be agnostic, as multiple services offer the S3 API (including MINIO, which we already use in this project).

How about OBJECT_STORE_SERVER, OBJECT_STORE_ACCESS_KEY, OBJECT_STORE_BUCKET etc. as the environment variables? And open an issue to change them in the backend settings from MINIO_: https://github.com/data61/anonlink-entity-service/blob/develop/backend/entityservice/settings.py#L27

Contributor Author:

I'm actually not convinced. My reasoning is the following:

  • we do not support anything other than AWS in the benchmark
  • the benchmark is fully independent from the main repository, so variable names from the main repo should not impact choices made for the benchmark (even if consistency may be a good idea)
  • but if we are thinking about both repos together, I would actually make them even more specific, such as AWS_BENCHMARK_..., to be sure that we cannot imagine they will be used in any context other than the chosen one. I would also point out that if we implement a script doing everything at once (deploying and benchmarking), it's good to use different environment variable names so that a misconfiguration cannot lead to misuse of a token. E.g. if I create a script deploying the entity service that sets the env var OBJECT_STORE_ACCESS_KEY, and the script also starts the benchmark but I forgot to update the env var, I would push the results to the same bucket. Here it is not important as we are not deleting anything, but if we were, we could do really bad things...
  • would there be a scenario where a single service has multiple keys? E.g. one key for bucket x and one key for bucket y? The underlying question is whether we prefer to have keys per application, or keys per use case (the application can do x, y and z, or key a can do x, key b can do y and key c can do z, and I give the application the keys a, b and c). Both have pros and cons, but we may want to think about it.
  • if we generalise to OBJECT_STORE_SERVER, I think we will also need an extra field OBJECT_STORE_TYPE, in which case we may also need to modify the description of the env variables: one object store type may not use a bucket and access key, but something totally different (I have no clue what it could be, but maybe :) ). This field would also help to know which service to use.

But the more options we add, the more I would argue we should push them into a command-line tool instead of env vars.

While I'm not convinced, I'm also not strongly opinionated on this, so happy to change if that's the preference :)

Collaborator:

I'm not too fussed about the variable names, but it would be nice to include a way to set the object store server so the benchmark user can decide for themselves.

See the docs at https://docs.min.io/docs/minio-select-api-quickstart-guide.html. It looks like all you would need is to add an endpoint_url which is only set if an environment variable is present, e.g. S3_SERVER or (my preference) OBJECT_STORE_SERVER.

s3 = boto3.client('s3',
                  endpoint_url='http://localhost:9000',
                  aws_access_key_id='minio',
                  aws_secret_access_key='minio123',
                  )

I don't think we need to tell the benchmark component the TYPE of object store - that would defeat the purpose of the abstraction, in this case the S3 API.

Contributor Author:

OBJECT_STORE_SERVER added, and the other ones renamed to OBJECT_STORE_ACCESS_KEY, OBJECT_STORE_SECRET_KEY and OBJECT_STORE_BUCKET.

        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY')
    )
    s3_bucket = "anonlink-benchmark-result"
    client.upload_file(experiment_file, s3_bucket, "results.json")
Collaborator:

So if this benchmarking job is run multiple times in k8s with the same S3 bucket (extremely likely), the results get overwritten. I suggest we include a timestamp or UUID in the uploaded filename.

Contributor Author:

👍
I will use timestamps, to more easily access the results afterwards (the job may not be kept on k8s, so the UUID may become hard to access).
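For concreteness, a rough sketch of how push_result_s3 could look once the OBJECT_STORE_* variables, the optional endpoint and a timestamped key are combined. This is not the final code of the PR; the variable names follow the discussion above, the bucket default and the arrow-based timestamp format are assumptions:

    import os

    import arrow
    import boto3


    def push_result_s3(experiment_file):
        # Optional custom endpoint (e.g. a MinIO deployment); boto3 falls back to AWS S3 when it is None.
        endpoint_url = os.getenv('OBJECT_STORE_SERVER')
        client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=os.getenv('OBJECT_STORE_ACCESS_KEY'),
            aws_secret_access_key=os.getenv('OBJECT_STORE_SECRET_KEY')
        )
        bucket = os.getenv('OBJECT_STORE_BUCKET', 'anonlink-benchmark-result')
        # Timestamped key so repeated benchmark jobs do not overwrite each other's results.
        key = 'results-{}.json'.format(arrow.utcnow().format('YYYY-MM-DDTHH-mm-ss'))
        client.upload_file(experiment_file, bucket, key)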



def main():
    config = read_config()
    server_status = rest_client.server_get_status(config['server'])
@@ -293,6 +323,7 @@ def main():
    pprint(results)
    with open(config['results_path'], 'wt') as f:
        json.dump(results, f)
    push_result_s3(config['results_path'])
Collaborator:

Should uploading to S3 be optional? Someone might want to run this benchmark locally and just see the output as before.

Contributor Author:

👍
I'll just check whether the environment variables have been set, rather than adding an extra one.
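A sketch of what that check could look like; the helper name maybe_push_result_s3 and the exact variables tested are illustrative, not necessarily what was merged, and logger refers to the module-level logger already used in benchmark.py:

    def maybe_push_result_s3(results_path):
        # Skip the upload entirely when the object store credentials are not configured,
        # so a purely local benchmark run still just writes results.json as before.
        if os.getenv('OBJECT_STORE_ACCESS_KEY') and os.getenv('OBJECT_STORE_SECRET_KEY'):
            push_result_s3(results_path)
        else:
            logger.info('object store credentials not set, skipping result upload')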



if __name__ == '__main__':
5 changes: 3 additions & 2 deletions benchmarking/requirements.txt
@@ -1,6 +1,7 @@
arrow
boto3
clkhash==0.14.0
jsonschema
numpy
pandas
arrow
requests
jsonschema
13 changes: 12 additions & 1 deletion benchmarking/schema/experiments.json
@@ -34,7 +34,18 @@
"examples": [
0.80, 0.9, 0.95
]
},
"repetition": {
"$id": "#/items/properties/repetition",
"type": "number",
"title": "Number of times this experiment is repeated",
"default": 1,
"minimum": 1,
"multipleOf": 1,
"examples": [
1, 2, 10, 100
]
}
}
}
}
}
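To see how the new field plays together with the defaulting in load_experiments, a small self-contained sketch; the entries are made up and the other required fields of the real schema are omitted:

    import json

    # Hypothetical experiment entries: the first is repeated ten times, the second has no
    # 'repetition' key and therefore falls back to the default of 1.
    experiments = json.loads('''
    [
        {"threshold": 0.95, "repetition": 10},
        {"threshold": 0.80}
    ]
    ''')

    # load_experiments() applies the same default before validating against the schema.
    for experiment in experiments:
        if 'repetition' not in experiment:
            experiment['repetition'] = 1

    assert [e['repetition'] for e in experiments] == [10, 1]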
18 changes: 14 additions & 4 deletions deployment/jobs/benchmark/timing-test-job.yaml
@@ -21,20 +21,30 @@ spec:
            mountPath: /cache
      containers:
        - name: entitytester
          image: quay.io/n1analytics/entity-benchmark:v0.2.0
          image: data61/anonlink-benchmark:v0.3.0-dev
          env:
            - name: SERVER
              value: "https://testing.es.data61.xyz"
              value: "https://benchmark.es.data61.xyz"
            - name: TIMEOUT
              value: "1200"
              value: "2400"
            - name: EXPERIMENT
              value: "/config/experiments.json"
            - name: DATA_PATH
              value: "/cache"
            - name: SCHEMA
              value: "/cache/schema.json"
            - name: RESULTS_PATH
              value: "/app/results.json"
              value: "/tmp/results.json"
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  name: anonlink-benchmark-aws-credentials
                  key: AWS_ACCESS_KEY_ID
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  name: anonlink-benchmark-aws-credentials
                  key: AWS_SECRET_ACCESS_KEY
          volumeMounts:
            - name: experiments-volume
              mountPath: /config