-
Notifications
You must be signed in to change notification settings - Fork 61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixing s3 support #364
Fixing s3 support #364
Changes from all commits
6257873
9ce8d5f
68e2794
1098644
bef3117
bf24c36
f515ca9
f6e2786
c1709ff
cfb925b
cfd6751
597defc
a8a5b37
c64a788
1e11214
5816be3
f7b2535
a8807fe
b5e06b1
73acf94
cd8c53d
7aa94a7
45f9f50
e03cfe9
f332833
65d0045
e467cd1
afd79f2
6b490a3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
triage-3.5.3 | ||
triage-3.6.2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,8 +4,7 @@ | |
import yaml | ||
from collections import OrderedDict | ||
|
||
import boto | ||
import pandas | ||
import pandas as pd | ||
from moto import mock_s3, mock_s3_deprecated | ||
|
||
from triage.component.catwalk.storage import ( | ||
|
@@ -24,11 +23,11 @@ def __init__(self, val): | |
|
||
|
||
def test_S3Store(): | ||
import boto3 | ||
with mock_s3(): | ||
s3_conn = boto3.resource('s3') | ||
s3_conn.create_bucket(Bucket='a-bucket') | ||
store = S3Store(s3_conn.Object('a-bucket', 'a-path')) | ||
import boto3 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not a big deal, just curious if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. I believe I introduced this pattern a while ago to attempt to fix something with mocking but am unsure if it is needed. |
||
client = boto3.client('s3') | ||
client.create_bucket(Bucket='test_bucket', ACL='public-read-write') | ||
store = S3Store(path=f"s3://test_bucket/a_path") | ||
assert not store.exists() | ||
store.write(SomeClass('val')) | ||
assert store.exists() | ||
|
@@ -71,7 +70,7 @@ def matrix_store(self): | |
('m_feature', [0.4, 0.5]), | ||
('label', [0, 1]) | ||
]) | ||
df = pandas.DataFrame.from_dict(data_dict) | ||
df = pd.DataFrame.from_dict(data_dict) | ||
metadata = { | ||
'label_name': 'label', | ||
'indices': ['entity_id'], | ||
|
@@ -85,7 +84,7 @@ def matrix_store(self): | |
tmphdf = os.path.join(tmpdir, 'df.h5') | ||
with open(tmpyaml, 'w') as outfile: | ||
yaml.dump(metadata, outfile, default_flow_style=False) | ||
df.to_csv(tmpcsv, index=False) | ||
df.to_csv(tmpcsv) | ||
df.to_hdf(tmphdf, 'matrix') | ||
csv = CSVMatrixStore(matrix_path=tmpcsv, metadata_path=tmpyaml) | ||
hdf = HDFMatrixStore(matrix_path=tmphdf, metadata_path=tmpyaml) | ||
|
@@ -105,7 +104,8 @@ def matrix_store(self): | |
assert csv.labels().to_dict() == inmemory.labels().to_dict() | ||
assert hdf.labels().to_dict() == inmemory.labels().to_dict() | ||
|
||
matrix_store = [inmemory, hdf, csv] | ||
matrix_store = [inmemory, csv, hdf] | ||
|
||
return matrix_store | ||
|
||
def test_MatrixStore_resort_columns(self): | ||
|
@@ -176,7 +176,7 @@ def test_as_of_dates_entity_index(self): | |
'feature_two': [0.5, 0.6], | ||
} | ||
matrix = InMemoryMatrixStore( | ||
matrix=pandas.DataFrame.from_dict(data), | ||
matrix=pd.DataFrame.from_dict(data), | ||
metadata={'end_time': '2016-01-01', 'indices': ['entity_id']} | ||
) | ||
|
||
|
@@ -190,29 +190,25 @@ def test_as_of_dates_entity_date_index(self): | |
'as_of_date': ['2016-01-01', '2016-01-01', '2017-01-01', '2017-01-01'] | ||
} | ||
matrix = InMemoryMatrixStore( | ||
matrix=pandas.DataFrame.from_dict(data), | ||
matrix=pd.DataFrame.from_dict(data), | ||
metadata={'indices': ['entity_id', 'as_of_date']} | ||
) | ||
|
||
self.assertEqual(matrix.as_of_dates, ['2016-01-01', '2017-01-01']) | ||
|
||
def test_s3_save(self): | ||
with mock_s3_deprecated(): | ||
s3_conn = boto.connect_s3() | ||
bucket_name = 'fake-matrix-bucket' | ||
s3_conn.create_bucket(bucket_name) | ||
with mock_s3(): | ||
import boto3 | ||
client = boto3.client('s3') | ||
client.create_bucket(Bucket='fake-matrix-bucket', ACL='public-read-write') | ||
|
||
matrix_store_list = self.matrix_store() | ||
|
||
for matrix_store in matrix_store_list: | ||
matrix_store.save(project_path='s3://fake-matrix-bucket', name='test') | ||
|
||
# HDF | ||
hdf = HDFMatrixStore(matrix_path='s3://fake-matrix-bucket/test.h5', metadata_path='s3://fake-matrix-bucket/test.yaml') | ||
# CSV | ||
csv = CSVMatrixStore(matrix_path='s3://fake-matrix-bucket/test.csv', metadata_path='s3://fake-matrix-bucket/test.yaml') | ||
if isinstance(matrix_store, CSVMatrixStore): | ||
matrix_store.save(project_path='s3://fake-matrix-bucket', name='test') | ||
# CSV | ||
csv = CSVMatrixStore(matrix_path='s3://fake-matrix-bucket/test.csv', metadata_path='s3://fake-matrix-bucket/test.yaml') | ||
|
||
assert csv.metadata == matrix_store_list[0].metadata | ||
assert csv.matrix.to_dict() == matrix_store_list[0].matrix.to_dict() | ||
assert hdf.metadata == matrix_store_list[0].metadata | ||
assert hdf.matrix.to_dict() == matrix_store_list[0].matrix.to_dict() | ||
assert csv.metadata == matrix_store_list[0].metadata | ||
assert csv.matrix.to_dict() == matrix_store_list[0].matrix.to_dict() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So have we removed testing of non-csv stores? Or is it that you've added assertions which can only apply to csv? (And we can't apply these to, say, HDF?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not testing HDF We should move to Arrow or Parquet or at least compressed CSV There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We still have HDF testing. This is for S3 storage specifically. The code now disallows storage of HDF on S3 by throwing an error upfront. So I think this testing is fine for the current state of the code. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,9 @@ | |
import pandas | ||
import os | ||
|
||
import s3fs | ||
from urllib.parse import urlparse | ||
|
||
from triage.component import metta | ||
|
||
|
||
|
@@ -25,8 +28,11 @@ def validate(self): | |
|
||
def build_all_matrices(self, build_tasks): | ||
logging.info('Building %s matrices', len(build_tasks.keys())) | ||
for matrix_uuid, task_arguments in build_tasks.items(): | ||
|
||
for i, (matrix_uuid, task_arguments) in enumerate(build_tasks.items()): | ||
logging.info(f"Building matrix {matrix_uuid} ({i}/{len(build_tasks.keys())})") | ||
self.build_matrix(**task_arguments) | ||
logging.debug(f"Matrix {matrix_uuid} built") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really don't have a firm opinion on this, now that we have the wonderful
or:
(For that matter, All that said, particularly for higher-level log messages, it's hard to argue with the elegance and simplicity of |
||
|
||
def _outer_join_query( | ||
self, | ||
|
@@ -224,9 +230,23 @@ def build_matrix( | |
matrix_directory, | ||
'{}.csv'.format(matrix_uuid) | ||
) | ||
if not self.replace and os.path.exists(matrix_filename): | ||
logging.info('Skipping %s because matrix already exists', matrix_filename) | ||
return | ||
|
||
# The output directory is local or in s3 | ||
path_parsed = urlparse(matrix_filename) | ||
scheme = path_parsed.scheme # If '' of 'file' is a regular file or 's3' | ||
|
||
if scheme in ('', 'file'): | ||
if not self.replace and os.path.exists(matrix_filename): | ||
logging.info('Skipping %s because matrix already exists', matrix_filename) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All the additional logging is great. I don't have the hands-on experience with debugging an experiment, and so I can't say how "spammy" these might seem outside of the DEBUG context; rather, just curious if they might be. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We did decide a while ago that we would err the side of being too spammy, at least for now. |
||
return | ||
elif scheme == 's3': | ||
if not self.replace and s3fs.S3FileSystem().exists(matrix_filename): | ||
logging.info('Skipping %s because matrix already exists', matrix_filename) | ||
return | ||
else: | ||
raise ValueError(f"""URL scheme not supported: | ||
{scheme} (from {matrix_filename}) | ||
""") | ||
|
||
logging.info('Creating matrix %s > %s', matrix_metadata['matrix_id'], matrix_filename) | ||
# make the entity time table and query the labels and features tables | ||
|
@@ -248,8 +268,9 @@ def build_matrix( | |
entity_date_table_name, | ||
matrix_uuid | ||
) | ||
logging.info(f"Feature data extracted for matrix {matrix_uuid}") | ||
try: | ||
logging.info('Extracting label data frmo database into file for ' | ||
logging.info('Extracting label data from database into file for ' | ||
'matrix %s', matrix_uuid) | ||
labels_csv_name = self.write_labels_data( | ||
label_name, | ||
|
@@ -260,13 +281,15 @@ def build_matrix( | |
) | ||
features_csv_names.insert(0, labels_csv_name) | ||
|
||
logging.info(f"Label data extracted for matrix {matrix_uuid}") | ||
# stitch together the csvs | ||
logging.info('Merging feature files for matrix %s', matrix_uuid) | ||
output = self.merge_feature_csvs( | ||
features_csv_names, | ||
matrix_directory, | ||
matrix_uuid | ||
) | ||
logging.info(f"Features data merged for matrix {matrix_uuid}") | ||
finally: | ||
# clean up files and database before finishing | ||
for csv_name in features_csv_names: | ||
|
@@ -281,6 +304,7 @@ def build_matrix( | |
directory=self.matrix_directory, | ||
format='csv' | ||
) | ||
logging.info("Matrix {matrix_uuid} archived (using metta)") | ||
finally: | ||
if isinstance(output, str): | ||
os.remove(output) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still need
smart_open
or can we remove it?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can remove it