From 89aeffba8e33e4cfb673690a1ef63c94e058ebc2 Mon Sep 17 00:00:00 2001 From: Alexander Urban Date: Wed, 3 Apr 2019 11:42:29 -0500 Subject: [PATCH 1/6] Add protection against local frame files not being found, and update the unit tests for get_data --- gwdetchar/io/datafind.py | 22 +++++++++++----------- gwdetchar/io/tests/test_datafind.py | 17 +++++++++-------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/gwdetchar/io/datafind.py b/gwdetchar/io/datafind.py index 1d8ebbd89..e9cb5a852 100644 --- a/gwdetchar/io/datafind.py +++ b/gwdetchar/io/datafind.py @@ -20,6 +20,7 @@ """ import warnings +from six.moves.urllib.error import HTTPError import gwdatafind @@ -153,7 +154,7 @@ def get_data(channel, start, end, obs=None, frametype=None, source=None, **kwargs : `dict`, optional additional keyword arguments to `~gwpy.timeseries.TimeSeries.read` - or `~gwpy.timeseries.TimeSeries.fetch` + or `~gwpy.timeseries.TimeSeries.get` Returns ------- @@ -168,7 +169,7 @@ def get_data(channel, start, end, obs=None, frametype=None, source=None, See Also -------- - gwpy.timeseries.TimeSeries.fetch + gwpy.timeseries.TimeSeries.get for the underlying method to read from an NDS server gwpy.timeseries.TimeSeries.read for the underlying method to read from a local file cache @@ -178,23 +179,22 @@ def get_data(channel, start, end, obs=None, frametype=None, source=None, series_class = TimeSeriesDict else: series_class = TimeSeries - # construct file cache if none is given - if source is None: - obs = obs if obs is not None else frametype[0] - source = gwdatafind.find_urls(obs, frametype, start, end) - # read from frames or NDS - if source: + try: # read from frame files + source = source or gwdatafind.find_urls( + obs or frametype[0], frametype, start, end) if isinstance(channel, (list, tuple)): channel = remove_missing_channels(channel, source) return series_class.read( source, channel, start=start, end=end, nproc=nproc, verbose=verbose, **kwargs) - elif isinstance(channel, str): - return series_class.fetch( + except (HTTPError, TypeError): # frame files not found + pass + if not isinstance(channel, (list, tuple)): + return series_class.get( channel, start, end, verbose=verbose, **kwargs) # if all else fails, process channels in groups of 60 data = series_class() for group in [channel[i:i + 60] for i in range(0, len(channel), 60)]: - data.append(series_class.fetch( + data.append(series_class.get( group, start, end, verbose=verbose, **kwargs)) return data diff --git a/gwdetchar/io/tests/test_datafind.py b/gwdetchar/io/tests/test_datafind.py index 03a97f89d..e34c6a112 100644 --- a/gwdetchar/io/tests/test_datafind.py +++ b/gwdetchar/io/tests/test_datafind.py @@ -23,6 +23,7 @@ import numpy from numpy import testing as nptest +from six.moves.urllib.error import HTTPError from gwpy.testing.compat import mock from gwpy.timeseries import (TimeSeries, TimeSeriesDict) @@ -65,8 +66,8 @@ def test_remove_missing_channels(io_gwf): assert channels == ['X1:TEST-STRAIN'] -@mock.patch('gwpy.timeseries.TimeSeries.fetch', return_value=HOFT) -def test_get_data_from_NDS(tsfetch): +@mock.patch('gwpy.timeseries.TimeSeries.get', return_value=HOFT) +def test_get_data_from_NDS(tsget): # retrieve data start = 0 end = 64 @@ -78,9 +79,9 @@ def test_get_data_from_NDS(tsfetch): nptest.assert_array_equal(data.value, HOFT.value) -@mock.patch('gwpy.timeseries.TimeSeriesDict.fetch', +@mock.patch('gwpy.timeseries.TimeSeriesDict.get', return_value=TimeSeriesDict({'X1:TEST-STRAIN': HOFT})) -def test_get_data_dict_from_NDS(tsdfetch): +def test_get_data_dict_from_NDS(tsdget): # retrieve data start = 33 end = 64 @@ -94,7 +95,7 @@ def test_get_data_dict_from_NDS(tsdfetch): @mock.patch('gwpy.timeseries.TimeSeries.read', return_value=HOFT.crop(16, 48)) -def test_get_data_from_cache(tsfetch): +def test_get_data_from_cache(tsget): # retrieve test frame start = 16 end = start + 32 @@ -110,9 +111,9 @@ def test_get_data_from_cache(tsfetch): @mock.patch('gwdetchar.io.datafind.remove_missing_channels') @mock.patch('gwpy.timeseries.TimeSeriesDict.read') -def test_get_data_dict_from_cache(tsdfetch, remove): +def test_get_data_dict_from_cache(tsdget, remove): # set return values - tsdfetch.return_value = TimeSeriesDict({ + tsdget.return_value = TimeSeriesDict({ 'X1:TEST-STRAIN': HOFT.crop(16, 48)}) remove.return_value = ['X1:TEST-STRAIN'] # retrieve test frame @@ -131,5 +132,5 @@ def test_get_data_dict_from_cache(tsdfetch, remove): def test_fail_on_no_frametype(): channel = 'X1:TEST-STRAIN' - with pytest.raises(TypeError): + with pytest.raises(HTTPError): datafind.get_data(channel, start=0, end=32) From 04a8dce2fba0233b1e15b681af0c984e0b53aedc Mon Sep 17 00:00:00 2001 From: Alexander Urban Date: Wed, 3 Apr 2019 14:21:25 -0500 Subject: [PATCH 2/6] Remove unneeded unit test --- gwdetchar/io/tests/test_datafind.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/gwdetchar/io/tests/test_datafind.py b/gwdetchar/io/tests/test_datafind.py index e34c6a112..e00c1c445 100644 --- a/gwdetchar/io/tests/test_datafind.py +++ b/gwdetchar/io/tests/test_datafind.py @@ -128,9 +128,3 @@ def test_get_data_dict_from_cache(tsdget, remove): assert data[channels[0]].span == Segment(start, end) nptest.assert_array_equal(data[channels[0]].value, HOFT.crop(start, end).value) - - -def test_fail_on_no_frametype(): - channel = 'X1:TEST-STRAIN' - with pytest.raises(HTTPError): - datafind.get_data(channel, start=0, end=32) From 255b5ef965d91f8d27633b07bacc33ce07afceca Mon Sep 17 00:00:00 2001 From: Alexander Urban Date: Thu, 4 Apr 2019 21:30:20 -0500 Subject: [PATCH 3/6] Handle use cases more robustly --- bin/gwdetchar-conlog | 2 +- bin/gwdetchar-lasso-correlation | 6 ++-- bin/gwdetchar-scattering | 2 +- bin/gwdetchar-slow-correlation | 6 ++-- gwdetchar/io/datafind.py | 55 ++++++++++++++++++----------- gwdetchar/io/tests/test_datafind.py | 4 +-- 6 files changed, 45 insertions(+), 30 deletions(-) diff --git a/bin/gwdetchar-conlog b/bin/gwdetchar-conlog index 76ba20320..a39b9adb8 100644 --- a/bin/gwdetchar-conlog +++ b/bin/gwdetchar-conlog @@ -45,7 +45,7 @@ parser = cli.create_parser(description=__doc__) cli.add_gps_start_stop_arguments(parser) cli.add_ifo_option(parser) cli.add_frametype_option(parser, required=const.IFO is None, - default='%s_T'.format(const.IFO)) + default='{}_T'.format(const.IFO)) cli.add_nproc_option(parser) parser.add_argument('-o', '--output', default='changes.csv', help='Path to output data file, default: %(default)s') diff --git a/bin/gwdetchar-lasso-correlation b/bin/gwdetchar-lasso-correlation index f437696fd..177a9f4b7 100644 --- a/bin/gwdetchar-lasso-correlation +++ b/bin/gwdetchar-lasso-correlation @@ -149,7 +149,7 @@ if args.band_pass: logger.info("-- Loading primary channel data") bandts = get_data( primary, start-pad, end+pad, verbose='Reading primary:'.rjust(30), - obs=args.ifo[0], frametype=args.primary_frametype, nproc=args.nproc) + frametype=args.primary_frametype, nproc=args.nproc) if flower < 0 or fupper >= float((bandts.sample_rate/2.).value): raise ValueError("bandpass frequency is out of range for this " "channel, band (Hz): {0}, sample rate: {1}".format( @@ -180,7 +180,7 @@ else: # load primary channel data logger.info("-- Loading primary channel data") primaryts = get_data(primary, start, end, frametype=args.primary_frametype, - obs=args.ifo[0], verbose='Reading:'.rjust(30), + verbose='Reading:'.rjust(30), nproc=args.nproc).crop(start, end) if args.remove_outliers: @@ -216,7 +216,7 @@ else: frametype = '%s_T' % args.ifo # for second trends auxdata = get_data( - channels, start, end, verbose='Reading:'.rjust(30), obs=args.ifo[0], + channels, start, end, verbose='Reading:'.rjust(30), frametype=frametype, nproc=args.nproc, pad=0).crop(start, end) # -- removes flat data to be re-introdused later diff --git a/bin/gwdetchar-scattering b/bin/gwdetchar-scattering index f83096b0d..9c6de1da3 100644 --- a/bin/gwdetchar-scattering +++ b/bin/gwdetchar-scattering @@ -289,7 +289,7 @@ for i, seg in enumerate(statea): ) if args.verbose else False alldata.append( get_data(allchannels, seg[0], seg[1], frametype=args.frametype, - obs=args.ifo[0], verbose=msg, nproc=args.nproc).resample(128)) + verbose=msg, nproc=args.nproc).resample(128)) scatter_segments = DataQualityDict() actives = SegmentList() diff --git a/bin/gwdetchar-slow-correlation b/bin/gwdetchar-slow-correlation index ef6875a2a..b95ee8419 100644 --- a/bin/gwdetchar-slow-correlation +++ b/bin/gwdetchar-slow-correlation @@ -112,7 +112,7 @@ rcParams.update(tex_settings) # load data logger.info("-- Loading range data") rangets = get_data(rangechannel, start, end, frametype=args.range_frametype, - obs=args.ifo[0], verbose=True, nproc=args.nproc) + verbose=True, nproc=args.nproc) if args.trend_type == 'minute': dstart, dend = rangets.span @@ -121,7 +121,7 @@ else: dend = end logger.info("-- Loading h(t) data") -darmts = get_data(primary, dstart-pad, dend+pad, verbose=True, obs=args.ifo[0], +darmts = get_data(primary, dstart-pad, dend+pad, verbose=True, frametype=args.primary_frametype, nproc=args.nproc) # get darm BLRMS @@ -185,7 +185,7 @@ if args.trend_type == 'minute': else: frametype = '%s_T' % args.ifo # for second trends auxdata = get_data(map(str, channels), dstart, dend, verbose=True, pad=0, - obs=args.ifo[0], frametype=frametype, nproc=args.nproc) + frametype=frametype, nproc=args.nproc) gpsstub = '%d-%d' % (start, end-start) re_delim = re.compile('[:_-]') diff --git a/gwdetchar/io/datafind.py b/gwdetchar/io/datafind.py index e9cb5a852..655a6a6fe 100644 --- a/gwdetchar/io/datafind.py +++ b/gwdetchar/io/datafind.py @@ -19,6 +19,7 @@ """gw_data_find wrappers """ +import re import warnings from six.moves.urllib.error import HTTPError @@ -119,7 +120,7 @@ def remove_missing_channels(channels, gwfcache): return list(keep) -def get_data(channel, start, end, obs=None, frametype=None, source=None, +def get_data(channel, start, end, frametype=None, source=None, nproc=1, verbose=False, **kwargs): """Retrieve data for given channels within a certain time range @@ -134,17 +135,12 @@ def get_data(channel, start, end, obs=None, frametype=None, source=None, end : `float` GPS end time of requested data - obs : `str`, optional - single-letter name of observatory, defaults to the first letter of - `frametype` - frametype : `str`, optional - name of frametype in which channel(s) are stored, required if `source` - is `None` + name of frametype in which channel(s) are stored, default: `None` source : `str`, `list`, optional - `str` path(s) of a LAL-format cache file or individual data file, will - supercede `frametype` if given, defaults to `None` + path(s) of a LAL-format cache file or individual data file, + default: `None` nproc : `int`, optional number of parallel processes to use, uses serial process by default @@ -167,31 +163,50 @@ def get_data(channel, start, end, obs=None, frametype=None, source=None, If `channel` is a `str`, then a `TimeSeries` object will be returned, else the result is a `TimeSeriesDict`. + The `frametype` argument should be used to read from archived frame files, + while `source` should be used to read from a local cache or specific data + file. If either fails, or if neither is passed, this function will attempt + to get data over an NDS server. + + If `frametype` is used to read from the archive, any channels missing + from the first or last frame file in the requested time range will be + ignored. + See Also -------- + remove_missing_channels + a utility that removes channels missing from the frame archive gwpy.timeseries.TimeSeries.get - for the underlying method to read from an NDS server + the underlying method to read data over an NDS server gwpy.timeseries.TimeSeries.read - for the underlying method to read from a local file cache + the underlying method to read data from local files """ # get TimeSeries class if isinstance(channel, (list, tuple)): series_class = TimeSeriesDict else: series_class = TimeSeries - try: # read from frame files - source = source or gwdatafind.find_urls( - obs or frametype[0], frametype, start, end) - if isinstance(channel, (list, tuple)): - channel = remove_missing_channels(channel, source) - return series_class.read( - source, channel, start=start, end=end, nproc=nproc, - verbose=verbose, **kwargs) - except (HTTPError, TypeError): # frame files not found + + try: # locate frame files + if frametype is not None: + ifo = re.search('[A-Z]1', frametype).group(0) + obs = ifo[0] + source = gwdatafind.find_urls(obs, frametype, start, end) + except HTMLError: # frame files not found pass + else: # read from frame files + if isinstance(source, list) and isinstance(channel, (list, tuple)): + channel = remove_missing_channels(channel, source) + if source is not None: + return series_class.read( + source, channel, start=start, end=end, nproc=nproc, + verbose=verbose, **kwargs) + + # read single channel from NDS if not isinstance(channel, (list, tuple)): return series_class.get( channel, start, end, verbose=verbose, **kwargs) + # if all else fails, process channels in groups of 60 data = series_class() for group in [channel[i:i + 60] for i in range(0, len(channel), 60)]: diff --git a/gwdetchar/io/tests/test_datafind.py b/gwdetchar/io/tests/test_datafind.py index e00c1c445..b6cd3eda8 100644 --- a/gwdetchar/io/tests/test_datafind.py +++ b/gwdetchar/io/tests/test_datafind.py @@ -72,7 +72,7 @@ def test_get_data_from_NDS(tsget): start = 0 end = 64 channel = 'X1:TEST-STRAIN' - data = datafind.get_data(channel, start, end, source=0) + data = datafind.get_data(channel, start, end) # test data products assert isinstance(data, TimeSeries) @@ -86,7 +86,7 @@ def test_get_data_dict_from_NDS(tsdget): start = 33 end = 64 channels = ['X1:TEST-STRAIN'] - data = datafind.get_data(channels, start, end, source=0) + data = datafind.get_data(channels, start, end) # test data products assert isinstance(data, TimeSeriesDict) From d8fbd741df78748490557c7d04d511f54f44ebd0 Mon Sep 17 00:00:00 2001 From: Alexander Urban Date: Fri, 5 Apr 2019 08:45:38 -0500 Subject: [PATCH 4/6] Invert try-except block --- gwdetchar/io/datafind.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/gwdetchar/io/datafind.py b/gwdetchar/io/datafind.py index 655a6a6fe..0729cd6aa 100644 --- a/gwdetchar/io/datafind.py +++ b/gwdetchar/io/datafind.py @@ -187,20 +187,19 @@ def get_data(channel, start, end, frametype=None, source=None, else: series_class = TimeSeries - try: # locate frame files - if frametype is not None: + if frametype is not None: + try: # locate frame files ifo = re.search('[A-Z]1', frametype).group(0) obs = ifo[0] source = gwdatafind.find_urls(obs, frametype, start, end) - except HTMLError: # frame files not found - pass - else: # read from frame files - if isinstance(source, list) and isinstance(channel, (list, tuple)): - channel = remove_missing_channels(channel, source) - if source is not None: - return series_class.read( - source, channel, start=start, end=end, nproc=nproc, - verbose=verbose, **kwargs) + except HTMLError: # frame files not found + pass + if isinstance(source, list) and isinstance(channel, (list, tuple)): + channel = remove_missing_channels(channel, source) + if source is not None: # read from frame files + return series_class.read( + source, channel, start=start, end=end, nproc=nproc, + verbose=verbose, **kwargs) # read single channel from NDS if not isinstance(channel, (list, tuple)): From f2b9f1beddf84e2d59d5cbf171099d50cdb43130 Mon Sep 17 00:00:00 2001 From: Alexander Urban Date: Fri, 5 Apr 2019 09:08:30 -0500 Subject: [PATCH 5/6] Gracefully handle a bad frametype --- gwdetchar/io/datafind.py | 5 ++++- gwdetchar/io/tests/test_datafind.py | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/gwdetchar/io/datafind.py b/gwdetchar/io/datafind.py index 0729cd6aa..157a0baa6 100644 --- a/gwdetchar/io/datafind.py +++ b/gwdetchar/io/datafind.py @@ -192,7 +192,10 @@ def get_data(channel, start, end, frametype=None, source=None, ifo = re.search('[A-Z]1', frametype).group(0) obs = ifo[0] source = gwdatafind.find_urls(obs, frametype, start, end) - except HTMLError: # frame files not found + except AttributeError: + raise AttributeError( + 'Could not determine observatory from frametype') + except HTTPError: # frame files not found pass if isinstance(source, list) and isinstance(channel, (list, tuple)): channel = remove_missing_channels(channel, source) diff --git a/gwdetchar/io/tests/test_datafind.py b/gwdetchar/io/tests/test_datafind.py index b6cd3eda8..c41d57921 100644 --- a/gwdetchar/io/tests/test_datafind.py +++ b/gwdetchar/io/tests/test_datafind.py @@ -128,3 +128,10 @@ def test_get_data_dict_from_cache(tsdget, remove): assert data[channels[0]].span == Segment(start, end) nptest.assert_array_equal(data[channels[0]].value, HOFT.crop(start, end).value) + + +def test_get_data_bad_frametype(): + channel = 'X1:TEST-STRAIN' + with pytest.raises(AttributeError) as exc: + datafind.get_data(channel, start=0, end=32, frametype='bad_frametype') + assert 'Could not determine observatory' in str(exc.value) From ca4a2e02d4de61e2429072d9ae1ed051991a8306 Mon Sep 17 00:00:00 2001 From: Alexander Urban Date: Fri, 5 Apr 2019 14:28:41 -0500 Subject: [PATCH 6/6] More careful unit tests for observatory frametype --- gwdetchar/io/tests/test_datafind.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/gwdetchar/io/tests/test_datafind.py b/gwdetchar/io/tests/test_datafind.py index c41d57921..8e653271f 100644 --- a/gwdetchar/io/tests/test_datafind.py +++ b/gwdetchar/io/tests/test_datafind.py @@ -93,14 +93,19 @@ def test_get_data_dict_from_NDS(tsdget): nptest.assert_array_equal(data['X1:TEST-STRAIN'].value, HOFT.value) -@mock.patch('gwpy.timeseries.TimeSeries.read', - return_value=HOFT.crop(16, 48)) -def test_get_data_from_cache(tsget): +@mock.patch('gwdatafind.find_urls') +@mock.patch('gwpy.timeseries.TimeSeries.read') +def test_get_data_from_cache(tsget, find_data): + # set return values + find_data.return_value = ['test.gwf'] + tsget.return_value = HOFT.crop(16, 48) + # retrieve test frame start = 16 end = start + 32 channel = 'X1:TEST-STRAIN' - data = datafind.get_data(channel, start, end, source=True) + frametype = 'X1_TEST' + data = datafind.get_data(channel, start, end, frametype=frametype) # test data products assert isinstance(data, TimeSeries) @@ -109,13 +114,15 @@ def test_get_data_from_cache(tsget): nptest.assert_array_equal(data.value, HOFT.crop(start, end).value) +@mock.patch('gwdatafind.find_urls') @mock.patch('gwdetchar.io.datafind.remove_missing_channels') @mock.patch('gwpy.timeseries.TimeSeriesDict.read') -def test_get_data_dict_from_cache(tsdget, remove): +def test_get_data_dict_from_cache(tsdget, remove, find_data): # set return values tsdget.return_value = TimeSeriesDict({ 'X1:TEST-STRAIN': HOFT.crop(16, 48)}) remove.return_value = ['X1:TEST-STRAIN'] + find_data.return_value = ['test.gwf'] # retrieve test frame start = 16 end = start + 32