diff --git a/HISTORY.md b/HISTORY.md index 7bbfc03..c0dc90d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,11 @@ History ======= +0.3.19 (2017-10-31) +------------------- + +* Agrego métodos de búsqueda de series de tiempo en un catálogo (`get_time_series()`) y un parámetro `only_time_series=True or False` para filtrar datasets y distribuciones en sus métodos de búsqueda (`get_datasets(only_time_series=True)` devuelve sólo aquellos datasets que tengan alguna serie de tiempo). + 0.3.18 (2017-10-19) ------------------- diff --git a/pydatajson/__init__.py b/pydatajson/__init__.py index 6156ac3..d7449d5 100644 --- a/pydatajson/__init__.py +++ b/pydatajson/__init__.py @@ -10,4 +10,4 @@ __author__ = """Datos Argentina""" __email__ = 'datos@modernizacion.gob.ar' -__version__ = '0.3.18' +__version__ = '0.3.19' diff --git a/pydatajson/core.py b/pydatajson/core.py index dfec3ac..92001ec 100644 --- a/pydatajson/core.py +++ b/pydatajson/core.py @@ -94,6 +94,8 @@ def __init__(self, catalog=None, schema_filename=None, schema_dir=None, distributions = property(get_distributions) get_fields = search.get_fields fields = property(get_fields) + get_time_series = search.get_time_series + time_series = property(get_time_series) get_dataset = search.get_dataset get_distribution = search.get_distribution get_field = search.get_field diff --git a/pydatajson/search.py b/pydatajson/search.py index 9fef537..d890671 100644 --- a/pydatajson/search.py +++ b/pydatajson/search.py @@ -14,6 +14,7 @@ from functools import partial from time_series import distribution_has_time_index, dataset_has_time_series +from time_series import field_is_time_series from readers import read_catalog import custom_exceptions as ce @@ -101,21 +102,26 @@ def get_distributions(catalog, filter_in=None, filter_out=None, return filtered_distributions -def get_fields(catalog, filter_in=None, filter_out=None, meta_field=None): +def get_fields(catalog, filter_in=None, filter_out=None, meta_field=None, + 
def get_time_series(catalog, **kwargs):
    """Return the fields of a catalog that qualify as time series.

    Thin wrapper around ``get_fields``: every keyword argument is forwarded
    unchanged, except ``only_time_series``, which is always forced to True
    (a value supplied by the caller is overridden).

    Args:
        catalog: Catalog passed through to ``get_fields``.
        **kwargs: Any other keyword argument accepted by ``get_fields``.

    Returns:
        Whatever ``get_fields`` returns for the forwarded arguments.
    """
    forwarded = dict(kwargs, only_time_series=True)
    return get_fields(catalog, **forwarded)
def field_is_time_series(field, distribution=None):
    """Tell whether a field's metadata describes a time series.

    A field qualifies when it declares neither ``specialType`` nor
    ``specialTypeDetail``, its ``type`` is "number" or "integer"
    (case-insensitive) and it has a non-empty ``id``. When a distribution is
    given, it must additionally pass ``distribution_has_time_index``.

    Args:
        field (dict): Metadata of the field to check.
        distribution (dict): Optional distribution the field belongs to.
            When omitted or empty, only the field itself is checked.

    Returns:
        bool: True if the field may be a time series, False otherwise.
    """
    # "type" can be absent or explicitly null in the metadata; treat both as
    # "no type" instead of failing on None.lower().
    field_type = (field.get("type") or "").lower()

    field_may_be_ts = (
        not field.get("specialType") and
        not field.get("specialTypeDetail") and
        field_type in ("number", "integer") and
        bool(field.get("id"))
    )

    # Without a distribution there is nothing else to verify.
    distribution_may_have_ts = (
        not distribution or bool(distribution_has_time_index(distribution))
    )

    # Always answer with a real bool, never the raw "id" string or None that
    # plain `and` chaining would leak to the caller.
    return bool(field_may_be_ts and distribution_may_have_ts)
b/tests/results/time_series.json @@ -0,0 +1,46 @@ +[ + { + "distribution_identifier": "1.2", + "description": "PIB a precios de comprador, en millones de pesos de 1993 y valores anuales.", + "title": "oferta_global_pib", + "dataset_identifier": "1", + "scrapingIdentifierCell": "B46", + "units": "Millones de pesos a precios de 1993", + "type": "number", + "id": "1.2_OGP_D_1993_T_17", + "scrapingDataStartCell": "B47" + }, + { + "distribution_identifier": "1.2", + "description": "Importación a precios de comprador, en millones de pesos de 1993 y valores anuales.", + "title": "oferta_global_importacion", + "dataset_identifier": "1", + "scrapingIdentifierCell": "C46", + "units": "Millones de pesos a precios de 1993", + "type": "number", + "id": "1.2_OGI_D_1993_T_25", + "scrapingDataStartCell": "C47" + }, + { + "distribution_identifier": "1.2", + "description": "Oferta global total a precios de comprador, en millones de pesos de 1993 y valores anuales.", + "title": "demanda_global_exportacion", + "dataset_identifier": "1", + "scrapingIdentifierCell": "D46", + "units": "Millones de pesos a precios de 1993", + "type": "number", + "id": "1.2_DGE_D_1993_T_26", + "scrapingDataStartCell": "D47" + }, + { + "distribution_identifier": "1.2", + "description": "Consumo privado, en millones de pesos de 1993 y valores anuales.", + "title": "demanda_global_ibif", + "dataset_identifier": "1", + "scrapingIdentifierCell": "E46", + "units": "Millones de pesos a precios de 1993", + "type": "number", + "id": "1.2_DGI_D_1993_T_19", + "scrapingDataStartCell": "E47" + } +] diff --git a/tests/samples/time_series_data.json b/tests/samples/time_series_data.json index fb034a0..b9b3701 100644 --- a/tests/samples/time_series_data.json +++ b/tests/samples/time_series_data.json @@ -607,13 +607,12 @@ "type": "number", "scrapingIdentifierCell": "F46", "scrapingDataStartCell": "F47", - "units": "Millones de pesos a precios de 1993", - "id": "1.2_DGCP_D_1993_T_27" + "units": "Millones de pesos a precios 
de 1993" }, { "description": "Inversion bruta interna fija, en millones de pesos de 1993 y valores anuales.", "title": "demanda_global_consumo_publico", - "type": "number", + "type": "string", "scrapingIdentifierCell": "G46", "scrapingDataStartCell": "G47", "units": "Millones de pesos a precios de 1993", diff --git a/tests/test_search.py b/tests/test_search.py index 9963f07..7781031 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -77,6 +77,12 @@ def test_fields(self, expected_result): pprint(fields) self.assertEqual(expected_result, fields) + @load_expected_result() + def test_time_series(self, expected_result): + time_series = pydatajson.search.get_time_series(self.catalog_ts) + pprint(time_series) + self.assertEqual(expected_result, time_series) + @load_expected_result() def test_datasets_filter_in(self, expected_result): datasets = pydatajson.search.get_datasets(