diff --git a/pydatajson/constants.py b/pydatajson/constants.py
index cf0fb66..f6d01fd 100644
--- a/pydatajson/constants.py
+++ b/pydatajson/constants.py
@@ -7,3 +7,4 @@
 EXCEPTION_STATUS_CODES = [429]
 
 CANT_THREADS_BROKEN_URL_VALIDATOR = 10
+DEFAULT_CHECK_TIMEOUT = 1
diff --git a/pydatajson/core.py b/pydatajson/core.py
index 90ca6f0..0f025e5 100644
--- a/pydatajson/core.py
+++ b/pydatajson/core.py
@@ -54,7 +54,8 @@ class DataJson(dict):
     def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
                  default_values=None, catalog_format=None,
                  validator_class=Validator, verify_ssl=False,
-                 requests_timeout=constants.REQUESTS_TIMEOUT):
+                 requests_timeout=constants.REQUESTS_TIMEOUT,
+                 url_check_timeout=constants.DEFAULT_CHECK_TIMEOUT):
         """Lee un catálogo y crea un objeto con funciones para manipularlo.
 
         Salvo que se indique lo contrario, se utiliza como default el schema
@@ -81,6 +82,7 @@ def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
         """
         self.verify_ssl = verify_ssl
         self.requests_timeout = requests_timeout
+        self.url_check_timeout = url_check_timeout
 
         # se construye el objeto DataJson con la interfaz de un dicconario
         if catalog:
@@ -245,14 +247,15 @@ def is_valid_catalog(self, catalog=None, broken_links=False):
         Args:
             catalog (str o dict): Catálogo (dict, JSON o XLSX) a ser validado.
                 Si no se pasa, valida este catálogo.
+            broken_links(bool): Activa el checkeo de estados de urls
 
         Returns:
             bool: True si el data.json cumple con el schema, sino False.
         """
         catalog = self._read_catalog(catalog) if catalog else self
-        return self.validator.is_valid(catalog,
-                                       broken_links=broken_links,
-                                       verify_ssl=self.verify_ssl)
+        return self.validator.is_valid(
+            catalog, broken_links=broken_links, verify_ssl=self.verify_ssl,
+            url_check_timeout=self.url_check_timeout)
 
     @staticmethod
     def _update_validation_response(error, response):
@@ -344,10 +347,9 @@ def validate_catalog(self, catalog=None, only_errors=False, fmt="dict",
         """
         catalog = self._read_catalog(catalog) if catalog else self
 
-        validation = self.validator.validate_catalog(catalog,
-                                                     only_errors,
-                                                     broken_links,
-                                                     self.verify_ssl)
+        validation = self.validator.validate_catalog(
+            catalog, only_errors, broken_links, self.verify_ssl,
+            self.url_check_timeout)
 
         if export_path:
             fmt = 'table'
@@ -968,7 +970,8 @@ def generate_catalogs_indicators(self, catalogs=None,
         return indicators.generate_catalogs_indicators(
             catalogs, central_catalog, identifier_search=identifier_search,
             validator=self.validator, broken_links=broken_links,
-            verify_ssl=self.verify_ssl)
+            verify_ssl=self.verify_ssl,
+            url_check_timeout=self.url_check_timeout)
 
     def _count_fields_recursive(self, dataset, fields):
         """Cuenta la información de campos optativos/recomendados/requeridos
diff --git a/pydatajson/indicators.py b/pydatajson/indicators.py
index bbf4ffb..dfeab95 100644
--- a/pydatajson/indicators.py
+++ b/pydatajson/indicators.py
@@ -47,7 +47,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
                                  identifier_search=False,
                                  broken_links=False,
                                  validator=None,
-                                 verify_ssl=True):
+                                 verify_ssl=True,
+                                 url_check_timeout=1):
     """Genera una lista de diccionarios con varios indicadores sobre los
     catálogos provistos, tales como la cantidad de datasets válidos, días
     desde su última fecha actualizada, entre otros.
@@ -85,7 +86,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
 
         fields_count, result = _generate_indicators(
            catalog, validator=validator,
-            broken_links=broken_links, verify_ssl=verify_ssl)
+            broken_links=broken_links, verify_ssl=verify_ssl,
+            url_check_timeout=url_check_timeout)
         if central_catalog:
             result.update(_federation_indicators(
                 catalog, central_catalog, identifier_search=identifier_search))
@@ -115,7 +117,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
 
 
 def _generate_indicators(catalog, validator=None, only_numeric=False,
-                         broken_links=False, verify_ssl=True):
+                         broken_links=False, verify_ssl=True,
+                         url_check_timeout=1):
     """Genera los indicadores de un catálogo individual.
 
     Args:
@@ -133,9 +136,9 @@ def _generate_indicators(catalog, validator=None, only_numeric=False,
 
     # Genero indicadores relacionados con validacion de urls
     if broken_links:
-        result.update(_generate_valid_urls_indicators(catalog,
-                                                      validator=validator,
-                                                      verify_ssl=verify_ssl))
+        result.update(_generate_valid_urls_indicators(
+            catalog, validator=validator, verify_ssl=verify_ssl,
+            url_check_timeout=url_check_timeout))
 
     # Genero los indicadores relacionados con fechas, y los agrego
     result.update(
@@ -562,7 +565,8 @@ def _eventual_periodicity(periodicity):
     return periodicity in ('eventual', 'EVENTUAL')
 
 
-def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):
+def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True,
+                                    url_check_timeout=1):
     """Genera indicadores sobre el estado de las urls de distribuciones
 
     Args:
@@ -575,9 +579,9 @@ def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):
     result = {}
     try:
         generator = \
-            StatusIndicatorsGenerator(catalog,
-                                      validator=validator,
-                                      verify_ssl=verify_ssl)
+            StatusIndicatorsGenerator(
+                catalog, validator=validator, verify_ssl=verify_ssl,
+                url_check_timeout=url_check_timeout)
     except Exception as e:
         msg = u'Error generando resumen del catálogo {}: {}'.format(
             catalog['title'], str(e))
diff --git a/pydatajson/reporting.py b/pydatajson/reporting.py
index 7ea5b4f..05f55ce 100644
--- a/pydatajson/reporting.py
+++ b/pydatajson/reporting.py
@@ -17,8 +17,8 @@
 from .validation import validate_catalog
 
 
-def generate_datasets_summary(catalog, export_path=None,
-                              validator=None, verify_ssl=True):
+def generate_datasets_summary(catalog, export_path=None, validator=None,
+                              verify_ssl=True, url_check_timeout=1):
     """Genera un informe sobre los datasets presentes en un catálogo,
     indicando para cada uno:
         - Índice en la lista catalog["dataset"]
@@ -53,9 +53,8 @@
 
     datasets = []
     validation = validate_catalog(
-        catalog,
-        validator=validator,
-        verify_ssl=verify_ssl)["error"]["dataset"]
+        catalog, validator=validator, verify_ssl=verify_ssl,
+        url_check_timeout=url_check_timeout)["error"]["dataset"]
 
     def info_dataset(index, dataset):
         """Recolecta información básica de un dataset."""
diff --git a/pydatajson/status_indicators_generator.py b/pydatajson/status_indicators_generator.py
index a7c5a76..d2797ef 100644
--- a/pydatajson/status_indicators_generator.py
+++ b/pydatajson/status_indicators_generator.py
@@ -9,13 +9,15 @@
 
 
 class StatusIndicatorsGenerator(object):
-    def __init__(self, catalog, validator=None, verify_ssl=True):
+    def __init__(self, catalog, validator=None, verify_ssl=True,
+                 url_check_timeout=1):
         self.download_url_ok = None
         self.catalog = read_catalog(catalog)
         self.summary = generate_datasets_summary(self.catalog,
                                                  validator=validator,
                                                  verify_ssl=verify_ssl)
         self.verify_url = verify_ssl
+        self.url_check_timeout = url_check_timeout
 
     def datasets_cant(self):
         return len(self.summary)
@@ -45,7 +47,7 @@ def distribuciones_download_url_ok_cant(self):
         if self.download_url_ok:
             return self.download_url_ok
         validator = DistributionDownloadUrlsValidator(
-            self.catalog, self.verify_url)
+            self.catalog, self.verify_url, self.url_check_timeout)
         self.download_url_ok = validator.validate()
         return self.download_url_ok
 
diff --git a/pydatajson/validation.py b/pydatajson/validation.py
index baf9ca2..504bca7 100644
--- a/pydatajson/validation.py
+++ b/pydatajson/validation.py
@@ -53,16 +53,21 @@ def init_jsonschema_validator(self, schema_dir, schema_filename):
         return jsonschema.Draft4Validator(
             schema=schema, resolver=resolver, format_checker=format_checker)
 
-    def is_valid(self, catalog, broken_links=False, verify_ssl=True):
+    def is_valid(self, catalog, broken_links=False, verify_ssl=True,
+                 url_check_timeout=1):
         return not self._get_errors(catalog,
                                     broken_links=broken_links,
-                                    verify_ssl=verify_ssl)
+                                    verify_ssl=verify_ssl,
+                                    url_check_timeout=url_check_timeout)
 
     def validate_catalog(self, catalog, only_errors=False,
-                         broken_links=False, verify_ssl=True):
+                         broken_links=False, verify_ssl=True,
+                         url_check_timeout=1):
+
         default_response = self._default_response(catalog)
         errors = self._get_errors(catalog, broken_links=broken_links,
-                                  verify_ssl=verify_ssl)
+                                  verify_ssl=verify_ssl,
+                                  url_check_timeout=url_check_timeout)
         response = default_response.copy()
 
         for error in errors:
@@ -77,14 +82,16 @@ def validate_catalog(self, catalog, only_errors=False,
 
         return response
 
-    def _get_errors(self, catalog, broken_links=False, verify_ssl=True):
+    def _get_errors(self, catalog, broken_links=False, verify_ssl=True,
+                    url_check_timeout=1):
         errors = list(
             self.jsonschema_validator.iter_errors(catalog)
         )
         try:
-            for error in self._custom_errors(catalog,
-                                             broken_links=broken_links,
-                                             verify_ssl=verify_ssl):
+            for error in self._custom_errors(
+                catalog, broken_links=broken_links,
+                verify_ssl=verify_ssl,
+                url_check_timeout=url_check_timeout):
                 errors.append(error)
         except Exception as e:
             logger.warning("Error de validación")
@@ -116,7 +123,8 @@ def _default_response(self, catalog):
         }
 
     # noinspection PyTypeChecker
-    def _custom_errors(self, catalog, broken_links=False, verify_ssl=True):
+    def _custom_errors(self, catalog, broken_links=False, verify_ssl=True,
+                       url_check_timeout=1):
         """Realiza validaciones sin usar el jsonschema.
 
         En esta función se agregan bloques de código en python que realizan
@@ -124,8 +132,10 @@ def _custom_errors(self, catalog, broken_links=False, verify_ssl=True):
         """
         validators = self._validators_for_catalog(catalog)
         if broken_links:
-            validators.append(LandingPagesValidator(catalog, verify_ssl))
-            validators.append(DistributionUrlsValidator(catalog, verify_ssl))
+            validators.append(LandingPagesValidator(catalog, verify_ssl,
+                                                    url_check_timeout))
+            validators.append(DistributionUrlsValidator(catalog, verify_ssl,
+                                                        url_check_timeout))
 
         for validator in validators:
             for error in validator.validate():
@@ -175,7 +185,8 @@ def _update_validation_response(self, error, response):
         return new_response
 
 
-def is_valid_catalog(catalog, validator=None, verify_ssl=True):
+def is_valid_catalog(catalog, validator=None, verify_ssl=True,
+                     url_check_timeout=1):
     """Valida que un archivo `data.json` cumpla con el schema definido.
 
     Chequea que el data.json tiene todos los campos obligatorios y que
@@ -195,12 +206,13 @@ def is_valid_catalog(catalog, validator=None, verify_ssl=True):
     else:
         validator = Validator()
 
-    return validator.is_valid(catalog, verify_ssl=verify_ssl)
+    return validator.is_valid(catalog, verify_ssl=verify_ssl,
+                              url_check_timeout=url_check_timeout)
 
 
 def validate_catalog(catalog, only_errors=False, fmt="dict",
                      export_path=None, validator=None,
-                     verify_ssl=True):
+                     verify_ssl=True, url_check_timeout=1):
     """Analiza un data.json registrando los errores que encuentra.
 
     Chequea que el data.json tiene todos los campos obligatorios y que
@@ -260,4 +272,5 @@ def validate_catalog(catalog, only_errors=False, fmt="dict",
 
 
     return validator.validate_catalog(catalog, only_errors,
-                                      verify_ssl=verify_ssl)
+                                      verify_ssl=verify_ssl,
+                                      url_check_timeout=url_check_timeout)
diff --git a/pydatajson/validators/url_validator.py b/pydatajson/validators/url_validator.py
index 63654af..3d969e8 100644
--- a/pydatajson/validators/url_validator.py
+++ b/pydatajson/validators/url_validator.py
@@ -12,16 +12,19 @@
 
 
 class UrlValidator(SimpleValidator):
-    def __init__(self, catalog, verify_ssl):
+    def __init__(self, catalog, verify_ssl, url_check_timeout):
         super(UrlValidator, self).__init__(catalog)
         self.verify_ssl = verify_ssl
+        self.url_check_timeout = url_check_timeout
 
     def validate(self):
         raise NotImplementedError
 
     def is_working_url(self, url):
         try:
-            response = requests.head(url, timeout=1, verify=self.verify_ssl)
+            response = requests.head(url,
+                                      timeout=self.url_check_timeout,
+                                      verify=self.verify_ssl)
             matches = []
             if response.status_code not in EXCEPTION_STATUS_CODES:
                 matches = \
diff --git a/tests/test_status_indicators_generator.py b/tests/test_status_indicators_generator.py
index f0965b6..3bd5598 100644
--- a/tests/test_status_indicators_generator.py
+++ b/tests/test_status_indicators_generator.py
@@ -162,3 +162,19 @@ def test_full_distribuciones_download_url_ok_pct(self, req_mock):
     def test_empty_distribuciones_download_url_ok_pct(self):
         self.assertEqual(
             None, self.gen_empty.distribuciones_download_url_ok_pct())
+
+    @requests_mock.Mocker()
+    def test_check_url_default_timeout(self, req_mock):
+        req_mock.head(requests_mock.ANY, text='resp')
+        self.gen_justicia.distribuciones_download_url_ok_pct()
+        for request in req_mock.request_history:
+            self.assertEqual(1, request.timeout)
+
+    @requests_mock.Mocker()
+    def test_check_url_override_timeout(self, req_mock):
+        generator = StatusIndicatorsGenerator(
+            self.get_sample('catalogo_justicia.json'), url_check_timeout=10)
+        req_mock.head(requests_mock.ANY, text='resp')
+        generator.distribuciones_download_url_ok_pct()
+        for request in req_mock.request_history:
+            self.assertEqual(10, request.timeout)
diff --git a/tests/test_validators.py b/tests/test_validators.py
index 1464909..3cbc14b 100644
--- a/tests/test_validators.py
+++ b/tests/test_validators.py
@@ -46,27 +46,27 @@ def setUp(self):
         self.tinr_validator = \
             ThemeIdsNotRepeatedValidator(self.catalog)
         self.ddu_validator = \
-            DistributionUrlsValidator(self.catalog, True)
+            DistributionUrlsValidator(self.catalog, True, 1)
         self.lp_validator = \
-            LandingPagesValidator(self.catalog, True)
+            LandingPagesValidator(self.catalog, True, 1)
 
     @requests_mock.Mocker()
     def test_is_working_url_valid_url(self, req_mock):
-        url_validator = UrlValidator(self.catalog, True)
+        url_validator = UrlValidator(self.catalog, True, 1)
         req_mock.head(self.test_url)
         self.assertEqual(
             (True, 200), url_validator.is_working_url(self.test_url))
 
     @requests_mock.Mocker()
     def test_is_working_url_invalid_url(self, req_mock):
-        url_validator = UrlValidator(self.catalog, True)
+        url_validator = UrlValidator(self.catalog, True, 1)
         req_mock.head(self.test_url, status_code=400)
         self.assertEqual(
             (False, 400), url_validator.is_working_url(self.test_url))
 
     @requests_mock.Mocker()
     def test_is_working_url_too_many_requests_response(self, req_mock):
-        url_validator = UrlValidator(self.catalog, True)
+        url_validator = UrlValidator(self.catalog, True, 1)
         too_many_request_status_code = 429
         req_mock.head(self.test_url,
                       status_code=too_many_request_status_code)
@@ -76,20 +76,20 @@ def test_is_working_url_too_many_requests_response(self, req_mock):
 
     @requests_mock.Mocker()
     def test_is_working_url_url_with_exception(self, req_mock):
-        url_validator = UrlValidator(self.catalog, True)
+        url_validator = UrlValidator(self.catalog, True, 1)
         req_mock.head(self.test_url, exc=ConnectionError)
         self.assertEqual(
             (False, None), url_validator.is_working_url(self.test_url))
 
     @requests_mock.Mocker()
     def test_is_working_url_url_with_timeout(self, req_mock):
-        url_validator = UrlValidator(self.catalog, True)
+        url_validator = UrlValidator(self.catalog, True, 1)
         req_mock.head(self.test_url, exc=Timeout)
         self.assertEqual(
             (False, 408), url_validator.is_working_url(self.test_url))
 
     def test_is_working_url_malformed_values(self):
-        url_validator = UrlValidator(self.catalog, True)
+        url_validator = UrlValidator(self.catalog, True, 1)
         self.assertEqual(
             (False, None), url_validator.is_working_url('malformed_value'))
         self.assertEqual(
@@ -99,7 +99,7 @@ def test_is_working_url_malformed_values(self):
 
     def test_valid_landing_page_validator(self):
         lp_validator = \
-            LandingPagesValidator(self.catalog, True)
+            LandingPagesValidator(self.catalog, True, 1)
         with mock.patch(
                 'pydatajson'
                 '.validators'
@@ -110,7 +110,7 @@ def test_valid_landing_page_validator(self):
 
     def test_invalid_landing_page_validator(self):
         lp_validator = \
-            LandingPagesValidator(self.catalog, True)
+            LandingPagesValidator(self.catalog, True, 1)
         with mock.patch(
                 'pydatajson'
                 '.validators'
@@ -121,7 +121,7 @@ def test_invalid_landing_page_validator(self):
 
     def test_valid_distribution_url_validator(self):
         ddu_validator = \
-            DistributionUrlsValidator(self.catalog, True)
+            DistributionUrlsValidator(self.catalog, True, 1)
         with mock.patch(
                 'pydatajson'
                 '.validators'
@@ -132,7 +132,7 @@ def test_valid_distribution_url_validator(self):
 
     def test_invalid_distribution_url_validator(self):
         ddu_validator = \
-            DistributionUrlsValidator(self.catalog, True)
+            DistributionUrlsValidator(self.catalog, True, 1)
         with mock.patch(
                 'pydatajson'
                 '.validators'
@@ -188,3 +188,10 @@ def test_invalid_theme_ids_not_repeated_validator(self):
                 return_value=['convocatorias']):
             res = tinr_validator.validate()
             self.assertNotEqual(0, len(list(res)))
+
+    @requests_mock.Mocker()
+    def test_url_check_timeout(self, req_mock):
+        url_validator = UrlValidator(self.catalog, True, 100)
+        req_mock.head(self.test_url)
+        url_validator.is_working_url(self.test_url)
+        self.assertEqual(100, req_mock.request_history[0].timeout)
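
A minimal usage sketch of the new parameter as wired through DataJson above (the "data.json" path and the 10 second value are illustrative, not part of the change):

    from pydatajson import DataJson

    # Each HEAD request issued while checking URLs waits up to 10 seconds
    # instead of the 1 second default (constants.DEFAULT_CHECK_TIMEOUT).
    catalog = DataJson("data.json", url_check_timeout=10)

    is_valid = catalog.is_valid_catalog(broken_links=True)
    report = catalog.validate_catalog(broken_links=True)

The same keyword is accepted by the module-level validation.is_valid_catalog and validation.validate_catalog helpers.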