diff --git a/pydatajson/constants.py b/pydatajson/constants.py index f6d01fd..8ea221e 100644 --- a/pydatajson/constants.py +++ b/pydatajson/constants.py @@ -6,5 +6,4 @@ INVALID_STATUS_CODES_REGEX = ["^4[0-9]+$", "^5[0-9]+$"] EXCEPTION_STATUS_CODES = [429] -CANT_THREADS_BROKEN_URL_VALIDATOR = 10 DEFAULT_CHECK_TIMEOUT = 1 diff --git a/pydatajson/core.py b/pydatajson/core.py index 0f025e5..362ac1b 100644 --- a/pydatajson/core.py +++ b/pydatajson/core.py @@ -965,13 +965,15 @@ def _extract_datasets_to_harvest(cls, report): def generate_catalogs_indicators(self, catalogs=None, central_catalog=None, identifier_search=False, - broken_links=False): + broken_links=False, + broken_links_threads=1): catalogs = catalogs or self return indicators.generate_catalogs_indicators( catalogs, central_catalog, identifier_search=identifier_search, validator=self.validator, broken_links=broken_links, verify_ssl=self.verify_ssl, - url_check_timeout=self.url_check_timeout) + url_check_timeout=self.url_check_timeout, + broken_links_threads=broken_links_threads) def _count_fields_recursive(self, dataset, fields): """Cuenta la información de campos optativos/recomendados/requeridos diff --git a/pydatajson/indicators.py b/pydatajson/indicators.py index dfeab95..916df25 100644 --- a/pydatajson/indicators.py +++ b/pydatajson/indicators.py @@ -48,7 +48,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None, broken_links=False, validator=None, verify_ssl=True, - url_check_timeout=1): + url_check_timeout=1, + broken_links_threads=1): """Genera una lista de diccionarios con varios indicadores sobre los catálogos provistos, tales como la cantidad de datasets válidos, días desde su última fecha actualizada, entre otros. 
@@ -87,7 +88,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None, fields_count, result = _generate_indicators( catalog, validator=validator, broken_links=broken_links, verify_ssl=verify_ssl, - url_check_timeout=url_check_timeout) + url_check_timeout=url_check_timeout, + broken_links_threads=broken_links_threads) if central_catalog: result.update(_federation_indicators( catalog, central_catalog, identifier_search=identifier_search)) @@ -118,7 +120,7 @@ def generate_catalogs_indicators(catalogs, central_catalog=None, def _generate_indicators(catalog, validator=None, only_numeric=False, broken_links=False, verify_ssl=True, - url_check_timeout=1): + url_check_timeout=1, broken_links_threads=1): """Genera los indicadores de un catálogo individual. Args: @@ -138,7 +140,8 @@ def _generate_indicators(catalog, validator=None, only_numeric=False, if broken_links: result.update(_generate_valid_urls_indicators( catalog, validator=validator, verify_ssl=verify_ssl, - url_check_timeout=url_check_timeout)) + url_check_timeout=url_check_timeout, + threads_count=broken_links_threads)) # Genero los indicadores relacionados con fechas, y los agrego result.update( @@ -566,7 +569,7 @@ def _eventual_periodicity(periodicity): def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True, - url_check_timeout=1): + url_check_timeout=1, threads_count=1): """Genera indicadores sobre el estado de las urls de distribuciones Args: @@ -581,7 +584,8 @@ def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True, generator = \ StatusIndicatorsGenerator( catalog, validator=validator, verify_ssl=verify_ssl, - url_check_timeout=url_check_timeout) + url_check_timeout=url_check_timeout, + threads_count=threads_count) except Exception as e: msg = u'Error generando resumen del catálogo {}: {}'.format( catalog['title'], str(e)) diff --git a/pydatajson/status_indicators_generator.py b/pydatajson/status_indicators_generator.py index d2797ef..997b94a 100644 --- 
a/pydatajson/status_indicators_generator.py +++ b/pydatajson/status_indicators_generator.py @@ -10,7 +10,7 @@ class StatusIndicatorsGenerator(object): def __init__(self, catalog, validator=None, verify_ssl=True, - url_check_timeout=1): + url_check_timeout=1, threads_count=1): self.download_url_ok = None self.catalog = read_catalog(catalog) self.summary = generate_datasets_summary(self.catalog, @@ -18,6 +18,7 @@ def __init__(self, catalog, validator=None, verify_ssl=True, verify_ssl=verify_ssl) self.verify_url = verify_ssl self.url_check_timeout = url_check_timeout + self.threads_count = threads_count def datasets_cant(self): return len(self.summary) @@ -47,7 +48,8 @@ def distribuciones_download_url_ok_cant(self): if self.download_url_ok: return self.download_url_ok validator = DistributionDownloadUrlsValidator( - self.catalog, self.verify_url, self.url_check_timeout) + self.catalog, self.verify_url, self.url_check_timeout, + self.threads_count) self.download_url_ok = validator.validate() return self.download_url_ok diff --git a/pydatajson/validation.py b/pydatajson/validation.py index 504bca7..13781ff 100644 --- a/pydatajson/validation.py +++ b/pydatajson/validation.py @@ -54,11 +54,12 @@ def init_jsonschema_validator(self, schema_dir, schema_filename): schema=schema, resolver=resolver, format_checker=format_checker) def is_valid(self, catalog, broken_links=False, verify_ssl=True, - url_check_timeout=1): + url_check_timeout=1, broken_links_threads=1): return not self._get_errors(catalog, broken_links=broken_links, verify_ssl=verify_ssl, - url_check_timeout=url_check_timeout) + url_check_timeout=url_check_timeout, + broken_links_threads=broken_links_threads) def validate_catalog(self, catalog, only_errors=False, broken_links=False, verify_ssl=True, @@ -83,7 +84,7 @@ def validate_catalog(self, catalog, only_errors=False, return response def _get_errors(self, catalog, broken_links=False, verify_ssl=True, - url_check_timeout=1): + url_check_timeout=1, 
broken_links_threads=1): errors = list( self.jsonschema_validator.iter_errors(catalog) ) @@ -91,9 +92,10 @@ def _get_errors(self, catalog, broken_links=False, verify_ssl=True, for error in self._custom_errors( catalog, broken_links=broken_links, verify_ssl=verify_ssl, - url_check_timeout=url_check_timeout): + url_check_timeout=url_check_timeout, + broken_links_threads=broken_links_threads): errors.append(error) except Exception as e: logger.warning("Error de validación") return errors @@ -124,7 +127,7 @@ def _default_response(self, catalog): # noinspection PyTypeChecker def _custom_errors(self, catalog, broken_links=False, verify_ssl=True, - url_check_timeout=1): + url_check_timeout=1, broken_links_threads=1): """Realiza validaciones sin usar el jsonschema. En esta función se agregan bloques de código en python que realizan @@ -133,9 +136,11 @@ def _custom_errors(self, catalog, broken_links=False, verify_ssl=True, validators = self._validators_for_catalog(catalog) if broken_links: validators.append(LandingPagesValidator(catalog, verify_ssl, - url_check_timeout)) + url_check_timeout, + broken_links_threads)) validators.append(DistributionUrlsValidator(catalog, verify_ssl, - url_check_timeout)) + url_check_timeout, + broken_links_threads)) for validator in validators: for error in validator.validate(): diff --git a/pydatajson/validators/distribution_download_urls_validator.py b/pydatajson/validators/distribution_download_urls_validator.py index 26c13af..ae25837 100644 --- a/pydatajson/validators/distribution_download_urls_validator.py +++ b/pydatajson/validators/distribution_download_urls_validator.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from pydatajson import threading_helper, constants +from pydatajson import threading_helper from pydatajson.validators.url_validator import UrlValidator @@ -15,7 +15,7 @@ def validate(self): async_results += threading_helper \ .apply_threading(distribution_urls, self.is_working_url, -
constants.CANT_THREADS_BROKEN_URL_VALIDATOR) + self.threads_count) result = 0 for res, _ in async_results: diff --git a/pydatajson/validators/distribution_urls_validator.py b/pydatajson/validators/distribution_urls_validator.py index ae1c9a2..72952e0 100644 --- a/pydatajson/validators/distribution_urls_validator.py +++ b/pydatajson/validators/distribution_urls_validator.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- import pydatajson.custom_exceptions as ce -from pydatajson import threading_helper, constants +from pydatajson import threading_helper from pydatajson.validators.url_validator import UrlValidator @@ -30,7 +30,7 @@ def validate(self): sync_res = threading_helper \ .apply_threading(urls, self.is_working_url, - constants.CANT_THREADS_BROKEN_URL_VALIDATOR) + self.threads_count) for i in range(len(metadata)): actual_metadata = metadata[i] diff --git a/pydatajson/validators/landing_pages_validator.py b/pydatajson/validators/landing_pages_validator.py index 43ae680..f030b96 100644 --- a/pydatajson/validators/landing_pages_validator.py +++ b/pydatajson/validators/landing_pages_validator.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import pydatajson.custom_exceptions as ce -from pydatajson import threading_helper, constants +from pydatajson import threading_helper from pydatajson.validators.url_validator import UrlValidator @@ -26,7 +26,7 @@ def validate(self): sync_res = threading_helper \ .apply_threading(urls, self.is_working_url, - constants.CANT_THREADS_BROKEN_URL_VALIDATOR) + self.threads_count) for i in range(len(sync_res)): valid, status_code = sync_res[i] diff --git a/pydatajson/validators/url_validator.py b/pydatajson/validators/url_validator.py index 3d969e8..739bddb 100644 --- a/pydatajson/validators/url_validator.py +++ b/pydatajson/validators/url_validator.py @@ -12,10 +12,11 @@ class UrlValidator(SimpleValidator): - def __init__(self, catalog, verify_ssl, url_check_timeout): + def __init__(self, catalog, verify_ssl, 
url_check_timeout, threads_count): super(UrlValidator, self).__init__(catalog) self.verify_ssl = verify_ssl self.url_check_timeout = url_check_timeout + self.threads_count = threads_count def validate(self): raise NotImplementedError diff --git a/tests/test_catalog_readme.py b/tests/test_catalog_readme.py index b8cb6e0..cfc631c 100644 --- a/tests/test_catalog_readme.py +++ b/tests/test_catalog_readme.py @@ -20,8 +20,6 @@ from pydatajson.catalog_readme import generate_readme from tests.support.decorators import RESULTS_DIR -import pydatajson.constants -pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1 my_vcr = vcr.VCR(path_transformer=vcr.VCR.ensure_suffix('.yaml'), cassette_library_dir=os.path.join("tests", "cassetes"), diff --git a/tests/test_readers_and_writers.py b/tests/test_readers_and_writers.py index 87f6089..ff881e8 100644 --- a/tests/test_readers_and_writers.py +++ b/tests/test_readers_and_writers.py @@ -12,7 +12,6 @@ import nose import vcr -from pydatajson import constants from tests.support.factories.xlsx import CSV_TABLE, WRITE_XLSX_TABLE from tests.support.factories.xlsx import READ_XLSX_TABLE @@ -28,8 +27,6 @@ from tests import xl_methods import openpyxl as pyxl -import pydatajson.constants -pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1 my_vcr = vcr.VCR( path_transformer=vcr.VCR.ensure_suffix('.yaml'), diff --git a/tests/test_validation.py b/tests/test_validation.py index de7048d..9631ed7 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -24,8 +24,6 @@ from .context import pydatajson from .support.decorators import RESULTS_DIR -import pydatajson.constants -pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1 my_vcr = vcr.VCR(path_transformer=vcr.VCR.ensure_suffix('.yaml'), cassette_library_dir=os.path.join("tests", "cassetes"), diff --git a/tests/test_validators.py b/tests/test_validators.py index 3cbc14b..a96d239 100644 --- a/tests/test_validators.py +++ b/tests/test_validators.py @@ -25,10 +25,6 @@ 
except ImportError: from unittest import mock -import pydatajson.constants - -pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1 - class ValidatorsTestCase(unittest.TestCase): SAMPLES_DIR = os.path.join("tests", "samples") @@ -46,27 +42,27 @@ def setUp(self): self.tinr_validator = \ ThemeIdsNotRepeatedValidator(self.catalog) self.ddu_validator = \ - DistributionUrlsValidator(self.catalog, True, 1) + DistributionUrlsValidator(self.catalog, True, 1, 10) self.lp_validator = \ - LandingPagesValidator(self.catalog, True, 1) + LandingPagesValidator(self.catalog, True, 1, 10) @requests_mock.Mocker() def test_is_working_url_valid_url(self, req_mock): - url_validator = UrlValidator(self.catalog, True, 1) + url_validator = UrlValidator(self.catalog, True, 1, 10) req_mock.head(self.test_url) self.assertEqual( (True, 200), url_validator.is_working_url(self.test_url)) @requests_mock.Mocker() def test_is_working_url_invalid_url(self, req_mock): - url_validator = UrlValidator(self.catalog, True, 1) + url_validator = UrlValidator(self.catalog, True, 1, 10) req_mock.head(self.test_url, status_code=400) self.assertEqual( (False, 400), url_validator.is_working_url(self.test_url)) @requests_mock.Mocker() def test_is_working_url_too_many_requests_response(self, req_mock): - url_validator = UrlValidator(self.catalog, True, 1) + url_validator = UrlValidator(self.catalog, True, 1, 10) too_many_request_status_code = 429 req_mock.head(self.test_url, status_code=too_many_request_status_code) @@ -76,20 +72,20 @@ def test_is_working_url_too_many_requests_response(self, req_mock): @requests_mock.Mocker() def test_is_working_url_url_with_exception(self, req_mock): - url_validator = UrlValidator(self.catalog, True, 1) + url_validator = UrlValidator(self.catalog, True, 1, 10) req_mock.head(self.test_url, exc=ConnectionError) self.assertEqual( (False, None), url_validator.is_working_url(self.test_url)) @requests_mock.Mocker() def test_is_working_url_url_with_timeout(self, req_mock): - 
url_validator = UrlValidator(self.catalog, True, 1) + url_validator = UrlValidator(self.catalog, True, 1, 10) req_mock.head(self.test_url, exc=Timeout) self.assertEqual( (False, 408), url_validator.is_working_url(self.test_url)) def test_is_working_url_malformed_values(self): - url_validator = UrlValidator(self.catalog, True, 1) + url_validator = UrlValidator(self.catalog, True, 1, 10) self.assertEqual( (False, None), url_validator.is_working_url('malformed_value')) self.assertEqual( @@ -99,7 +95,7 @@ def test_is_working_url_malformed_values(self): def test_valid_landing_page_validator(self): lp_validator = \ - LandingPagesValidator(self.catalog, True, 1) + LandingPagesValidator(self.catalog, True, 1, 10) with mock.patch( 'pydatajson' '.validators' @@ -110,7 +106,7 @@ def test_valid_landing_page_validator(self): def test_invalid_landing_page_validator(self): lp_validator = \ - LandingPagesValidator(self.catalog, True, 1) + LandingPagesValidator(self.catalog, True, 1, 10) with mock.patch( 'pydatajson' '.validators' @@ -121,7 +117,7 @@ def test_invalid_landing_page_validator(self): def test_valid_distribution_url_validator(self): ddu_validator = \ - DistributionUrlsValidator(self.catalog, True, 1) + DistributionUrlsValidator(self.catalog, True, 1, 10) with mock.patch( 'pydatajson' '.validators' @@ -132,7 +128,7 @@ def test_valid_distribution_url_validator(self): def test_invalid_distribution_url_validator(self): ddu_validator = \ - DistributionUrlsValidator(self.catalog, True, 1) + DistributionUrlsValidator(self.catalog, True, 1, 10) with mock.patch( 'pydatajson' '.validators' @@ -191,7 +187,7 @@ def test_invalid_theme_ids_not_repeated_validator(self): @requests_mock.Mocker() def test_url_check_timeout(self, req_mock): - url_validator = UrlValidator(self.catalog, True, 100) + url_validator = UrlValidator(self.catalog, True, 100, 10) req_mock.head(self.test_url) url_validator.is_working_url(self.test_url) self.assertEqual(100, req_mock.request_history[0].timeout)