Skip to content

Commit

Permalink
parametrizo el timeout de chequeo de urls
Browse files Browse the repository at this point in the history
  • Loading branch information
lrromero committed Oct 28, 2019
1 parent b1ba69e commit 08837b6
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 55 deletions.
1 change: 1 addition & 0 deletions pydatajson/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
EXCEPTION_STATUS_CODES = [429]

CANT_THREADS_BROKEN_URL_VALIDATOR = 10
DEFAULT_CHECK_TIMEOUT = 1
21 changes: 12 additions & 9 deletions pydatajson/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ class DataJson(dict):
def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
default_values=None, catalog_format=None,
validator_class=Validator, verify_ssl=False,
requests_timeout=constants.REQUESTS_TIMEOUT):
requests_timeout=constants.REQUESTS_TIMEOUT,
url_check_timeout=constants.DEFAULT_CHECK_TIMEOUT):
"""Lee un catálogo y crea un objeto con funciones para manipularlo.
Salvo que se indique lo contrario, se utiliza como default el schema
Expand All @@ -81,6 +82,7 @@ def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
"""
self.verify_ssl = verify_ssl
self.requests_timeout = requests_timeout
self.url_check_timeout = url_check_timeout
# se construye el objeto DataJson con la interfaz de un diccionario
if catalog:

Expand Down Expand Up @@ -245,14 +247,15 @@ def is_valid_catalog(self, catalog=None, broken_links=False):
Args:
catalog (str o dict): Catálogo (dict, JSON o XLSX) a ser validado.
Si no se pasa, valida este catálogo.
broken_links (bool): Activa el chequeo de estados de urls
Returns:
bool: True si el data.json cumple con el schema, sino False.
"""
catalog = self._read_catalog(catalog) if catalog else self
return self.validator.is_valid(catalog,
broken_links=broken_links,
verify_ssl=self.verify_ssl)
return self.validator.is_valid(
catalog, broken_links=broken_links, verify_ssl=self.verify_ssl,
url_check_timeout=self.url_check_timeout)

@staticmethod
def _update_validation_response(error, response):
Expand Down Expand Up @@ -344,10 +347,9 @@ def validate_catalog(self, catalog=None, only_errors=False, fmt="dict",
"""
catalog = self._read_catalog(catalog) if catalog else self

validation = self.validator.validate_catalog(catalog,
only_errors,
broken_links,
self.verify_ssl)
validation = self.validator.validate_catalog(
catalog, only_errors, broken_links, self.verify_ssl,
self.url_check_timeout)
if export_path:
fmt = 'table'

Expand Down Expand Up @@ -968,7 +970,8 @@ def generate_catalogs_indicators(self, catalogs=None,
return indicators.generate_catalogs_indicators(
catalogs, central_catalog, identifier_search=identifier_search,
validator=self.validator, broken_links=broken_links,
verify_ssl=self.verify_ssl)
verify_ssl=self.verify_ssl,
url_check_timeout=self.url_check_timeout)

def _count_fields_recursive(self, dataset, fields):
"""Cuenta la información de campos optativos/recomendados/requeridos
Expand Down
24 changes: 14 additions & 10 deletions pydatajson/indicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
identifier_search=False,
broken_links=False,
validator=None,
verify_ssl=True):
verify_ssl=True,
url_check_timeout=1):
"""Genera una lista de diccionarios con varios indicadores sobre
los catálogos provistos, tales como la cantidad de datasets válidos,
días desde su última fecha actualizada, entre otros.
Expand Down Expand Up @@ -85,7 +86,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,

fields_count, result = _generate_indicators(
catalog, validator=validator,
broken_links=broken_links, verify_ssl=verify_ssl)
broken_links=broken_links, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
if central_catalog:
result.update(_federation_indicators(
catalog, central_catalog, identifier_search=identifier_search))
Expand Down Expand Up @@ -115,7 +117,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,


def _generate_indicators(catalog, validator=None, only_numeric=False,
broken_links=False, verify_ssl=True):
broken_links=False, verify_ssl=True,
url_check_timeout=1):
"""Genera los indicadores de un catálogo individual.
Args:
Expand All @@ -133,9 +136,9 @@ def _generate_indicators(catalog, validator=None, only_numeric=False,

# Genero indicadores relacionados con validacion de urls
if broken_links:
result.update(_generate_valid_urls_indicators(catalog,
validator=validator,
verify_ssl=verify_ssl))
result.update(_generate_valid_urls_indicators(
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout))

# Genero los indicadores relacionados con fechas, y los agrego
result.update(
Expand Down Expand Up @@ -562,7 +565,8 @@ def _eventual_periodicity(periodicity):
return periodicity in ('eventual', 'EVENTUAL')


def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):
def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True,
url_check_timeout=1):
"""Genera indicadores sobre el estado de las urls de distribuciones
Args:
Expand All @@ -575,9 +579,9 @@ def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):
result = {}
try:
generator = \
StatusIndicatorsGenerator(catalog,
validator=validator,
verify_ssl=verify_ssl)
StatusIndicatorsGenerator(
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
except Exception as e:
msg = u'Error generando resumen del catálogo {}: {}'.format(
catalog['title'], str(e))
Expand Down
9 changes: 4 additions & 5 deletions pydatajson/reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from .validation import validate_catalog


def generate_datasets_summary(catalog, export_path=None,
validator=None, verify_ssl=True):
def generate_datasets_summary(catalog, export_path=None, validator=None,
verify_ssl=True, url_check_timeout=1):
"""Genera un informe sobre los datasets presentes en un catálogo,
indicando para cada uno:
- Índice en la lista catalog["dataset"]
Expand Down Expand Up @@ -53,9 +53,8 @@ def generate_datasets_summary(catalog, export_path=None,
datasets = []

validation = validate_catalog(
catalog,
validator=validator,
verify_ssl=verify_ssl)["error"]["dataset"]
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)["error"]["dataset"]

def info_dataset(index, dataset):
"""Recolecta información básica de un dataset."""
Expand Down
6 changes: 4 additions & 2 deletions pydatajson/status_indicators_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@

class StatusIndicatorsGenerator(object):

def __init__(self, catalog, validator=None, verify_ssl=True):
def __init__(self, catalog, validator=None, verify_ssl=True,
url_check_timeout=1):
self.download_url_ok = None
self.catalog = read_catalog(catalog)
self.summary = generate_datasets_summary(self.catalog,
validator=validator,
verify_ssl=verify_ssl)
self.verify_url = verify_ssl
self.url_check_timeout = url_check_timeout

def datasets_cant(self):
return len(self.summary)
Expand Down Expand Up @@ -45,7 +47,7 @@ def distribuciones_download_url_ok_cant(self):
if self.download_url_ok:
return self.download_url_ok
validator = DistributionDownloadUrlsValidator(
self.catalog, self.verify_url)
self.catalog, self.verify_url, self.url_check_timeout)
self.download_url_ok = validator.validate()
return self.download_url_ok

Expand Down
43 changes: 28 additions & 15 deletions pydatajson/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,21 @@ def init_jsonschema_validator(self, schema_dir, schema_filename):
return jsonschema.Draft4Validator(
schema=schema, resolver=resolver, format_checker=format_checker)

def is_valid(self, catalog, broken_links=False, verify_ssl=True):
def is_valid(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
return not self._get_errors(catalog,
broken_links=broken_links,
verify_ssl=verify_ssl)
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)

def validate_catalog(self, catalog, only_errors=False,
broken_links=False, verify_ssl=True):
broken_links=False, verify_ssl=True,
url_check_timeout=1):

default_response = self._default_response(catalog)
errors = self._get_errors(catalog, broken_links=broken_links,
verify_ssl=verify_ssl)
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)

response = default_response.copy()
for error in errors:
Expand All @@ -77,14 +82,16 @@ def validate_catalog(self, catalog, only_errors=False,

return response

def _get_errors(self, catalog, broken_links=False, verify_ssl=True):
def _get_errors(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
errors = list(
self.jsonschema_validator.iter_errors(catalog)
)
try:
for error in self._custom_errors(catalog,
broken_links=broken_links,
verify_ssl=verify_ssl):
for error in self._custom_errors(
catalog, broken_links=broken_links,
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout):
errors.append(error)
except Exception as e:
logger.warning("Error de validación")
Expand Down Expand Up @@ -116,16 +123,19 @@ def _default_response(self, catalog):
}

# noinspection PyTypeChecker
def _custom_errors(self, catalog, broken_links=False, verify_ssl=True):
def _custom_errors(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
"""Realiza validaciones sin usar el jsonschema.
En esta función se agregan bloques de código en python que realizan
validaciones complicadas o imposibles de especificar usando jsonschema
"""
validators = self._validators_for_catalog(catalog)
if broken_links:
validators.append(LandingPagesValidator(catalog, verify_ssl))
validators.append(DistributionUrlsValidator(catalog, verify_ssl))
validators.append(LandingPagesValidator(catalog, verify_ssl,
url_check_timeout))
validators.append(DistributionUrlsValidator(catalog, verify_ssl,
url_check_timeout))

for validator in validators:
for error in validator.validate():
Expand Down Expand Up @@ -175,7 +185,8 @@ def _update_validation_response(self, error, response):
return new_response


def is_valid_catalog(catalog, validator=None, verify_ssl=True):
def is_valid_catalog(catalog, validator=None, verify_ssl=True,
url_check_timeout=1):
"""Valida que un archivo `data.json` cumpla con el schema definido.
Chequea que el data.json tiene todos los campos obligatorios y que
Expand All @@ -195,12 +206,13 @@ def is_valid_catalog(catalog, validator=None, verify_ssl=True):
else:
validator = Validator()

return validator.is_valid(catalog, verify_ssl=verify_ssl)
return validator.is_valid(catalog, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)


def validate_catalog(catalog, only_errors=False, fmt="dict",
export_path=None, validator=None,
verify_ssl=True):
verify_ssl=True, url_check_timeout=1):
"""Analiza un data.json registrando los errores que encuentra.
Chequea que el data.json tiene todos los campos obligatorios y que
Expand Down Expand Up @@ -260,4 +272,5 @@ def validate_catalog(catalog, only_errors=False, fmt="dict",

return validator.validate_catalog(catalog,
only_errors,
verify_ssl=verify_ssl)
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
7 changes: 5 additions & 2 deletions pydatajson/validators/url_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,19 @@

class UrlValidator(SimpleValidator):

def __init__(self, catalog, verify_ssl):
def __init__(self, catalog, verify_ssl, url_check_timeout):
super(UrlValidator, self).__init__(catalog)
self.verify_ssl = verify_ssl
self.url_check_timeout = url_check_timeout

def validate(self):
raise NotImplementedError

def is_working_url(self, url):
try:
response = requests.head(url, timeout=1, verify=self.verify_ssl)
response = requests.head(url,
timeout=self.url_check_timeout,
verify=self.verify_ssl)
matches = []
if response.status_code not in EXCEPTION_STATUS_CODES:
matches = \
Expand Down
24 changes: 12 additions & 12 deletions tests/test_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,27 @@ def setUp(self):
self.tinr_validator = \
ThemeIdsNotRepeatedValidator(self.catalog)
self.ddu_validator = \
DistributionUrlsValidator(self.catalog, True)
DistributionUrlsValidator(self.catalog, True, 1)
self.lp_validator = \
LandingPagesValidator(self.catalog, True)
LandingPagesValidator(self.catalog, True, 1)

@requests_mock.Mocker()
def test_is_working_url_valid_url(self, req_mock):
url_validator = UrlValidator(self.catalog, True)
url_validator = UrlValidator(self.catalog, True, 1)
req_mock.head(self.test_url)
self.assertEqual(
(True, 200), url_validator.is_working_url(self.test_url))

@requests_mock.Mocker()
def test_is_working_url_invalid_url(self, req_mock):
url_validator = UrlValidator(self.catalog, True)
url_validator = UrlValidator(self.catalog, True, 1)
req_mock.head(self.test_url, status_code=400)
self.assertEqual(
(False, 400), url_validator.is_working_url(self.test_url))

@requests_mock.Mocker()
def test_is_working_url_too_many_requests_response(self, req_mock):
url_validator = UrlValidator(self.catalog, True)
url_validator = UrlValidator(self.catalog, True, 1)
too_many_request_status_code = 429
req_mock.head(self.test_url,
status_code=too_many_request_status_code)
Expand All @@ -76,20 +76,20 @@ def test_is_working_url_too_many_requests_response(self, req_mock):

@requests_mock.Mocker()
def test_is_working_url_url_with_exception(self, req_mock):
url_validator = UrlValidator(self.catalog, True)
url_validator = UrlValidator(self.catalog, True, 1)
req_mock.head(self.test_url, exc=ConnectionError)
self.assertEqual(
(False, None), url_validator.is_working_url(self.test_url))

@requests_mock.Mocker()
def test_is_working_url_url_with_timeout(self, req_mock):
url_validator = UrlValidator(self.catalog, True)
url_validator = UrlValidator(self.catalog, True, 1)
req_mock.head(self.test_url, exc=Timeout)
self.assertEqual(
(False, 408), url_validator.is_working_url(self.test_url))

def test_is_working_url_malformed_values(self):
url_validator = UrlValidator(self.catalog, True)
url_validator = UrlValidator(self.catalog, True, 1)
self.assertEqual(
(False, None), url_validator.is_working_url('malformed_value'))
self.assertEqual(
Expand All @@ -99,7 +99,7 @@ def test_is_working_url_malformed_values(self):

def test_valid_landing_page_validator(self):
lp_validator = \
LandingPagesValidator(self.catalog, True)
LandingPagesValidator(self.catalog, True, 1)
with mock.patch(
'pydatajson'
'.validators'
Expand All @@ -110,7 +110,7 @@ def test_valid_landing_page_validator(self):

def test_invalid_landing_page_validator(self):
lp_validator = \
LandingPagesValidator(self.catalog, True)
LandingPagesValidator(self.catalog, True, 1)
with mock.patch(
'pydatajson'
'.validators'
Expand All @@ -121,7 +121,7 @@ def test_invalid_landing_page_validator(self):

def test_valid_distribution_url_validator(self):
ddu_validator = \
DistributionUrlsValidator(self.catalog, True)
DistributionUrlsValidator(self.catalog, True, 1)
with mock.patch(
'pydatajson'
'.validators'
Expand All @@ -132,7 +132,7 @@ def test_valid_distribution_url_validator(self):

def test_invalid_distribution_url_validator(self):
ddu_validator = \
DistributionUrlsValidator(self.catalog, True)
DistributionUrlsValidator(self.catalog, True, 1)
with mock.patch(
'pydatajson'
'.validators'
Expand Down

0 comments on commit 08837b6

Please sign in to comment.