Skip to content

Commit

Permalink
Merge ddd552d into b1ba69e
Browse files Browse the repository at this point in the history
  • Loading branch information
lrromero committed Oct 28, 2019
2 parents b1ba69e + ddd552d commit e5386cb
Show file tree
Hide file tree
Showing 9 changed files with 103 additions and 55 deletions.
1 change: 1 addition & 0 deletions pydatajson/constants.py
Expand Up @@ -7,3 +7,4 @@
# HTTP status codes that are treated specially by the URL validators
# (429 = Too Many Requests, i.e. rate limiting).
EXCEPTION_STATUS_CODES = [429]

# Number of worker threads used by the broken-URL validator.
CANT_THREADS_BROKEN_URL_VALIDATOR = 10
# Default per-request timeout for URL status checks
# (seconds, per the `requests` library's timeout semantics).
DEFAULT_CHECK_TIMEOUT = 1
21 changes: 12 additions & 9 deletions pydatajson/core.py
Expand Up @@ -54,7 +54,8 @@ class DataJson(dict):
def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
default_values=None, catalog_format=None,
validator_class=Validator, verify_ssl=False,
requests_timeout=constants.REQUESTS_TIMEOUT):
requests_timeout=constants.REQUESTS_TIMEOUT,
url_check_timeout=constants.DEFAULT_CHECK_TIMEOUT):
"""Lee un catálogo y crea un objeto con funciones para manipularlo.
Salvo que se indique lo contrario, se utiliza como default el schema
Expand All @@ -81,6 +82,7 @@ def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
"""
self.verify_ssl = verify_ssl
self.requests_timeout = requests_timeout
self.url_check_timeout = url_check_timeout
# se construye el objeto DataJson con la interfaz de un dicconario
if catalog:

Expand Down Expand Up @@ -245,14 +247,15 @@ def is_valid_catalog(self, catalog=None, broken_links=False):
Args:
catalog (str o dict): Catálogo (dict, JSON o XLSX) a ser validado.
Si no se pasa, valida este catálogo.
broken_links(bool): Activa el checkeo de estados de urls
Returns:
bool: True si el data.json cumple con el schema, sino False.
"""
catalog = self._read_catalog(catalog) if catalog else self
return self.validator.is_valid(catalog,
broken_links=broken_links,
verify_ssl=self.verify_ssl)
return self.validator.is_valid(
catalog, broken_links=broken_links, verify_ssl=self.verify_ssl,
url_check_timeout=self.url_check_timeout)

@staticmethod
def _update_validation_response(error, response):
Expand Down Expand Up @@ -344,10 +347,9 @@ def validate_catalog(self, catalog=None, only_errors=False, fmt="dict",
"""
catalog = self._read_catalog(catalog) if catalog else self

validation = self.validator.validate_catalog(catalog,
only_errors,
broken_links,
self.verify_ssl)
validation = self.validator.validate_catalog(
catalog, only_errors, broken_links, self.verify_ssl,
self.url_check_timeout)
if export_path:
fmt = 'table'

Expand Down Expand Up @@ -968,7 +970,8 @@ def generate_catalogs_indicators(self, catalogs=None,
return indicators.generate_catalogs_indicators(
catalogs, central_catalog, identifier_search=identifier_search,
validator=self.validator, broken_links=broken_links,
verify_ssl=self.verify_ssl)
verify_ssl=self.verify_ssl,
url_check_timeout=self.url_check_timeout)

def _count_fields_recursive(self, dataset, fields):
"""Cuenta la información de campos optativos/recomendados/requeridos
Expand Down
24 changes: 14 additions & 10 deletions pydatajson/indicators.py
Expand Up @@ -47,7 +47,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
identifier_search=False,
broken_links=False,
validator=None,
verify_ssl=True):
verify_ssl=True,
url_check_timeout=1):
"""Genera una lista de diccionarios con varios indicadores sobre
los catálogos provistos, tales como la cantidad de datasets válidos,
días desde su última fecha actualizada, entre otros.
Expand Down Expand Up @@ -85,7 +86,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,

fields_count, result = _generate_indicators(
catalog, validator=validator,
broken_links=broken_links, verify_ssl=verify_ssl)
broken_links=broken_links, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
if central_catalog:
result.update(_federation_indicators(
catalog, central_catalog, identifier_search=identifier_search))
Expand Down Expand Up @@ -115,7 +117,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,


def _generate_indicators(catalog, validator=None, only_numeric=False,
broken_links=False, verify_ssl=True):
broken_links=False, verify_ssl=True,
url_check_timeout=1):
"""Genera los indicadores de un catálogo individual.
Args:
Expand All @@ -133,9 +136,9 @@ def _generate_indicators(catalog, validator=None, only_numeric=False,

# Genero indicadores relacionados con validacion de urls
if broken_links:
result.update(_generate_valid_urls_indicators(catalog,
validator=validator,
verify_ssl=verify_ssl))
result.update(_generate_valid_urls_indicators(
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout))

# Genero los indicadores relacionados con fechas, y los agrego
result.update(
Expand Down Expand Up @@ -562,7 +565,8 @@ def _eventual_periodicity(periodicity):
return periodicity in ('eventual', 'EVENTUAL')


def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):
def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True,
url_check_timeout=1):
"""Genera indicadores sobre el estado de las urls de distribuciones
Args:
Expand All @@ -575,9 +579,9 @@ def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):
result = {}
try:
generator = \
StatusIndicatorsGenerator(catalog,
validator=validator,
verify_ssl=verify_ssl)
StatusIndicatorsGenerator(
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
except Exception as e:
msg = u'Error generando resumen del catálogo {}: {}'.format(
catalog['title'], str(e))
Expand Down
9 changes: 4 additions & 5 deletions pydatajson/reporting.py
Expand Up @@ -17,8 +17,8 @@
from .validation import validate_catalog


def generate_datasets_summary(catalog, export_path=None,
validator=None, verify_ssl=True):
def generate_datasets_summary(catalog, export_path=None, validator=None,
verify_ssl=True, url_check_timeout=1):
"""Genera un informe sobre los datasets presentes en un catálogo,
indicando para cada uno:
- Índice en la lista catalog["dataset"]
Expand Down Expand Up @@ -53,9 +53,8 @@ def generate_datasets_summary(catalog, export_path=None,
datasets = []

validation = validate_catalog(
catalog,
validator=validator,
verify_ssl=verify_ssl)["error"]["dataset"]
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)["error"]["dataset"]

def info_dataset(index, dataset):
"""Recolecta información básica de un dataset."""
Expand Down
6 changes: 4 additions & 2 deletions pydatajson/status_indicators_generator.py
Expand Up @@ -9,13 +9,15 @@

class StatusIndicatorsGenerator(object):

def __init__(self, catalog, validator=None, verify_ssl=True):
def __init__(self, catalog, validator=None, verify_ssl=True,
             url_check_timeout=1):
    """Build URL-status indicators for a catalog.

    Args:
        catalog (dict o str): Catalog (dict, path or URL) to analyze.
        validator: Optional validator forwarded to the summary generation.
        verify_ssl (bool): Whether URL checks verify SSL certificates.
        url_check_timeout (int): Per-request timeout, in seconds, used
            when checking the status of each URL.
    """
    # Lazy cache used by distribuciones_download_url_ok_cant().
    self.download_url_ok = None
    self.catalog = read_catalog(catalog)
    # Fix: url_check_timeout was stored but never forwarded here, so the
    # summary's validation always fell back to the default timeout even
    # though generate_datasets_summary accepts the parameter.
    self.summary = generate_datasets_summary(self.catalog,
                                             validator=validator,
                                             verify_ssl=verify_ssl,
                                             url_check_timeout=url_check_timeout)
    # NOTE(review): attribute is named verify_url but holds verify_ssl;
    # name kept as-is because other methods read self.verify_url.
    self.verify_url = verify_ssl
    self.url_check_timeout = url_check_timeout

def datasets_cant(self):
    """Total number of datasets covered by the generated summary."""
    summary_entries = self.summary
    return len(summary_entries)
Expand Down Expand Up @@ -45,7 +47,7 @@ def distribuciones_download_url_ok_cant(self):
if self.download_url_ok:
return self.download_url_ok
validator = DistributionDownloadUrlsValidator(
self.catalog, self.verify_url)
self.catalog, self.verify_url, self.url_check_timeout)
self.download_url_ok = validator.validate()
return self.download_url_ok

Expand Down
43 changes: 28 additions & 15 deletions pydatajson/validation.py
Expand Up @@ -53,16 +53,21 @@ def init_jsonschema_validator(self, schema_dir, schema_filename):
return jsonschema.Draft4Validator(
schema=schema, resolver=resolver, format_checker=format_checker)

def is_valid(self, catalog, broken_links=False, verify_ssl=True):
def is_valid(self, catalog, broken_links=False, verify_ssl=True,
             url_check_timeout=1):
    """Return True when the catalog produces no validation errors."""
    errors = self._get_errors(
        catalog,
        broken_links=broken_links,
        verify_ssl=verify_ssl,
        url_check_timeout=url_check_timeout,
    )
    return not errors

def validate_catalog(self, catalog, only_errors=False,
broken_links=False, verify_ssl=True):
broken_links=False, verify_ssl=True,
url_check_timeout=1):

default_response = self._default_response(catalog)
errors = self._get_errors(catalog, broken_links=broken_links,
verify_ssl=verify_ssl)
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)

response = default_response.copy()
for error in errors:
Expand All @@ -77,14 +82,16 @@ def validate_catalog(self, catalog, only_errors=False,

return response

def _get_errors(self, catalog, broken_links=False, verify_ssl=True):
def _get_errors(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
errors = list(
self.jsonschema_validator.iter_errors(catalog)
)
try:
for error in self._custom_errors(catalog,
broken_links=broken_links,
verify_ssl=verify_ssl):
for error in self._custom_errors(
catalog, broken_links=broken_links,
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout):
errors.append(error)
except Exception as e:
logger.warning("Error de validación")
Expand Down Expand Up @@ -116,16 +123,19 @@ def _default_response(self, catalog):
}

# noinspection PyTypeChecker
def _custom_errors(self, catalog, broken_links=False, verify_ssl=True):
def _custom_errors(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
"""Realiza validaciones sin usar el jsonschema.
En esta función se agregan bloques de código en python que realizan
validaciones complicadas o imposibles de especificar usando jsonschema
"""
validators = self._validators_for_catalog(catalog)
if broken_links:
validators.append(LandingPagesValidator(catalog, verify_ssl))
validators.append(DistributionUrlsValidator(catalog, verify_ssl))
validators.append(LandingPagesValidator(catalog, verify_ssl,
url_check_timeout))
validators.append(DistributionUrlsValidator(catalog, verify_ssl,
url_check_timeout))

for validator in validators:
for error in validator.validate():
Expand Down Expand Up @@ -175,7 +185,8 @@ def _update_validation_response(self, error, response):
return new_response


def is_valid_catalog(catalog, validator=None, verify_ssl=True):
def is_valid_catalog(catalog, validator=None, verify_ssl=True,
url_check_timeout=1):
"""Valida que un archivo `data.json` cumpla con el schema definido.
Chequea que el data.json tiene todos los campos obligatorios y que
Expand All @@ -195,12 +206,13 @@ def is_valid_catalog(catalog, validator=None, verify_ssl=True):
else:
validator = Validator()

return validator.is_valid(catalog, verify_ssl=verify_ssl)
return validator.is_valid(catalog, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)


def validate_catalog(catalog, only_errors=False, fmt="dict",
export_path=None, validator=None,
verify_ssl=True):
verify_ssl=True, url_check_timeout=1):
"""Analiza un data.json registrando los errores que encuentra.
Chequea que el data.json tiene todos los campos obligatorios y que
Expand Down Expand Up @@ -260,4 +272,5 @@ def validate_catalog(catalog, only_errors=False, fmt="dict",

return validator.validate_catalog(catalog,
only_errors,
verify_ssl=verify_ssl)
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
7 changes: 5 additions & 2 deletions pydatajson/validators/url_validator.py
Expand Up @@ -12,16 +12,19 @@

class UrlValidator(SimpleValidator):

def __init__(self, catalog, verify_ssl):
def __init__(self, catalog, verify_ssl, url_check_timeout):
    """Base validator for URL status checks.

    Args:
        catalog: Catalog to validate, passed through to the superclass.
        verify_ssl (bool): Whether HTTP requests verify SSL certificates.
        url_check_timeout: Timeout for each URL check
            (seconds, per the `requests` timeout semantics).
    """
    super(UrlValidator, self).__init__(catalog)
    self.verify_ssl = verify_ssl
    self.url_check_timeout = url_check_timeout

def validate(self):
    """Run the validation; must be implemented by concrete subclasses."""
    raise NotImplementedError

def is_working_url(self, url):
try:
response = requests.head(url, timeout=1, verify=self.verify_ssl)
response = requests.head(url,
timeout=self.url_check_timeout,
verify=self.verify_ssl)
matches = []
if response.status_code not in EXCEPTION_STATUS_CODES:
matches = \
Expand Down
16 changes: 16 additions & 0 deletions tests/test_status_indicators_generator.py
Expand Up @@ -162,3 +162,19 @@ def test_full_distribuciones_download_url_ok_pct(self, req_mock):
def test_empty_distribuciones_download_url_ok_pct(self):
self.assertEqual(
None, self.gen_empty.distribuciones_download_url_ok_pct())

@requests_mock.Mocker()
def test_check_url_default_timeout(self, req_mock):
    # Every HEAD request issued while computing the download-url
    # indicator must carry the default url_check_timeout (1 second).
    req_mock.head(requests_mock.ANY, text='resp')
    self.gen_justicia.distribuciones_download_url_ok_pct()
    for request in req_mock.request_history:
        self.assertEqual(1, request.timeout)

@requests_mock.Mocker()
def test_check_url_override_timeout(self, req_mock):
    # A url_check_timeout passed to the constructor must be propagated
    # to every HEAD request made during URL validation.
    generator = StatusIndicatorsGenerator(
        self.get_sample('catalogo_justicia.json'), url_check_timeout=10)
    req_mock.head(requests_mock.ANY, text='resp')
    generator.distribuciones_download_url_ok_pct()
    for request in req_mock.request_history:
        self.assertEqual(10, request.timeout)

0 comments on commit e5386cb

Please sign in to comment.