Skip to content

Commit

Permalink
Merge pull request #310 from datosgobar/304-parametrizar-broken-links-threads
Browse files Browse the repository at this point in the history

Parametrizo cantidad de threads en validaciones de urls
  • Loading branch information
FScaccheri committed Nov 28, 2019
2 parents 3bb6317 + 4ba6f38 commit 8086863
Show file tree
Hide file tree
Showing 13 changed files with 51 additions and 49 deletions.
1 change: 0 additions & 1 deletion pydatajson/constants.py
Expand Up @@ -6,5 +6,4 @@
INVALID_STATUS_CODES_REGEX = ["^4[0-9]+$", "^5[0-9]+$"]
EXCEPTION_STATUS_CODES = [429]

CANT_THREADS_BROKEN_URL_VALIDATOR = 10
DEFAULT_CHECK_TIMEOUT = 1
6 changes: 4 additions & 2 deletions pydatajson/core.py
Expand Up @@ -965,13 +965,15 @@ def _extract_datasets_to_harvest(cls, report):
def generate_catalogs_indicators(self, catalogs=None,
central_catalog=None,
identifier_search=False,
broken_links=False):
broken_links=False,
broken_links_threads=1):
catalogs = catalogs or self
return indicators.generate_catalogs_indicators(
catalogs, central_catalog, identifier_search=identifier_search,
validator=self.validator, broken_links=broken_links,
verify_ssl=self.verify_ssl,
url_check_timeout=self.url_check_timeout)
url_check_timeout=self.url_check_timeout,
broken_links_threads=broken_links_threads)

def _count_fields_recursive(self, dataset, fields):
"""Cuenta la información de campos optativos/recomendados/requeridos
Expand Down
16 changes: 10 additions & 6 deletions pydatajson/indicators.py
Expand Up @@ -48,7 +48,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
broken_links=False,
validator=None,
verify_ssl=True,
url_check_timeout=1):
url_check_timeout=1,
broken_links_threads=1):
"""Genera una lista de diccionarios con varios indicadores sobre
los catálogos provistos, tales como la cantidad de datasets válidos,
días desde su última fecha actualizada, entre otros.
Expand Down Expand Up @@ -87,7 +88,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
fields_count, result = _generate_indicators(
catalog, validator=validator,
broken_links=broken_links, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
url_check_timeout=url_check_timeout,
broken_links_threads=broken_links_threads)
if central_catalog:
result.update(_federation_indicators(
catalog, central_catalog, identifier_search=identifier_search))
Expand Down Expand Up @@ -118,7 +120,7 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,

def _generate_indicators(catalog, validator=None, only_numeric=False,
broken_links=False, verify_ssl=True,
url_check_timeout=1):
url_check_timeout=1, broken_links_threads=1):
"""Genera los indicadores de un catálogo individual.
Args:
Expand All @@ -138,7 +140,8 @@ def _generate_indicators(catalog, validator=None, only_numeric=False,
if broken_links:
result.update(_generate_valid_urls_indicators(
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout))
url_check_timeout=url_check_timeout,
threads_count=broken_links_threads))

# Genero los indicadores relacionados con fechas, y los agrego
result.update(
Expand Down Expand Up @@ -566,7 +569,7 @@ def _eventual_periodicity(periodicity):


def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True,
url_check_timeout=1):
url_check_timeout=1, threads_count=1):
"""Genera indicadores sobre el estado de las urls de distribuciones
Args:
Expand All @@ -581,7 +584,8 @@ def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True,
generator = \
StatusIndicatorsGenerator(
catalog, validator=validator, verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
url_check_timeout=url_check_timeout,
threads_count=threads_count)
except Exception as e:
msg = u'Error generando resumen del catálogo {}: {}'.format(
catalog['title'], str(e))
Expand Down
6 changes: 4 additions & 2 deletions pydatajson/status_indicators_generator.py
Expand Up @@ -10,14 +10,15 @@
class StatusIndicatorsGenerator(object):

def __init__(self, catalog, validator=None, verify_ssl=True,
url_check_timeout=1):
url_check_timeout=1, threads_count=1):
self.download_url_ok = None
self.catalog = read_catalog(catalog)
self.summary = generate_datasets_summary(self.catalog,
validator=validator,
verify_ssl=verify_ssl)
self.verify_url = verify_ssl
self.url_check_timeout = url_check_timeout
self.threads_count = threads_count

def datasets_cant(self):
return len(self.summary)
Expand Down Expand Up @@ -47,7 +48,8 @@ def distribuciones_download_url_ok_cant(self):
if self.download_url_ok:
return self.download_url_ok
validator = DistributionDownloadUrlsValidator(
self.catalog, self.verify_url, self.url_check_timeout)
self.catalog, self.verify_url, self.url_check_timeout,
self.threads_count)
self.download_url_ok = validator.validate()
return self.download_url_ok

Expand Down
19 changes: 12 additions & 7 deletions pydatajson/validation.py
Expand Up @@ -54,11 +54,12 @@ def init_jsonschema_validator(self, schema_dir, schema_filename):
schema=schema, resolver=resolver, format_checker=format_checker)

def is_valid(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
url_check_timeout=1, broken_links_threads=1):
return not self._get_errors(catalog,
broken_links=broken_links,
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout)
url_check_timeout=url_check_timeout,
broken_links_threads=broken_links_threads)

def validate_catalog(self, catalog, only_errors=False,
broken_links=False, verify_ssl=True,
Expand All @@ -83,17 +84,19 @@ def validate_catalog(self, catalog, only_errors=False,
return response

def _get_errors(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
url_check_timeout=1, broken_links_threads=1):
errors = list(
self.jsonschema_validator.iter_errors(catalog)
)
try:
for error in self._custom_errors(
catalog, broken_links=broken_links,
verify_ssl=verify_ssl,
url_check_timeout=url_check_timeout):
url_check_timeout=url_check_timeout,
broken_links_threads=broken_links_threads):
errors.append(error)
except Exception as e:
print(e)
logger.warning("Error de validación")
return errors

Expand Down Expand Up @@ -124,7 +127,7 @@ def _default_response(self, catalog):

# noinspection PyTypeChecker
def _custom_errors(self, catalog, broken_links=False, verify_ssl=True,
url_check_timeout=1):
url_check_timeout=1, broken_links_threads=1):
"""Realiza validaciones sin usar el jsonschema.
En esta función se agregan bloques de código en python que realizan
Expand All @@ -133,9 +136,11 @@ def _custom_errors(self, catalog, broken_links=False, verify_ssl=True,
validators = self._validators_for_catalog(catalog)
if broken_links:
validators.append(LandingPagesValidator(catalog, verify_ssl,
url_check_timeout))
url_check_timeout,
broken_links_threads))
validators.append(DistributionUrlsValidator(catalog, verify_ssl,
url_check_timeout))
url_check_timeout,
broken_links_threads))

for validator in validators:
for error in validator.validate():
Expand Down
4 changes: 2 additions & 2 deletions pydatajson/validators/distribution_download_urls_validator.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pydatajson import threading_helper, constants
from pydatajson import threading_helper
from pydatajson.validators.url_validator import UrlValidator


Expand All @@ -15,7 +15,7 @@ def validate(self):
async_results += threading_helper \
.apply_threading(distribution_urls,
self.is_working_url,
constants.CANT_THREADS_BROKEN_URL_VALIDATOR)
self.threads_count)

result = 0
for res, _ in async_results:
Expand Down
4 changes: 2 additions & 2 deletions pydatajson/validators/distribution_urls_validator.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pydatajson.custom_exceptions as ce
from pydatajson import threading_helper, constants
from pydatajson import threading_helper
from pydatajson.validators.url_validator import UrlValidator


Expand Down Expand Up @@ -30,7 +30,7 @@ def validate(self):
sync_res = threading_helper \
.apply_threading(urls,
self.is_working_url,
constants.CANT_THREADS_BROKEN_URL_VALIDATOR)
self.threads_count)

for i in range(len(metadata)):
actual_metadata = metadata[i]
Expand Down
4 changes: 2 additions & 2 deletions pydatajson/validators/landing_pages_validator.py
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

import pydatajson.custom_exceptions as ce
from pydatajson import threading_helper, constants
from pydatajson import threading_helper
from pydatajson.validators.url_validator import UrlValidator


Expand All @@ -26,7 +26,7 @@ def validate(self):
sync_res = threading_helper \
.apply_threading(urls,
self.is_working_url,
constants.CANT_THREADS_BROKEN_URL_VALIDATOR)
self.threads_count)

for i in range(len(sync_res)):
valid, status_code = sync_res[i]
Expand Down
3 changes: 2 additions & 1 deletion pydatajson/validators/url_validator.py
Expand Up @@ -12,10 +12,11 @@

class UrlValidator(SimpleValidator):

def __init__(self, catalog, verify_ssl, url_check_timeout):
def __init__(self, catalog, verify_ssl, url_check_timeout, threads_count):
super(UrlValidator, self).__init__(catalog)
self.verify_ssl = verify_ssl
self.url_check_timeout = url_check_timeout
self.threads_count = threads_count

def validate(self):
raise NotImplementedError
Expand Down
2 changes: 0 additions & 2 deletions tests/test_catalog_readme.py
Expand Up @@ -20,8 +20,6 @@
from pydatajson.catalog_readme import generate_readme
from tests.support.decorators import RESULTS_DIR

import pydatajson.constants
pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1

my_vcr = vcr.VCR(path_transformer=vcr.VCR.ensure_suffix('.yaml'),
cassette_library_dir=os.path.join("tests", "cassetes"),
Expand Down
3 changes: 0 additions & 3 deletions tests/test_readers_and_writers.py
Expand Up @@ -12,7 +12,6 @@
import nose
import vcr

from pydatajson import constants
from tests.support.factories.xlsx import CSV_TABLE, WRITE_XLSX_TABLE
from tests.support.factories.xlsx import READ_XLSX_TABLE

Expand All @@ -28,8 +27,6 @@
from tests import xl_methods
import openpyxl as pyxl

import pydatajson.constants
pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1

my_vcr = vcr.VCR(
path_transformer=vcr.VCR.ensure_suffix('.yaml'),
Expand Down
2 changes: 0 additions & 2 deletions tests/test_validation.py
Expand Up @@ -24,8 +24,6 @@
from .context import pydatajson
from .support.decorators import RESULTS_DIR

import pydatajson.constants
pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1

my_vcr = vcr.VCR(path_transformer=vcr.VCR.ensure_suffix('.yaml'),
cassette_library_dir=os.path.join("tests", "cassetes"),
Expand Down
30 changes: 13 additions & 17 deletions tests/test_validators.py
Expand Up @@ -25,10 +25,6 @@
except ImportError:
from unittest import mock

import pydatajson.constants

pydatajson.constants.CANT_THREADS_BROKEN_URL_VALIDATOR = 1


class ValidatorsTestCase(unittest.TestCase):
SAMPLES_DIR = os.path.join("tests", "samples")
Expand All @@ -46,27 +42,27 @@ def setUp(self):
self.tinr_validator = \
ThemeIdsNotRepeatedValidator(self.catalog)
self.ddu_validator = \
DistributionUrlsValidator(self.catalog, True, 1)
DistributionUrlsValidator(self.catalog, True, 1, 10)
self.lp_validator = \
LandingPagesValidator(self.catalog, True, 1)
LandingPagesValidator(self.catalog, True, 1, 10)

@requests_mock.Mocker()
def test_is_working_url_valid_url(self, req_mock):
url_validator = UrlValidator(self.catalog, True, 1)
url_validator = UrlValidator(self.catalog, True, 1, 10)
req_mock.head(self.test_url)
self.assertEqual(
(True, 200), url_validator.is_working_url(self.test_url))

@requests_mock.Mocker()
def test_is_working_url_invalid_url(self, req_mock):
url_validator = UrlValidator(self.catalog, True, 1)
url_validator = UrlValidator(self.catalog, True, 1, 10)
req_mock.head(self.test_url, status_code=400)
self.assertEqual(
(False, 400), url_validator.is_working_url(self.test_url))

@requests_mock.Mocker()
def test_is_working_url_too_many_requests_response(self, req_mock):
url_validator = UrlValidator(self.catalog, True, 1)
url_validator = UrlValidator(self.catalog, True, 1, 10)
too_many_request_status_code = 429
req_mock.head(self.test_url,
status_code=too_many_request_status_code)
Expand All @@ -76,20 +72,20 @@ def test_is_working_url_too_many_requests_response(self, req_mock):

@requests_mock.Mocker()
def test_is_working_url_url_with_exception(self, req_mock):
url_validator = UrlValidator(self.catalog, True, 1)
url_validator = UrlValidator(self.catalog, True, 1, 10)
req_mock.head(self.test_url, exc=ConnectionError)
self.assertEqual(
(False, None), url_validator.is_working_url(self.test_url))

@requests_mock.Mocker()
def test_is_working_url_url_with_timeout(self, req_mock):
url_validator = UrlValidator(self.catalog, True, 1)
url_validator = UrlValidator(self.catalog, True, 1, 10)
req_mock.head(self.test_url, exc=Timeout)
self.assertEqual(
(False, 408), url_validator.is_working_url(self.test_url))

def test_is_working_url_malformed_values(self):
url_validator = UrlValidator(self.catalog, True, 1)
url_validator = UrlValidator(self.catalog, True, 1, 10)
self.assertEqual(
(False, None), url_validator.is_working_url('malformed_value'))
self.assertEqual(
Expand All @@ -99,7 +95,7 @@ def test_is_working_url_malformed_values(self):

def test_valid_landing_page_validator(self):
lp_validator = \
LandingPagesValidator(self.catalog, True, 1)
LandingPagesValidator(self.catalog, True, 1, 10)
with mock.patch(
'pydatajson'
'.validators'
Expand All @@ -110,7 +106,7 @@ def test_valid_landing_page_validator(self):

def test_invalid_landing_page_validator(self):
lp_validator = \
LandingPagesValidator(self.catalog, True, 1)
LandingPagesValidator(self.catalog, True, 1, 10)
with mock.patch(
'pydatajson'
'.validators'
Expand All @@ -121,7 +117,7 @@ def test_invalid_landing_page_validator(self):

def test_valid_distribution_url_validator(self):
ddu_validator = \
DistributionUrlsValidator(self.catalog, True, 1)
DistributionUrlsValidator(self.catalog, True, 1, 10)
with mock.patch(
'pydatajson'
'.validators'
Expand All @@ -132,7 +128,7 @@ def test_valid_distribution_url_validator(self):

def test_invalid_distribution_url_validator(self):
ddu_validator = \
DistributionUrlsValidator(self.catalog, True, 1)
DistributionUrlsValidator(self.catalog, True, 1, 10)
with mock.patch(
'pydatajson'
'.validators'
Expand Down Expand Up @@ -191,7 +187,7 @@ def test_invalid_theme_ids_not_repeated_validator(self):

@requests_mock.Mocker()
def test_url_check_timeout(self, req_mock):
url_validator = UrlValidator(self.catalog, True, 100)
url_validator = UrlValidator(self.catalog, True, 100, 10)
req_mock.head(self.test_url)
url_validator.is_working_url(self.test_url)
self.assertEqual(100, req_mock.request_history[0].timeout)

0 comments on commit 8086863

Please sign in to comment.