Merge pull request #281 from datosgobar/252-borken-links-validation

252 broken links validation
FScaccheri committed Sep 27, 2019
2 parents 19944ae + ba3458e commit ccddd6e
Showing 14 changed files with 268 additions and 149 deletions.
51 changes: 50 additions & 1 deletion pydatajson/custom_exceptions.py
@@ -3,10 +3,12 @@
 
 """Excepciones personalizadas para validación y registro de errores"""
 
-from __future__ import unicode_literals
 from __future__ import print_function
+from __future__ import unicode_literals
 from __future__ import with_statement
 
+import os
+
 try:
     from urlparse import urlparse
 except ImportError:
@@ -99,6 +101,53 @@ def __init__(self, dataset_idx, distribution_idx, distribution, attribute):
             validator, message, validator_value, path)
 
 
+class BrokenLandingPageError(BaseValidationError):
+
+    def __init__(self, dataset_idx, dataset_title, broken_url, status_code):
+
+        validator = "brokenLink"
+        message = "Dataset ({}) con 'landingPage' ({}) inválida ({})".format(
+            dataset_title, broken_url, status_code)
+        validator_value = "Chequea que la 'landingPage' devuelva un status " \
+                          "code válido"
+        path = ['dataset', dataset_idx, 'landingPage']
+
+        super(BrokenLandingPageError, self).__init__(
+            validator, message, validator_value, path)
+
+
+class BrokenAccessUrlError(BaseValidationError):
+
+    def __init__(self, dataset_idx, distribution_idx,
+                 distribution_title, broken_url, status_code):
+        validator = "brokenLink"
+        message = "Distribution ({}) con 'accessUrl' ({}) inválida ({})".\
+            format(distribution_title, broken_url, status_code)
+        validator_value = "Chequea que el 'accessUrl' devuelva un status " \
+                          "code válido"
+        path = ['dataset', dataset_idx, 'distribution', distribution_idx,
+                'accessUrl']
+
+        super(BrokenAccessUrlError, self).__init__(
+            validator, message, validator_value, path)
+
+
+class BrokenDownloadUrlError(BaseValidationError):
+
+    def __init__(self, dataset_idx, distribution_idx, distribution_title,
+                 broken_url, status_code):
+        validator = "brokenLink"
+        message = "Distribution ({}) con 'downloadUrl' ({}) inválida ({})".\
+            format(distribution_title, broken_url, status_code)
+        validator_value = "Chequea que el 'downloadUrl' devuelva un status " \
+                          "code válido"
+        path = ['dataset', dataset_idx, 'distribution', distribution_idx,
+                'downloadUrl']
+
+        super(BrokenDownloadUrlError, self).__init__(
+            validator, message, validator_value, path)
+
+
 class BaseUnexpectedValue(ValueError):
 
     """El id de una entidad está repetido en el catálogo."""
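
The three new classes share one pattern: build a human-readable message and a machine-usable path into the catalog, then delegate to BaseValidationError. A minimal sketch of constructing one, with hypothetical values shaped like the ones the validators below pass in:

    from pydatajson import custom_exceptions as ce

    # Hypothetical inputs: dataset 0 declared a landingPage that answered 404.
    error = ce.BrokenLandingPageError(
        0, 'Sistema de contrataciones electrónicas',
        'http://example.com/gone', 404)
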
8 changes: 4 additions & 4 deletions pydatajson/helpers.py
@@ -569,7 +569,7 @@ def fields_to_uppercase(fields):
 
 def is_working_url(url):
     try:
-        response = requests.head(url, timeout=3)
-        return response.status_code in VALID_STATUS_CODES
-    except RequestException:
-        return False
+        response = requests.head(url, timeout=1)
+        return response.status_code in VALID_STATUS_CODES, response.status_code
+    except (RequestException, Exception):
+        return False, None
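
This signature change is the heart of the PR: is_working_url() now returns a (valid, status_code) pair instead of a bare bool, so callers can report which HTTP status made a link invalid; when the HEAD request fails before any response arrives, the status is None. The new contract in use:

    from pydatajson.helpers import is_working_url

    # status_code is None when no response came back at all.
    valid, status_code = is_working_url('http://datos.gob.ar')
    if not valid:
        print('broken link, status:', status_code)

Every caller that used to truth-test the old return value must now unpack the pair, which is what the remaining files in this diff do.
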
2 changes: 1 addition & 1 deletion pydatajson/status_indicators_generator.py
@@ -53,7 +53,7 @@ def _validate_download_urls(self):
         result = 0
         for dataset in self.catalog.get('dataset', []):
             for distribution in dataset.get('distribution', []):
-                valid = is_working_url(distribution.get('downloadURL', ''))
+                valid, _ = is_working_url(distribution.get('downloadURL', ''))
                 result += valid
         # Guardo el resultado una vez calculado
         self.download_url_ok = result
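
Only the flag matters here, so the status code is discarded into _. The counter still works because the first tuple element remains a bool and Python sums True as 1:

    # bool is an int subclass, so validity flags can be summed directly:
    checks = [(True, 200), (False, None), (True, 301)]
    ok = sum(valid for valid, _ in checks)  # 2
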
56 changes: 52 additions & 4 deletions pydatajson/validation.py
@@ -9,12 +9,17 @@
 from __future__ import unicode_literals, print_function
 from __future__ import with_statement, absolute_import
 
+import logging
+import mimetypes
 import os
 import platform
-import mimetypes
-import logging
 from collections import Counter
 
+import requests
+
+from pydatajson.constants import VALID_STATUS_CODES
+from pydatajson.helpers import is_working_url
+
 try:
     from urlparse import urlparse
 except ImportError:
@@ -81,7 +86,7 @@ def _get_errors(self, catalog):
         try:
             for error in self._custom_errors(catalog):
                 errors.append(error)
-        except:
+        except Exception as e:
             logger.warning("Error de validación")
         return errors
 
@@ -125,7 +130,9 @@ def _custom_errors(self, catalog):
     def _validators(self):
         return [
             self._theme_ids_not_repeated,
-            self._consistent_distribution_fields
+            self._consistent_distribution_fields,
+            self._validate_landing_pages,
+            self._validate_distributions_urls
         ]
 
     def _theme_ids_not_repeated(self, catalog):
@@ -212,6 +219,47 @@ def _update_validation_response(self, error, response):
 
         return new_response
 
+    def _validate_landing_pages(self, catalog):
+        datasets = catalog.get('dataset')
+        datasets = filter(lambda x: x.get('landingPage'), datasets)
+
+        for dataset_idx, dataset in enumerate(datasets):
+            dataset_title = dataset.get('title')
+            landing_page = dataset.get('landingPage')
+
+            valid, status_code = is_working_url(landing_page)
+            if not valid:
+                yield ce.BrokenLandingPageError(dataset_idx, dataset_title,
+                                                landing_page, status_code)
+
+    def _validate_distributions_urls(self, catalog):
+        datasets = catalog.get('dataset')
+
+        for dataset_idx, dataset in enumerate(datasets):
+            distributions = dataset.get('distribution')
+
+            for distribution_idx, distribution in enumerate(distributions):
+                distribution_title = distribution.get('title')
+                access_url = distribution.get('accessURL')
+                download_url = distribution.get('downloadURL')
+
+                access_url_is_valid, access_url_status_code = \
+                    is_working_url(access_url)
+                download_url_is_valid, download_url_status_code = \
+                    is_working_url(download_url)
+                if not access_url_is_valid:
+                    yield ce.BrokenAccessUrlError(dataset_idx,
+                                                  distribution_idx,
+                                                  distribution_title,
+                                                  access_url,
+                                                  access_url_status_code)
+                if not download_url_is_valid:
+                    yield ce.BrokenDownloadUrlError(dataset_idx,
+                                                    distribution_idx,
+                                                    distribution_title,
+                                                    download_url,
+                                                    download_url_status_code)
 
 
 def is_valid_catalog(catalog, validator=None):
     """Valida que un archivo `data.json` cumpla con el schema definido.
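
With the two generators appended to _validators(), every custom-validation pass now sends a HEAD request (1-second timeout, per helpers.py above) for each landingPage, accessURL and downloadURL, yielding a brokenLink error for any URL that does not answer a valid status code. A sketch of the observable effect, assuming is_valid_catalog() accepts a catalog path the way other pydatajson entry points do:

    from pydatajson.validation import is_valid_catalog

    # Needs network access: each URL in the catalog is HEAD-checked.
    # The tests below stub is_working_url instead of going online.
    print(is_valid_catalog('tests/samples/full_data.json'))
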
2 changes: 1 addition & 1 deletion requirements_dev.txt
@@ -15,4 +15,4 @@ pycallgraph
 setuptools>=38.6
 wheel>=0.31
 vcrpy
-requests_mock
+requests-mock
10 changes: 5 additions & 5 deletions tests/results/mismatched_downloadURL_and_format.json
@@ -2,10 +2,10 @@
   "status": "ERROR",
   "error": {
     "catalog": {
-      "status": "OK", 
-      "errors": [], 
+      "status": "OK",
+      "errors": [],
       "title": "Datos Argentina"
-    }, 
+    },
     "dataset": [
       {
         "status": "ERROR",
@@ -25,7 +25,7 @@
           "error_code": 2,
           "validator_value": "Chequea format y la extension del downloadURL"
         }
-      ], 
+      ],
       "title": "Sistema de contrataciones electr\u00f3nicas"
     },
     {
@@ -58,7 +58,7 @@
           "message": "distribution 'd_7d4d816f-3a40-476e-ab71-d48a3f0eb3ca' tiene distintas extensiones: format ('CSV') y downloadURL ('.puntos')",
           "error_code": 2,
           "validator_value": "Chequea format y la extension del downloadURL"
-        }, 
+        },
         {
           "instance": null,
           "validator": "mismatchedValue",
7 changes: 4 additions & 3 deletions tests/results/multiple_missing_descriptions.json
@@ -8,7 +8,7 @@
       "instance": null,
       "validator": "required",
       "path": [],
-      "message": "u'description' is a required property",
+      "message": "'description' is a required property",
       "error_code": 1,
       "validator_value": [
         "dataset",
@@ -34,7 +34,7 @@
         "dataset",
         0
       ],
-      "message": "u'description' is a required property",
+      "message": "'description' is a required property",
      "error_code": 1,
       "validator_value": [
         "title",
@@ -43,7 +43,8 @@
         "superTheme",
         "distribution",
         "accrualPeriodicity",
-        "issued"
+        "issued",
+        "identifier"
       ]
     }
   ],
14 changes: 8 additions & 6 deletions tests/status_indicators_generator_tests.py
@@ -23,12 +23,14 @@ def get_sample(cls, sample_filename):
 
     @classmethod
     def setUpClass(cls):
-        cls.gen_justicia = StatusIndicatorsGenerator(
-            cls.get_sample('catalogo_justicia.json'))
-        cls.gen_full_data = StatusIndicatorsGenerator(
-            cls.get_sample('full_data.json'))
-        cls.gen_empty = StatusIndicatorsGenerator(
-            cls.get_sample('invalid_catalog_empty.json'))
+        with mock.patch('pydatajson.validation.is_working_url',
+                        return_value=(True, 200)):
+            cls.gen_justicia = StatusIndicatorsGenerator(
+                cls.get_sample('catalogo_justicia.json'))
+            cls.gen_full_data = StatusIndicatorsGenerator(
+                cls.get_sample('full_data.json'))
+            cls.gen_empty = StatusIndicatorsGenerator(
+                cls.get_sample('invalid_catalog_empty.json'))
 
     def test_just_datasets_cant(self):
         self.assertEqual(16, self.gen_justicia.datasets_cant())
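
Note the patch target: validation.py imports is_working_url into its own namespace, so the test patches the name where it is looked up, pydatajson.validation.is_working_url, and the stub returns the new (valid, status_code) pair. The same pattern in isolation:

    import mock  # on Python 3: from unittest import mock

    with mock.patch('pydatajson.validation.is_working_url',
                    return_value=(True, 200)):
        pass  # code run here sees every URL as working: no network I/O
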
21 changes: 12 additions & 9 deletions tests/test_catalog_readme.py
@@ -7,6 +7,7 @@
 import io
 import os.path
 
+import requests_mock
 import vcr
 from nose.tools import assert_true, assert_equal
 
@@ -36,21 +37,25 @@ def get_sample(cls, sample_filename):
     @classmethod
     def setUp(cls):
         cls.catalog = cls.get_sample("several_datasets_for_harvest.json")
+        cls.requests_mock = requests_mock.Mocker()
+        cls.requests_mock.start()
+        cls.requests_mock.get(requests_mock.ANY, real_http=True)
+        cls.requests_mock.head(requests_mock.ANY, status_code=200)
+
+    @classmethod
+    def tearDown(cls):
+        cls.requests_mock.stop()
 
     @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
-    def test_generate_readme(self, _mock_check):
+    def test_generate_readme(self):
         with io.open(os.path.join(self.RESULTS_DIR, "catalog_readme.md"), 'r',
                      encoding='utf-8') as expected_readme_file:
             expected_readme = expected_readme_file.read()
         readme = generate_readme(self.catalog)
         assert_equal(expected_readme, readme)
 
     @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
-    def test_readme_file_write(self, _mock_check):
+    def test_readme_file_write(self):
         actual_filename = os.path.join(self.TEMP_DIR, "catalog_readme.md")
         expected_filename = os.path.join(self.RESULTS_DIR, "catalog_readme.md")
         generate_readme(self.catalog, export_path=actual_filename)
@@ -65,10 +70,8 @@ def test_readme_file_write(self, _mock_check):
         assert_true(comparison)
 
     @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
     @mock.patch('pydatajson.indicators._federation_indicators')
-    def test_readme_null_indicators(self, mock_indicators, _mock_check):
+    def test_readme_null_indicators(self, mock_indicators):
         mock_indicators.return_value = {
             'datasets_federados_cant': None,
             'datasets_federados_pct': None,
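
Rather than stubbing is_working_url, these tests install a requests-mock Mocker around each test: every HEAD request gets a canned 200 (which satisfies the new link checks), while GET requests are allowed through (real_http=True) so the recorded vcr cassettes keep working. The core of the pattern, as a context manager:

    import requests
    import requests_mock

    with requests_mock.Mocker() as m:
        m.head(requests_mock.ANY, status_code=200)
        print(requests.head('http://example.com/any/url').status_code)  # 200
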
19 changes: 17 additions & 2 deletions tests/test_core.py
@@ -13,7 +13,7 @@
 from nose.tools import assert_true, assert_false, assert_equal
 from nose.tools import assert_list_equal, assert_raises
 from six import iteritems
-
+import requests_mock
 
 try:
     import mock
@@ -44,10 +44,15 @@ def setUp(cls):
             cls.get_sample("full_data.json"))
         cls.maxDiff = None
         cls.longMessage = True
+        cls.requests_mock = requests_mock.Mocker()
+        cls.requests_mock.start()
+        cls.requests_mock.get(requests_mock.ANY, real_http=True)
+        cls.requests_mock.head(requests_mock.ANY, status_code=200)
 
     @classmethod
     def tearDown(cls):
         del (cls.dj)
+        cls.requests_mock.stop()
 
     # TESTS DE catalog_report
     # Reporte esperado para "full_data.json", con harvest = 0
@@ -145,9 +150,13 @@ def tearDown(cls):
                       (u'dataset_temporal', u'2015-01-01/2015-12-31'),
                       (u'notas', u'No tiene distribuciones con datos.')])]
 
+    LANDING_PAGE = 'http://datos.gob.ar/dataset/' \
+                   'sistema-de-contrataciones-electronicas-argentina-compra'
+
     def test_catalog_report_harvest_good(self):
         """catalog_report() marcará para cosecha los datasets con metadata
         válida si harvest='valid'."""
+
         catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")
 
         actual = self.dj.catalog_report(
@@ -166,6 +175,7 @@ def test_catalog_report_harvest_good(self):
     def test_catalog_report_harvest_valid(self):
         """catalog_report() marcará para cosecha los datasets con metadata
         válida si harvest='valid'."""
+
         catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")
 
         actual = self.dj.catalog_report(
@@ -184,6 +194,7 @@ def test_catalog_report_harvest_valid(self):
     def test_catalog_report_harvest_none(self):
         """catalog_report() no marcará ningún dataset para cosecha si
         harvest='none'."""
+
         catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")
 
         actual = self.dj.catalog_report(
@@ -554,7 +565,11 @@ def test_generate_datasets_summary(self):
         """Genera informe conciso sobre datasets correctamente."""
         catalog = os.path.join(self.SAMPLES_DIR,
                                "several_datasets_for_harvest.json")
-        actual = self.dj.generate_datasets_summary(catalog)
+
+        with mock.patch('pydatajson.validation.is_working_url',
+                        return_value=(True, 200)):
+            actual = self.dj.generate_datasets_summary(catalog)
+
         expected = [
             OrderedDict([('indice', 0),
                          ('titulo',
