252 broken links validation #281

Merged (15 commits) on Sep 27, 2019

Changes from all commits
51 changes: 50 additions & 1 deletion pydatajson/custom_exceptions.py
@@ -3,10 +3,12 @@

"""Excepciones personalizadas para validación y registro de errores"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import with_statement

import os

try:
    from urlparse import urlparse
except ImportError:
@@ -99,6 +101,53 @@ def __init__(self, dataset_idx, distribution_idx, distribution, attribute):
            validator, message, validator_value, path)


class BrokenLandingPageError(BaseValidationError):

    def __init__(self, dataset_idx, dataset_title, broken_url, status_code):
        validator = "brokenLink"
        message = "Dataset ({}) con 'landingPage' ({}) inválida ({})".format(
            dataset_title, broken_url, status_code)
        validator_value = "Chequea que la 'landingPage' devuelva un status " \
                          "code válido"
        path = ['dataset', dataset_idx, 'landingPage']

        super(BrokenLandingPageError, self).__init__(
            validator, message, validator_value, path)


class BrokenAccessUrlError(BaseValidationError):

    def __init__(self, dataset_idx, distribution_idx,
                 distribution_title, broken_url, status_code):
        validator = "brokenLink"
        message = "Distribution ({}) con 'accessUrl' ({}) inválida ({})".\
            format(distribution_title, broken_url, status_code)
        validator_value = "Chequea que el 'accessUrl' devuelva un status " \
                          "code válido"
        path = ['dataset', dataset_idx, 'distribution', distribution_idx,
                'accessUrl']

        super(BrokenAccessUrlError, self).__init__(
            validator, message, validator_value, path)


class BrokenDownloadUrlError(BaseValidationError):

    def __init__(self, dataset_idx, distribution_idx, distribution_title,
                 broken_url, status_code):
        validator = "brokenLink"
        message = "Distribution ({}) con 'downloadUrl' ({}) inválida ({})".\
            format(distribution_title, broken_url, status_code)
        validator_value = "Chequea que el 'downloadUrl' devuelva un status " \
                          "code válido"
        path = ['dataset', dataset_idx, 'distribution', distribution_idx,
                'downloadUrl']

        super(BrokenDownloadUrlError, self).__init__(
            validator, message, validator_value, path)


class BaseUnexpectedValue(ValueError):

"""El id de una entidad está repetido en el catálogo."""
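As a usage sketch (values illustrative), each new exception resolves to a 'brokenLink' validation error whose path points into the catalog:

    from pydatajson import custom_exceptions as ce

    # Illustrative values: dataset 0 has an unreachable landingPage.
    # The base class receives validator="brokenLink", the formatted
    # message, a remediation hint, and path=['dataset', 0, 'landingPage'].
    error = ce.BrokenLandingPageError(
        dataset_idx=0,
        dataset_title="Sistema de contrataciones electrónicas",
        broken_url="http://example.com/404",
        status_code=404,
    )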
8 changes: 4 additions & 4 deletions pydatajson/helpers.py
@@ -569,7 +569,7 @@ def fields_to_uppercase(fields):

def is_working_url(url):
    try:
-        response = requests.head(url, timeout=3)
-        return response.status_code in VALID_STATUS_CODES
-    except RequestException:
-        return False
+        response = requests.head(url, timeout=1)
+        return response.status_code in VALID_STATUS_CODES, response.status_code
+    except (RequestException, Exception):
+        return False, None
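A quick usage note: callers must now unpack a (valid, status_code) tuple instead of a bare boolean. A minimal sketch, with an illustrative URL:

    from pydatajson.helpers import is_working_url

    # status_code is None when the request itself failed
    # (timeout, connection error, malformed URL, etc.)
    valid, status_code = is_working_url("http://datos.gob.ar/data.json")
    if not valid:
        print("Broken link, status code: {}".format(status_code))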
2 changes: 1 addition & 1 deletion pydatajson/status_indicators_generator.py
@@ -53,7 +53,7 @@ def _validate_download_urls(self):
        result = 0
        for dataset in self.catalog.get('dataset', []):
            for distribution in dataset.get('distribution', []):
-                valid = is_working_url(distribution.get('downloadURL', ''))
+                valid, _ = is_working_url(distribution.get('downloadURL', ''))
                result += valid
        # Save the result once it has been computed
        self.download_url_ok = result
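One subtlety in result += valid: Python booleans are integers, so accumulating the unpacked flags counts the working URLs directly:

    >>> sum([True, True, False])  # bool is an int subclass
    2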
56 changes: 52 additions & 4 deletions pydatajson/validation.py
@@ -9,12 +9,17 @@
from __future__ import unicode_literals, print_function
from __future__ import with_statement, absolute_import

import logging
import mimetypes
import os
import platform
from collections import Counter

import requests

from pydatajson.constants import VALID_STATUS_CODES
from pydatajson.helpers import is_working_url

try:
    from urlparse import urlparse
except ImportError:
@@ -81,7 +86,7 @@ def _get_errors(self, catalog):
        try:
            for error in self._custom_errors(catalog):
                errors.append(error)
-        except:
+        except Exception as e:
            logger.warning("Error de validación")
        return errors

Expand Down Expand Up @@ -125,7 +130,9 @@ def _custom_errors(self, catalog):
    def _validators(self):
        return [
            self._theme_ids_not_repeated,
-            self._consistent_distribution_fields
+            self._consistent_distribution_fields,
+            self._validate_landing_pages,
+            self._validate_distributions_urls
        ]

    def _theme_ids_not_repeated(self, catalog):
@@ -212,6 +219,47 @@ def _update_validation_response(self, error, response):

return new_response

    def _validate_landing_pages(self, catalog):
        datasets = catalog.get('dataset')
        datasets = filter(lambda x: x.get('landingPage'), datasets)

        for dataset_idx, dataset in enumerate(datasets):
            dataset_title = dataset.get('title')
            landing_page = dataset.get('landingPage')

            valid, status_code = is_working_url(landing_page)
            if not valid:
                yield ce.BrokenLandingPageError(dataset_idx, dataset_title,
                                                landing_page, status_code)

    def _validate_distributions_urls(self, catalog):
        datasets = catalog.get('dataset')

        for dataset_idx, dataset in enumerate(datasets):
            distributions = dataset.get('distribution')

            for distribution_idx, distribution in enumerate(distributions):
                distribution_title = distribution.get('title')
                access_url = distribution.get('accessURL')
                download_url = distribution.get('downloadURL')

                access_url_is_valid, access_url_status_code = \
                    is_working_url(access_url)
                download_url_is_valid, download_url_status_code = \
                    is_working_url(download_url)
                if not access_url_is_valid:
                    yield ce.BrokenAccessUrlError(dataset_idx,
                                                  distribution_idx,
                                                  distribution_title,
                                                  access_url,
                                                  access_url_status_code)
                if not download_url_is_valid:
                    yield ce.BrokenDownloadUrlError(dataset_idx,
                                                    distribution_idx,
                                                    distribution_title,
                                                    download_url,
                                                    download_url_status_code)


def is_valid_catalog(catalog, validator=None):
    """Validates that a `data.json` file complies with the defined schema.
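For orientation, every entry in _validators takes the catalog and yields BaseValidationError instances; a hedged sketch of the consuming side, assuming _custom_errors (called in _get_errors above) simply chains the registered generators:

    def _custom_errors(self, catalog):
        # Sketch under that assumption: run each registered validator
        # and surface its errors lazily.
        for validate in self._validators():
            for error in validate(catalog):
                yield error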
2 changes: 1 addition & 1 deletion requirements_dev.txt
@@ -15,4 +15,4 @@ pycallgraph
setuptools>=38.6
wheel>=0.31
vcrpy
-requests_mock
+requests-mock
10 changes: 5 additions & 5 deletions tests/results/mismatched_downloadURL_and_format.json
@@ -2,10 +2,10 @@
"status": "ERROR",
"error": {
"catalog": {
"status": "OK",
"errors": [],
"status": "OK",
"errors": [],
"title": "Datos Argentina"
},
},
"dataset": [
{
"status": "ERROR",
@@ -25,7 +25,7 @@
"error_code": 2,
"validator_value": "Chequea format y la extension del downloadURL"
}
],
],
"title": "Sistema de contrataciones electr\u00f3nicas"
},
{
@@ -58,7 +58,7 @@
"message": "distribution 'd_7d4d816f-3a40-476e-ab71-d48a3f0eb3ca' tiene distintas extensiones: format ('CSV') y downloadURL ('.puntos')",
"error_code": 2,
"validator_value": "Chequea format y la extension del downloadURL"
},
},
{
"instance": null,
"validator": "mismatchedValue",
7 changes: 4 additions & 3 deletions tests/results/multiple_missing_descriptions.json
@@ -8,7 +8,7 @@
"instance": null,
"validator": "required",
"path": [],
"message": "u'description' is a required property",
"message": "'description' is a required property",
"error_code": 1,
"validator_value": [
"dataset",
@@ -34,7 +34,7 @@
"dataset",
0
],
"message": "u'description' is a required property",
"message": "'description' is a required property",
"error_code": 1,
"validator_value": [
"title",
@@ -43,7 +43,8 @@
"superTheme",
"distribution",
"accrualPeriodicity",
"issued"
"issued",
"identifier"
]
}
],
14 changes: 8 additions & 6 deletions tests/status_indicators_generator_tests.py
@@ -23,12 +23,14 @@ def get_sample(cls, sample_filename):

    @classmethod
    def setUpClass(cls):
-        cls.gen_justicia = StatusIndicatorsGenerator(
-            cls.get_sample('catalogo_justicia.json'))
-        cls.gen_full_data = StatusIndicatorsGenerator(
-            cls.get_sample('full_data.json'))
-        cls.gen_empty = StatusIndicatorsGenerator(
-            cls.get_sample('invalid_catalog_empty.json'))
+        with mock.patch('pydatajson.validation.is_working_url',
+                        return_value=(True, 200)):
+            cls.gen_justicia = StatusIndicatorsGenerator(
+                cls.get_sample('catalogo_justicia.json'))
+            cls.gen_full_data = StatusIndicatorsGenerator(
+                cls.get_sample('full_data.json'))
+            cls.gen_empty = StatusIndicatorsGenerator(
+                cls.get_sample('invalid_catalog_empty.json'))

    def test_just_datasets_cant(self):
        self.assertEqual(16, self.gen_justicia.datasets_cant())
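Since is_working_url now returns a tuple, any test double must return (True, 200) rather than True. The same patching pattern works wherever validation is exercised; a minimal sketch using the target path from these tests:

    import mock  # unittest.mock on Python 3

    with mock.patch('pydatajson.validation.is_working_url',
                    return_value=(True, 200)):
        pass  # code under test sees every URL as reachable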
21 changes: 12 additions & 9 deletions tests/test_catalog_readme.py
@@ -7,6 +7,7 @@
import io
import os.path

import requests_mock
import vcr
from nose.tools import assert_true, assert_equal

@@ -36,21 +37,25 @@ def get_sample(cls, sample_filename):
    @classmethod
    def setUp(cls):
        cls.catalog = cls.get_sample("several_datasets_for_harvest.json")
        cls.requests_mock = requests_mock.Mocker()
        cls.requests_mock.start()
        cls.requests_mock.get(requests_mock.ANY, real_http=True)
        cls.requests_mock.head(requests_mock.ANY, status_code=200)

    @classmethod
    def tearDown(cls):
        cls.requests_mock.stop()

    @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
-    def test_generate_readme(self, _mock_check):
+    def test_generate_readme(self):
        with io.open(os.path.join(self.RESULTS_DIR, "catalog_readme.md"), 'r',
                     encoding='utf-8') as expected_readme_file:
            expected_readme = expected_readme_file.read()
        readme = generate_readme(self.catalog)
        assert_equal(expected_readme, readme)

    @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
-    def test_readme_file_write(self, _mock_check):
+    def test_readme_file_write(self):
        actual_filename = os.path.join(self.TEMP_DIR, "catalog_readme.md")
        expected_filename = os.path.join(self.RESULTS_DIR, "catalog_readme.md")
        generate_readme(self.catalog, export_path=actual_filename)
@@ -65,10 +70,8 @@ def test_readme_file_write(self, _mock_check):
        assert_true(comparison)

    @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
    @mock.patch('pydatajson.indicators._federation_indicators')
-    def test_readme_null_indicators(self, mock_indicators, _mock_check):
+    def test_readme_null_indicators(self, mock_indicators):
        mock_indicators.return_value = {
            'datasets_federados_cant': None,
            'datasets_federados_pct': None,
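The requests_mock.Mocker setup above lets GET requests pass through to the network (so the VCR cassettes still drive them), while every HEAD request (the method is_working_url issues) gets a canned 200. A self-contained sketch of the same pattern:

    import requests
    import requests_mock

    with requests_mock.Mocker() as m:
        m.get(requests_mock.ANY, real_http=True)    # GETs hit the network
        m.head(requests_mock.ANY, status_code=200)  # every HEAD "works"
        assert requests.head("http://example.com").status_code == 200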
19 changes: 17 additions & 2 deletions tests/test_core.py
@@ -13,7 +13,7 @@
from nose.tools import assert_true, assert_false, assert_equal
from nose.tools import assert_list_equal, assert_raises
from six import iteritems

import requests_mock

try:
    import mock
@@ -44,10 +44,15 @@ def setUp(cls):
            cls.get_sample("full_data.json"))
        cls.maxDiff = None
        cls.longMessage = True
        cls.requests_mock = requests_mock.Mocker()
        cls.requests_mock.start()
        cls.requests_mock.get(requests_mock.ANY, real_http=True)
        cls.requests_mock.head(requests_mock.ANY, status_code=200)

    @classmethod
    def tearDown(cls):
        del (cls.dj)
        cls.requests_mock.stop()

    # catalog_report TESTS
    # Expected report for "full_data.json", with harvest = 0
@@ -145,9 +150,13 @@ def tearDown(cls):
                  (u'dataset_temporal', u'2015-01-01/2015-12-31'),
                  (u'notas', u'No tiene distribuciones con datos.')])]

    LANDING_PAGE = 'http://datos.gob.ar/dataset/' \
                   'sistema-de-contrataciones-electronicas-argentina-compra'

    def test_catalog_report_harvest_good(self):
        """catalog_report() will mark datasets with valid metadata
        for harvest if harvest='valid'."""

        catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")

        actual = self.dj.catalog_report(
@@ -166,6 +175,7 @@ def test_catalog_report_harvest_good(self):
    def test_catalog_report_harvest_valid(self):
        """catalog_report() will mark datasets with valid metadata
        for harvest if harvest='valid'."""

        catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")

        actual = self.dj.catalog_report(
@@ -184,6 +194,7 @@ def test_catalog_report_harvest_valid(self):
    def test_catalog_report_harvest_none(self):
        """catalog_report() will not mark any dataset for harvest if
        harvest='none'."""

        catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")

        actual = self.dj.catalog_report(
@@ -554,7 +565,11 @@ def test_generate_datasets_summary(self):
"""Genera informe conciso sobre datasets correctamente."""
catalog = os.path.join(self.SAMPLES_DIR,
"several_datasets_for_harvest.json")
actual = self.dj.generate_datasets_summary(catalog)

with mock.patch('pydatajson.validation.is_working_url',
return_value=(True, 200)):
actual = self.dj.generate_datasets_summary(catalog)

        expected = [
            OrderedDict([('indice', 0),
                         ('titulo',
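Taken together, the changes make catalog validation network-dependent: every landingPage, accessURL and downloadURL is HEAD-requested during validation. A hedged end-to-end sketch (the file path is illustrative, and it assumes is_valid_catalog accepts a parsed catalog dict, as elsewhere in pydatajson):

    import json

    from pydatajson.validation import is_valid_catalog

    with open("data.json") as f:  # illustrative path
        catalog = json.load(f)

    # With this PR, broken links surface as 'brokenLink' validation
    # errors alongside the schema errors.
    print(is_valid_catalog(catalog))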