Merge pull request #281 from datosgobar/252-borken-links-validation

252 broken links validation
FScaccheri committed Sep 27, 2019
2 parents 19944ae + ba3458e commit ccddd6e
Showing 14 changed files with 268 additions and 149 deletions.
51 changes: 50 additions & 1 deletion pydatajson/custom_exceptions.py
@@ -3,10 +3,12 @@
 
 """Excepciones personalizadas para validación y registro de errores"""
 
-from __future__ import unicode_literals
 from __future__ import print_function
+from __future__ import unicode_literals
 from __future__ import with_statement
 
+import os
+
 try:
     from urlparse import urlparse
 except ImportError:
@@ -99,6 +101,53 @@ def __init__(self, dataset_idx, distribution_idx, distribution, attribute):
             validator, message, validator_value, path)
 
 
+class BrokenLandingPageError(BaseValidationError):
+
+    def __init__(self, dataset_idx, dataset_title, broken_url, status_code):
+
+        validator = "brokenLink"
+        message = "Dataset ({}) con 'landingPage' ({}) inválida ({})".format(
+            dataset_title, broken_url, status_code)
+        validator_value = "Chequea que la 'landingPage' devuelva un status " \
+                          "code válido"
+        path = ['dataset', dataset_idx, 'landingPage']
+
+        super(BrokenLandingPageError, self).__init__(
+            validator, message, validator_value, path)
+
+
+class BrokenAccessUrlError(BaseValidationError):
+
+    def __init__(self, dataset_idx, distribution_idx,
+                 distribution_title, broken_url, status_code):
+        validator = "brokenLink"
+        message = "Distribution ({}) con 'accessUrl' ({}) inválida ({})".\
+            format(distribution_title, broken_url, status_code)
+        validator_value = "Chequea que el 'accessUrl' devuelva un status " \
+                          "code válido"
+        path = ['dataset', dataset_idx, 'distribution', distribution_idx,
+                'accessUrl']
+
+        super(BrokenAccessUrlError, self).__init__(
+            validator, message, validator_value, path)
+
+
+class BrokenDownloadUrlError(BaseValidationError):
+
+    def __init__(self, dataset_idx, distribution_idx, distribution_title,
+                 broken_url, status_code):
+        validator = "brokenLink"
+        message = "Distribution ({}) con 'downloadUrl' ({}) inválida ({})".\
+            format(distribution_title, broken_url, status_code)
+        validator_value = "Chequea que el 'downloadUrl' devuelva un status " \
+                          "code válido"
+        path = ['dataset', dataset_idx, 'distribution', distribution_idx,
+                'downloadUrl']
+
+        super(BrokenDownloadUrlError, self).__init__(
+            validator, message, validator_value, path)
+
+
 class BaseUnexpectedValue(ValueError):
 
     """El id de una entidad está repetido en el catálogo."""
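
The three new classes share one pattern: build a human-readable message and a machine-usable path into the catalog, then delegate to BaseValidationError. A minimal sketch of constructing one, with hypothetical values shaped like the ones the validators below pass in:

    from pydatajson import custom_exceptions as ce

    # Hypothetical inputs: dataset 0 declared a landingPage that answered 404.
    error = ce.BrokenLandingPageError(
        0, 'Sistema de contrataciones electrónicas',
        'http://example.com/gone', 404)
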
8 changes: 4 additions & 4 deletions pydatajson/helpers.py
@@ -569,7 +569,7 @@ def fields_to_uppercase(fields):
 
 def is_working_url(url):
     try:
-        response = requests.head(url, timeout=3)
-        return response.status_code in VALID_STATUS_CODES
-    except RequestException:
-        return False
+        response = requests.head(url, timeout=1)
+        return response.status_code in VALID_STATUS_CODES, response.status_code
+    except (RequestException, Exception):
+        return False, None
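
This signature change is the heart of the PR: is_working_url() now returns a (valid, status_code) pair instead of a bare bool, so callers can report which HTTP status made a link invalid; when the HEAD request fails before any response arrives, the status is None. The new contract in use:

    from pydatajson.helpers import is_working_url

    # status_code is None when no response came back at all.
    valid, status_code = is_working_url('http://datos.gob.ar')
    if not valid:
        print('broken link, status:', status_code)

Every caller that used to truth-test the old return value must now unpack the pair, which is what the remaining files in this diff do.
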
2 changes: 1 addition & 1 deletion pydatajson/status_indicators_generator.py
@@ -53,7 +53,7 @@ def _validate_download_urls(self):
         result = 0
         for dataset in self.catalog.get('dataset', []):
             for distribution in dataset.get('distribution', []):
-                valid = is_working_url(distribution.get('downloadURL', ''))
+                valid, _ = is_working_url(distribution.get('downloadURL', ''))
                 result += valid
         # Guardo el resultado una vez calculado
         self.download_url_ok = result
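
Only the flag matters here, so the status code is discarded into _. The counter still works because the first tuple element remains a bool and Python sums True as 1:

    # bool is an int subclass, so validity flags can be summed directly:
    checks = [(True, 200), (False, None), (True, 301)]
    ok = sum(valid for valid, _ in checks)  # 2
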
56 changes: 52 additions & 4 deletions pydatajson/validation.py
@@ -9,12 +9,17 @@
 from __future__ import unicode_literals, print_function
 from __future__ import with_statement, absolute_import
 
+import logging
+import mimetypes
 import os
 import platform
-import mimetypes
-import logging
 from collections import Counter
 
+import requests
+
+from pydatajson.constants import VALID_STATUS_CODES
+from pydatajson.helpers import is_working_url
+
 try:
     from urlparse import urlparse
 except ImportError:
@@ -81,7 +86,7 @@ def _get_errors(self, catalog):
         try:
             for error in self._custom_errors(catalog):
                 errors.append(error)
-        except:
+        except Exception as e:
             logger.warning("Error de validación")
         return errors
 
@@ -125,7 +130,9 @@ def _custom_errors(self, catalog):
     def _validators(self):
         return [
             self._theme_ids_not_repeated,
-            self._consistent_distribution_fields
+            self._consistent_distribution_fields,
+            self._validate_landing_pages,
+            self._validate_distributions_urls
         ]
 
     def _theme_ids_not_repeated(self, catalog):
@@ -212,6 +219,47 @@ def _update_validation_response(self, error, response):
 
         return new_response
 
+    def _validate_landing_pages(self, catalog):
+        datasets = catalog.get('dataset')
+        datasets = filter(lambda x: x.get('landingPage'), datasets)
+
+        for dataset_idx, dataset in enumerate(datasets):
+            dataset_title = dataset.get('title')
+            landing_page = dataset.get('landingPage')
+
+            valid, status_code = is_working_url(landing_page)
+            if not valid:
+                yield ce.BrokenLandingPageError(dataset_idx, dataset_title,
+                                                landing_page, status_code)
+
+    def _validate_distributions_urls(self, catalog):
+        datasets = catalog.get('dataset')
+
+        for dataset_idx, dataset in enumerate(datasets):
+            distributions = dataset.get('distribution')
+
+            for distribution_idx, distribution in enumerate(distributions):
+                distribution_title = distribution.get('title')
+                access_url = distribution.get('accessURL')
+                download_url = distribution.get('downloadURL')
+
+                access_url_is_valid, access_url_status_code = \
+                    is_working_url(access_url)
+                download_url_is_valid, download_url_status_code = \
+                    is_working_url(download_url)
+                if not access_url_is_valid:
+                    yield ce.BrokenAccessUrlError(dataset_idx,
+                                                  distribution_idx,
+                                                  distribution_title,
+                                                  access_url,
+                                                  access_url_status_code)
+                if not download_url_is_valid:
+                    yield ce.BrokenDownloadUrlError(dataset_idx,
+                                                    distribution_idx,
+                                                    distribution_title,
+                                                    download_url,
+                                                    download_url_status_code)
 
 
 def is_valid_catalog(catalog, validator=None):
     """Valida que un archivo `data.json` cumpla con el schema definido.
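
With the two generators appended to _validators(), every custom-validation pass now sends a HEAD request (1-second timeout, per helpers.py above) for each landingPage, accessURL and downloadURL, yielding a brokenLink error for any URL that does not answer a valid status code. A sketch of the observable effect, assuming is_valid_catalog() accepts a catalog path the way other pydatajson entry points do:

    from pydatajson.validation import is_valid_catalog

    # Needs network access: each URL in the catalog is HEAD-checked.
    # The tests below stub is_working_url instead of going online.
    print(is_valid_catalog('tests/samples/full_data.json'))
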
2 changes: 1 addition & 1 deletion requirements_dev.txt
@@ -15,4 +15,4 @@ pycallgraph
 setuptools>=38.6
 wheel>=0.31
 vcrpy
-requests_mock
+requests-mock
10 changes: 5 additions & 5 deletions tests/results/mismatched_downloadURL_and_format.json
@@ -2,10 +2,10 @@
   "status": "ERROR",
   "error": {
     "catalog": {
-      "status": "OK", 
-      "errors": [], 
+      "status": "OK",
+      "errors": [],
       "title": "Datos Argentina"
-    }, 
+    },
     "dataset": [
       {
         "status": "ERROR",
@@ -25,7 +25,7 @@
           "error_code": 2,
           "validator_value": "Chequea format y la extension del downloadURL"
         }
-      ], 
+      ],
       "title": "Sistema de contrataciones electr\u00f3nicas"
     },
     {
@@ -58,7 +58,7 @@
           "message": "distribution 'd_7d4d816f-3a40-476e-ab71-d48a3f0eb3ca' tiene distintas extensiones: format ('CSV') y downloadURL ('.puntos')",
           "error_code": 2,
           "validator_value": "Chequea format y la extension del downloadURL"
-        }, 
+        },
         {
           "instance": null,
           "validator": "mismatchedValue",
7 changes: 4 additions & 3 deletions tests/results/multiple_missing_descriptions.json
@@ -8,7 +8,7 @@
       "instance": null,
       "validator": "required",
       "path": [],
-      "message": "u'description' is a required property",
+      "message": "'description' is a required property",
       "error_code": 1,
       "validator_value": [
         "dataset",
@@ -34,7 +34,7 @@
         "dataset",
         0
       ],
-      "message": "u'description' is a required property",
+      "message": "'description' is a required property",
      "error_code": 1,
       "validator_value": [
         "title",
@@ -43,7 +43,8 @@
         "superTheme",
         "distribution",
         "accrualPeriodicity",
-        "issued"
+        "issued",
+        "identifier"
       ]
     }
   ],
14 changes: 8 additions & 6 deletions tests/status_indicators_generator_tests.py
@@ -23,12 +23,14 @@ def get_sample(cls, sample_filename):
 
     @classmethod
     def setUpClass(cls):
-        cls.gen_justicia = StatusIndicatorsGenerator(
-            cls.get_sample('catalogo_justicia.json'))
-        cls.gen_full_data = StatusIndicatorsGenerator(
-            cls.get_sample('full_data.json'))
-        cls.gen_empty = StatusIndicatorsGenerator(
-            cls.get_sample('invalid_catalog_empty.json'))
+        with mock.patch('pydatajson.validation.is_working_url',
+                        return_value=(True, 200)):
+            cls.gen_justicia = StatusIndicatorsGenerator(
+                cls.get_sample('catalogo_justicia.json'))
+            cls.gen_full_data = StatusIndicatorsGenerator(
+                cls.get_sample('full_data.json'))
+            cls.gen_empty = StatusIndicatorsGenerator(
+                cls.get_sample('invalid_catalog_empty.json'))
 
     def test_just_datasets_cant(self):
         self.assertEqual(16, self.gen_justicia.datasets_cant())
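
Note the patch target: validation.py imports is_working_url into its own namespace, so the test patches the name where it is looked up, pydatajson.validation.is_working_url, and the stub returns the new (valid, status_code) pair. The same pattern in isolation:

    import mock  # on Python 3: from unittest import mock

    with mock.patch('pydatajson.validation.is_working_url',
                    return_value=(True, 200)):
        pass  # code run here sees every URL as working: no network I/O
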
21 changes: 12 additions & 9 deletions tests/test_catalog_readme.py
@@ -7,6 +7,7 @@
 import io
 import os.path
 
+import requests_mock
 import vcr
 from nose.tools import assert_true, assert_equal
 
@@ -36,21 +37,25 @@ def get_sample(cls, sample_filename):
     @classmethod
     def setUp(cls):
         cls.catalog = cls.get_sample("several_datasets_for_harvest.json")
+        cls.requests_mock = requests_mock.Mocker()
+        cls.requests_mock.start()
+        cls.requests_mock.get(requests_mock.ANY, real_http=True)
+        cls.requests_mock.head(requests_mock.ANY, status_code=200)
+
+    @classmethod
+    def tearDown(cls):
+        cls.requests_mock.stop()
 
     @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
-    def test_generate_readme(self, _mock_check):
+    def test_generate_readme(self):
         with io.open(os.path.join(self.RESULTS_DIR, "catalog_readme.md"), 'r',
                      encoding='utf-8') as expected_readme_file:
             expected_readme = expected_readme_file.read()
         readme = generate_readme(self.catalog)
         assert_equal(expected_readme, readme)
 
     @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
-    def test_readme_file_write(self, _mock_check):
+    def test_readme_file_write(self):
         actual_filename = os.path.join(self.TEMP_DIR, "catalog_readme.md")
         expected_filename = os.path.join(self.RESULTS_DIR, "catalog_readme.md")
         generate_readme(self.catalog, export_path=actual_filename)
@@ -65,10 +70,8 @@ def test_readme_file_write(self, _mock_check):
         assert_true(comparison)
 
     @my_vcr.use_cassette()
-    @mock.patch('pydatajson.status_indicators_generator.is_working_url',
-                return_value=True)
     @mock.patch('pydatajson.indicators._federation_indicators')
-    def test_readme_null_indicators(self, mock_indicators, _mock_check):
+    def test_readme_null_indicators(self, mock_indicators):
         mock_indicators.return_value = {
             'datasets_federados_cant': None,
             'datasets_federados_pct': None,
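
Rather than stubbing is_working_url, these tests install a requests-mock Mocker around each test: every HEAD request gets a canned 200 (which satisfies the new link checks), while GET requests are allowed through (real_http=True) so the recorded vcr cassettes keep working. The core of the pattern, as a context manager:

    import requests
    import requests_mock

    with requests_mock.Mocker() as m:
        m.head(requests_mock.ANY, status_code=200)
        print(requests.head('http://example.com/any/url').status_code)  # 200
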
19 changes: 17 additions & 2 deletions tests/test_core.py
@@ -13,7 +13,7 @@
 from nose.tools import assert_true, assert_false, assert_equal
 from nose.tools import assert_list_equal, assert_raises
 from six import iteritems
-
+import requests_mock
 
 try:
     import mock
@@ -44,10 +44,15 @@ def setUp(cls):
             cls.get_sample("full_data.json"))
         cls.maxDiff = None
         cls.longMessage = True
+        cls.requests_mock = requests_mock.Mocker()
+        cls.requests_mock.start()
+        cls.requests_mock.get(requests_mock.ANY, real_http=True)
+        cls.requests_mock.head(requests_mock.ANY, status_code=200)
 
     @classmethod
     def tearDown(cls):
         del (cls.dj)
+        cls.requests_mock.stop()
 
     # TESTS DE catalog_report
     # Reporte esperado para "full_data.json", con harvest = 0
@@ -145,9 +150,13 @@ def tearDown(cls):
                       (u'dataset_temporal', u'2015-01-01/2015-12-31'),
                       (u'notas', u'No tiene distribuciones con datos.')])]
 
+    LANDING_PAGE = 'http://datos.gob.ar/dataset/' \
+                   'sistema-de-contrataciones-electronicas-argentina-compra'
+
     def test_catalog_report_harvest_good(self):
         """catalog_report() marcará para cosecha los datasets con metadata
         válida si harvest='valid'."""
+
         catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")
 
         actual = self.dj.catalog_report(
@@ -166,6 +175,7 @@ def test_catalog_report_harvest_good(self):
     def test_catalog_report_harvest_valid(self):
         """catalog_report() marcará para cosecha los datasets con metadata
         válida si harvest='valid'."""
+
         catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")
 
         actual = self.dj.catalog_report(
@@ -184,6 +194,7 @@ def test_catalog_report_harvest_valid(self):
     def test_catalog_report_harvest_none(self):
         """catalog_report() no marcará ningún dataset para cosecha si
         harvest='none'."""
+
         catalog = os.path.join(self.SAMPLES_DIR, "full_data.json")
 
         actual = self.dj.catalog_report(
@@ -554,7 +565,11 @@ def test_generate_datasets_summary(self):
         """Genera informe conciso sobre datasets correctamente."""
         catalog = os.path.join(self.SAMPLES_DIR,
                                "several_datasets_for_harvest.json")
-        actual = self.dj.generate_datasets_summary(catalog)
+
+        with mock.patch('pydatajson.validation.is_working_url',
+                        return_value=(True, 200)):
+            actual = self.dj.generate_datasets_summary(catalog)
+
         expected = [
             OrderedDict([('indice', 0),
                          ('titulo',
