Commit 4a9006d

Merge pull request #295 from datosgobar/261-validacion-electiva-de-ssl

validacion electiva de ssl (optional SSL validation)

AWolfsdorf committed Oct 22, 2019
2 parents fc3c27d + 5881dfc commit 4a9006d
Showing 23 changed files with 565 additions and 305 deletions.
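Taken together, these changes thread a new verify_ssl flag from the library's public entry points down to the HTTP checks, so that catalogs whose URLs sit behind self-signed or misconfigured SSL certificates can still be validated. A minimal usage sketch in Python — assuming the DataJson constructor accepts the flag and stores it as the self.verify_ssl attribute used throughout this diff (only the attribute, not the constructor change, is visible in the hunks below):

from pydatajson import DataJson

# verify_ssl=True (the default) keeps strict certificate checking;
# verify_ssl=False relaxes it for every check made through this instance.
catalog = DataJson("http://datos.gob.ar/data.json", verify_ssl=False)

# The flag is forwarded automatically via self.verify_ssl.
print(catalog.is_valid_catalog(broken_links=True))
report = catalog.validate_catalog(only_errors=True)
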
13 changes: 7 additions & 6 deletions pydatajson/catalog_readme.py
@@ -6,14 +6,14 @@
 from __future__ import with_statement

 import io
-import os
 import logging
+import os

 from six import string_types

 from pydatajson.helpers import traverse_dict
-from pydatajson.readers import read_catalog
 from pydatajson.indicators import generate_catalogs_indicators
+from pydatajson.readers import read_catalog
 from pydatajson.validation import validate_catalog

 logger = logging.getLogger('pydatajson')
@@ -23,13 +23,14 @@
 TEMPLATES_PATH = os.path.join(ABSOLUTE_PROJECT_DIR, "templates")


-def generate_catalog_readme(_datajson, catalog, export_path=None):
+def generate_catalog_readme(_datajson, catalog,
+                            export_path=None, verify_ssl=True):
     """Este método está para mantener retrocompatibilidad con versiones
     anteriores. Se ignora el argumento _data_json."""
-    return generate_readme(catalog, export_path)
+    return generate_readme(catalog, export_path, verify_ssl=verify_ssl)


-def generate_readme(catalog, export_path=None):
+def generate_readme(catalog, export_path=None, verify_ssl=True):
     """Genera una descripción textual en formato Markdown sobre los
     metadatos generales de un catálogo (título, editor, fecha de
     publicación, et cetera), junto con:
@@ -59,7 +60,7 @@ def generate_readme(catalog, export_path=None):
     catalog_path_or_url = None

     catalog = read_catalog(catalog)
-    validation = validate_catalog(catalog)
+    validation = validate_catalog(catalog, verify_ssl=verify_ssl)
     # Solo necesito indicadores para un catalogo
     indicators = generate_catalogs_indicators(
         catalog, CENTRAL_CATALOG)[0][0]
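With the signature above, callers opt out of certificate verification per call; a short sketch (file paths are illustrative):

from pydatajson.catalog_readme import generate_readme

# Writes a Markdown summary of the catalog's metadata; verify_ssl=False
# relaxes certificate checking in the underlying validate_catalog call.
generate_readme("data.json", export_path="README.md", verify_ssl=False)
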
23 changes: 14 additions & 9 deletions pydatajson/core.py
@@ -12,9 +12,9 @@
 from __future__ import with_statement

 import json
+import logging
 import os.path
 import sys
-import logging
 from collections import OrderedDict
 from datetime import datetime

@@ -25,17 +25,17 @@
 from pydatajson.response_formatters import format_response
 from pydatajson.validation import Validator, \
     DEFAULT_CATALOG_SCHEMA_FILENAME, ABSOLUTE_SCHEMA_DIR
+from . import backup
+from . import catalog_readme
 from . import documentation, constants
+from . import federation
 from . import helpers
 from . import indicators
 from . import readers
 from . import search
 from . import time_series
-from . import writers
-from . import federation
 from . import transformation
-from . import backup
-from . import catalog_readme
+from . import writers

 logger = logging.getLogger('pydatajson')

@@ -250,7 +250,9 @@ def is_valid_catalog(self, catalog=None, broken_links=False):
             bool: True si el data.json cumple con el schema, sino False.
         """
         catalog = self._read_catalog(catalog) if catalog else self
-        return self.validator.is_valid(catalog, broken_links=broken_links)
+        return self.validator.is_valid(catalog,
+                                       broken_links=broken_links,
+                                       verify_ssl=self.verify_ssl)

     @staticmethod
     def _update_validation_response(error, response):
@@ -342,8 +344,10 @@ def validate_catalog(self, catalog=None, only_errors=False, fmt="dict",
         """
         catalog = self._read_catalog(catalog) if catalog else self

-        validation = self.validator.validate_catalog(catalog, only_errors,
-                                                     broken_links)
+        validation = self.validator.validate_catalog(catalog,
+                                                     only_errors,
+                                                     broken_links,
+                                                     self.verify_ssl)
         if export_path:
             fmt = 'table'

@@ -963,7 +967,8 @@ def generate_catalogs_indicators(self, catalogs=None,
         catalogs = catalogs or self
         return indicators.generate_catalogs_indicators(
             catalogs, central_catalog, identifier_search=identifier_search,
-            validator=self.validator, broken_links=broken_links)
+            validator=self.validator, broken_links=broken_links,
+            verify_ssl=self.verify_ssl)

     def _count_fields_recursive(self, dataset, fields):
         """Cuenta la información de campos optativos/recomendados/requeridos
39 changes: 8 additions & 31 deletions pydatajson/helpers.py
@@ -3,30 +3,23 @@

 """Métodos auxiliares"""

-from __future__ import unicode_literals
 from __future__ import print_function
+from __future__ import unicode_literals
 from __future__ import with_statement

-from datetime import datetime
-import os
 import json
-import re
 import logging
+import os
+import re
 import tempfile

 from contextlib import contextmanager
+from datetime import datetime

-import requests
 from openpyxl import load_workbook
-from requests import RequestException, Timeout
-from six.moves.urllib_parse import urlparse
-
 from six import string_types, iteritems
+from six.moves.urllib_parse import urlparse
 from unidecode import unidecode

-from pydatajson.constants import \
-    INVALID_STATUS_CODES_REGEX, \
-    EXCEPTION_STATUS_CODES
 from pydatajson.download import download_to_file

 logger = logging.getLogger('pydatajson.helpers')
@@ -49,14 +42,13 @@


 def count_distribution_formats_dataset(dataset):
-
     formats = {}
     for distribution in dataset['distribution']:
-        # 'format' es recomendado, no obligatorio. Puede no estar.
+        # 'format' es recomendado, no obligatorio. Puede no estar.
         distribution_format = distribution.get('format', None)

         if distribution_format:
-            # Si no está en el diccionario, devuelvo 0
+            # Si no está en el diccionario, devuelvo 0
             count = formats.get(distribution_format, 0)

             formats[distribution_format] = count + 1
@@ -549,7 +541,7 @@ def filter_by_likely_publisher(central_datasets, catalog_datasets):

 def title_in_dataset_list(dataset, dataset_list):
     return (dataset.get('title'), dataset.get('landingPage')) \
-        in dataset_list
+           in dataset_list


 def fields_to_uppercase(fields):
@@ -568,18 +560,3 @@ def fields_to_uppercase(fields):
             uppercase_counts + lowercase_counts + counts

     return uppercase_fields
-
-
-def is_working_url(url):
-    try:
-        response = requests.head(url, timeout=1)
-        matches = []
-        if response.status_code not in EXCEPTION_STATUS_CODES:
-            matches = \
-                [re.match(pattern, str(response.status_code)) is not None
-                 for pattern in INVALID_STATUS_CODES_REGEX]
-        return True not in matches, response.status_code
-    except Timeout:
-        return False, 408
-    except (RequestException, Exception):
-        return False, None
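The helper removed above does not disappear from the codebase: its job moves into the validators package, where the request can honor the new flag (the DistributionDownloadUrlsValidator imported in status_indicators_generator.py below; its module is part of this changeset but not shown in these hunks). A hypothetical sketch of the relocated check, with the regex-based status-code matching simplified to a plain < 400 test:

import requests
from requests import RequestException, Timeout

def is_working_url(url, verify_ssl=True, timeout=1):
    # Same (ok, status_code) contract as the removed helper, plus SSL control.
    try:
        # verify=False disables certificate validation for this request only.
        response = requests.head(url, timeout=timeout, verify=verify_ssl)
        return response.status_code < 400, response.status_code
    except Timeout:
        return False, 408
    except RequestException:
        return False, None
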
29 changes: 19 additions & 10 deletions pydatajson/indicators.py
@@ -19,10 +19,10 @@
 from six import string_types

 from pydatajson.helpers import fields_to_uppercase
+from pydatajson.status_indicators_generator import StatusIndicatorsGenerator
 from . import helpers
 from . import readers
 from .federation_indicators_generator import FederationIndicatorsGenerator
-from pydatajson.status_indicators_generator import StatusIndicatorsGenerator
 from .search import get_datasets, get_distributions

 CENTRAL_CATALOG = "http://datos.gob.ar/data.json"
@@ -46,7 +46,8 @@ def generate_numeric_indicators(catalog, validator=None):
 def generate_catalogs_indicators(catalogs, central_catalog=None,
                                  identifier_search=False,
                                  broken_links=False,
-                                 validator=None):
+                                 validator=None,
+                                 verify_ssl=True):
     """Genera una lista de diccionarios con varios indicadores sobre
     los catálogos provistos, tales como la cantidad de datasets válidos,
     días desde su última fecha actualizada, entre otros.
@@ -83,7 +84,8 @@ def generate_catalogs_indicators(catalogs, central_catalog=None,
             continue

         fields_count, result = _generate_indicators(
-            catalog, validator=validator, broken_links=broken_links)
+            catalog, validator=validator,
+            broken_links=broken_links, verify_ssl=verify_ssl)
         if central_catalog:
             result.update(_federation_indicators(
                 catalog, central_catalog, identifier_search=identifier_search))
@@ -113,7 +115,7 @@


 def _generate_indicators(catalog, validator=None, only_numeric=False,
-                         broken_links=False):
+                         broken_links=False, verify_ssl=True):
     """Genera los indicadores de un catálogo individual.

     Args:
@@ -126,12 +128,14 @@ def _generate_indicators(catalog, validator=None, only_numeric=False,
     result = {}

     # Obtengo summary para los indicadores del estado de los metadatos
-    result.update(_generate_status_indicators(catalog, validator=validator))
+    result.update(_generate_status_indicators(catalog, validator=validator,
+                                              verify_ssl=verify_ssl))

     # Genero indicadores relacionados con validacion de urls
     if broken_links:
         result.update(_generate_valid_urls_indicators(catalog,
-                                                      validator=validator))
+                                                      validator=validator,
+                                                      verify_ssl=verify_ssl))

     # Genero los indicadores relacionados con fechas, y los agrego
     result.update(
@@ -275,7 +279,7 @@ def _network_indicator_percentages(fields, network_indicators):
     })


-def _generate_status_indicators(catalog, validator=None):
+def _generate_status_indicators(catalog, validator=None, verify_ssl=True):
     """Genera indicadores básicos sobre el estado de un catálogo

     Args:
@@ -296,7 +300,9 @@ def _generate_status_indicators(catalog, validator=None, verify_ssl=True):
         'datasets_con_datos_pct': None
     }
     try:
-        generator = StatusIndicatorsGenerator(catalog, validator=validator)
+        generator = StatusIndicatorsGenerator(catalog,
+                                              validator=validator,
+                                              verify_ssl=verify_ssl)
     except Exception as e:
         msg = u'Error generando resumen del catálogo {}: {}'.format(
             catalog['title'], str(e))
@@ -556,7 +562,7 @@ def _eventual_periodicity(periodicity):
     return periodicity in ('eventual', 'EVENTUAL')


-def _generate_valid_urls_indicators(catalog, validator=None):
+def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):
    """Genera indicadores sobre el estado de las urls de distribuciones

     Args:
@@ -568,7 +574,10 @@ def _generate_valid_urls_indicators(catalog, validator=None, verify_ssl=True):

     result = {}
     try:
-        generator = StatusIndicatorsGenerator(catalog, validator=validator)
+        generator = \
+            StatusIndicatorsGenerator(catalog,
+                                      validator=validator,
+                                      verify_ssl=verify_ssl)
     except Exception as e:
         msg = u'Error generando resumen del catálogo {}: {}'.format(
             catalog['title'], str(e))
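At the indicators level the flag rides along with broken_links; a usage sketch, relying on the (per-catalog list, network totals) return pair implied by the [0][0] indexing seen in catalog_readme.py above:

from pydatajson import indicators

# One indicators dict per catalog; verify_ssl reaches both the status
# summary and the URL checks triggered by broken_links=True.
catalogs_indics, network_indics = indicators.generate_catalogs_indicators(
    ["data.json"], broken_links=True, verify_ssl=False)
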
14 changes: 8 additions & 6 deletions pydatajson/reporting.py
@@ -6,19 +6,19 @@
 Contiene los métodos para generar reportes sobre un catálogo.
 """

-from __future__ import unicode_literals, print_function,\
+from __future__ import unicode_literals, print_function, \
     with_statement, absolute_import

 from collections import OrderedDict

 from pydatajson import writers
-from .validation import validate_catalog
-
-from . import readers
 from . import helpers
+from . import readers
+from .validation import validate_catalog


-def generate_datasets_summary(catalog, export_path=None, validator=None):
+def generate_datasets_summary(catalog, export_path=None,
+                              validator=None, verify_ssl=True):
     """Genera un informe sobre los datasets presentes en un catálogo,
     indicando para cada uno:
     - Índice en la lista catalog["dataset"]
@@ -53,7 +53,9 @@ def generate_datasets_summary(catalog, export_path=None,
     datasets = []

     validation = validate_catalog(
-        catalog, validator=validator)["error"]["dataset"]
+        catalog,
+        validator=validator,
+        verify_ssl=verify_ssl)["error"]["dataset"]

     def info_dataset(index, dataset):
         """Recolecta información básica de un dataset."""
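A matching usage sketch for the reporting entry point ("data.json" is an illustrative path):

from pydatajson.reporting import generate_datasets_summary

# One summary row per dataset; verify_ssl is handed straight to
# validate_catalog, which produces the per-dataset validation status.
summary = generate_datasets_summary("data.json", verify_ssl=False)
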
38 changes: 13 additions & 25 deletions pydatajson/status_indicators_generator.py
@@ -1,17 +1,19 @@
-from pydatajson import threading_helper
-from pydatajson import constants
-from pydatajson.helpers import is_working_url
 from pydatajson.readers import read_catalog
 from pydatajson.reporting import generate_datasets_summary
+from pydatajson.validators\
+    .distribution_download_urls_validator \
+    import DistributionDownloadUrlsValidator


 class StatusIndicatorsGenerator(object):

-    def __init__(self, catalog, validator=None):
+    def __init__(self, catalog, validator=None, verify_ssl=True):
         self.download_url_ok = None
         self.catalog = read_catalog(catalog)
         self.summary = generate_datasets_summary(self.catalog,
-                                                 validator=validator)
+                                                 validator=validator,
+                                                 verify_ssl=verify_ssl)
+        self.verify_url = verify_ssl

     def datasets_cant(self):
         return len(self.summary)
@@ -38,7 +40,12 @@ def datasets_con_datos_pct(self):
         return self._get_dataset_percentage(self.datasets_con_datos_cant)

     def distribuciones_download_url_ok_cant(self):
-        return self.download_url_ok or self._validate_download_urls()
+        if self.download_url_ok:
+            return self.download_url_ok
+        validator = DistributionDownloadUrlsValidator(
+            self.catalog, self.verify_url)
+        self.download_url_ok = validator.validate()
+        return self.download_url_ok

     def distribuciones_download_url_error_cant(self):
         return self.distribuciones_cant() - \
@@ -51,25 +58,6 @@ def distribuciones_download_url_ok_pct(self):
         return \
             round(float(self.distribuciones_download_url_ok_cant()) / total, 4)

-    def _validate_download_urls(self):
-        async_results = []
-        for dataset in self.catalog.get('dataset', []):
-            distribution_urls = \
-                [distribution.get('downloadURL', '')
-                 for distribution in dataset.get('distribution', [])]
-            async_results += threading_helper\
-                .apply_threading(distribution_urls,
-                                 is_working_url,
-                                 constants.CANT_THREADS_BROKEN_URL_VALIDATOR)
-
-        result = 0
-        for res, _ in async_results:
-            result += res
-
-        # Guardo el resultado una vez calculado
-        self.download_url_ok = result
-        return result
-
     def _get_dataset_percentage(self, indicator):
         total = self.datasets_cant()
         if not total:
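The DistributionDownloadUrlsValidator used above is added elsewhere in this changeset (its module is among the 23 changed files, but its hunks are not shown here). From the call sites — constructed with the catalog and the flag, with validate() returning the count of working download URLs — a plausible reconstruction:

# Hypothetical sketch, not the actual module from this PR.
import requests
from requests import RequestException

class DistributionDownloadUrlsValidator(object):

    def __init__(self, catalog, verify_ssl=True):
        self.catalog = catalog
        self.verify_ssl = verify_ssl

    def validate(self):
        # Count distribution downloadURLs that answer a HEAD request
        # successfully, honoring the SSL verification setting.
        ok = 0
        for dataset in self.catalog.get('dataset', []):
            for distribution in dataset.get('distribution', []):
                url = distribution.get('downloadURL', '')
                try:
                    response = requests.head(
                        url, timeout=1, verify=self.verify_ssl)
                    ok += int(response.status_code < 400)
                except RequestException:
                    pass
        return ok
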
