Merge pull request #259 from datosgobar/253-config-verify-ssl
Parametrize SSL verification and timeouts within DataJson
lucaslavandeira committed May 28, 2019
2 parents 66bda5e + f7ab1b3 commit 6378198
Showing 3 changed files with 35 additions and 17 deletions.
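As a usage note, here is a minimal sketch of how the new constructor parameters could be passed. The catalog URL is a placeholder and the values shown are illustrative assumptions; per this diff, the defaults remain verify_ssl=False and requests_timeout=constants.REQUESTS_TIMEOUT (30 seconds).

from pydatajson import DataJson

# Hypothetical catalog URL, for illustration only.
dj = DataJson(
    "https://example.org/data.json",
    verify_ssl=True,       # default stays False, as in this diff
    requests_timeout=10,   # overrides constants.REQUESTS_TIMEOUT (30 seconds)
)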
1 change: 1 addition & 0 deletions pydatajson/constants.py
@@ -0,0 +1 @@
REQUESTS_TIMEOUT = 30
28 changes: 19 additions & 9 deletions pydatajson/core.py
@@ -25,7 +25,7 @@
from pydatajson.response_formatters import format_response
from pydatajson.validation import Validator, \
DEFAULT_CATALOG_SCHEMA_FILENAME, ABSOLUTE_SCHEMA_DIR
from . import documentation
from . import documentation, constants
from . import helpers
from . import indicators
from . import readers
@@ -53,7 +53,8 @@ class DataJson(dict):

def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
default_values=None, catalog_format=None,
validator_class=Validator):
validator_class=Validator, verify_ssl=False,
requests_timeout=constants.REQUESTS_TIMEOUT):
"""Reads a catalog and creates an object with functions to manipulate it.
Unless otherwise indicated, the schema used by default is the
@@ -78,13 +79,17 @@ def __init__(self, catalog=None, schema_filename=None, schema_dir=None,
"distribution_issued": "2017-06-22"
}
"""
self.verify_ssl = verify_ssl
self.requests_timeout = requests_timeout
# build the DataJson object with a dictionary interface
if catalog:

# read any catalog representation into a dictionary
catalog = readers.read_catalog(catalog,
default_values=default_values,
catalog_format=catalog_format)
catalog_format=catalog_format,
verify=self.verify_ssl,
timeout=self.requests_timeout)

# copy every attribute from the dictionary into the object
for key, value in iteritems(catalog):
@@ -242,7 +247,7 @@ def is_valid_catalog(self, catalog=None):
Returns:
bool: True if the data.json complies with the schema, otherwise False.
"""
catalog = readers.read_catalog(catalog) if catalog else self
catalog = self._read_catalog(catalog) if catalog else self
return self.validator.is_valid(catalog)

@staticmethod
@@ -333,7 +338,7 @@ def validate_catalog(self, catalog=None, only_errors=False, fmt="dict",
"message", "validator", "validator_value", "error_code".
"""
catalog = readers.read_catalog(catalog) if catalog else self
catalog = self._read_catalog(catalog) if catalog else self

validation = self.validator.validate_catalog(catalog, only_errors)
if export_path:
@@ -562,7 +567,7 @@ def catalog_report(self, catalog, harvest='none', report=None,
"""

url = catalog if isinstance(catalog, string_types) else None
catalog = readers.read_catalog(catalog)
catalog = self._read_catalog(catalog)

validation = self.validate_catalog(catalog)
catalog_validation = validation["error"]["catalog"]
@@ -799,7 +804,7 @@ def generate_harvestable_catalogs(self, catalogs, harvest='all',
if isinstance(catalogs, string_types + (dict,)):
catalogs = [catalogs]

harvestable_catalogs = [readers.read_catalog(c) for c in catalogs]
harvestable_catalogs = [self._read_catalog(c) for c in catalogs]
catalogs_urls = [catalog if isinstance(catalog, string_types)
else None for catalog in catalogs]

@@ -871,7 +876,7 @@ def generate_datasets_summary(self, catalog, export_path=None):
list: Contains as many dicts as there are datasets present in
`catalogs`, with the data mentioned above.
"""
catalog = readers.read_catalog(catalog)
catalog = self._read_catalog(catalog)

# Try to read every well-formed dataset from the list
# catalog["dataset"], if it exists.
@@ -1012,7 +1017,7 @@ def _count_fields_recursive(self, dataset, fields):
return key_count

def dataset_is_updated(self, catalog, dataset):
catalog = readers.read_catalog(catalog)
catalog = self._read_catalog(catalog)

for catalog_dataset in catalog.get('dataset', []):
if catalog_dataset.get('title') == dataset:
@@ -1093,6 +1098,11 @@ def make_catalogs_backup(self, catalogs=None,
# TODO: implement function
pass

def _read_catalog(self, catalog):
return readers.read_catalog(catalog,
verify=self.verify_ssl,
timeout=self.requests_timeout)


def main():
"""Allows running the module from the command line.
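The new _read_catalog() helper threads the instance's verify_ssl and requests_timeout into every internal readers.read_catalog() call, so methods that accept a catalog URL pick up those settings automatically. A brief sketch under that reading, with a placeholder URL:

from pydatajson import DataJson

dj = DataJson(verify_ssl=True, requests_timeout=5)

# Both calls fetch the remote catalog through DataJson._read_catalog(),
# so the request uses the instance's SSL verification and timeout settings.
dj.is_valid_catalog("https://example.org/data.json")
report = dj.validate_catalog("https://example.org/data.json")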
23 changes: 15 additions & 8 deletions pydatajson/readers.py
@@ -27,7 +27,7 @@

import pydatajson
from . import custom_exceptions as ce
from . import helpers
from . import helpers, constants
from .ckan_reader import read_ckan_catalog

import urllib3
Expand All @@ -52,7 +52,8 @@ def read_catalog_obj(catalog):
return pydatajson.DataJson(catalog)


def read_catalog(catalog, default_values=None, catalog_format=None):
def read_catalog(catalog, default_values=None, catalog_format=None,
verify=False, timeout=constants.REQUESTS_TIMEOUT):
"""Takes any representation of a catalog and returns its internal
representation (a Python dictionary with its metadata).
@@ -86,13 +87,17 @@ def read_catalog(catalog, default_values=None, catalog_format=None):
catalog_format = catalog_format or suffix
if catalog_format == "xlsx":
try:
catalog_dict = read_xlsx_catalog(catalog)
catalog_dict = read_xlsx_catalog(catalog,
verify=verify,
timeout=timeout)
except openpyxl_exceptions + \
(ValueError, AssertionError, IOError, BadZipfile) as e:
raise ce.NonParseableCatalog(catalog, str(e))
elif catalog_format == "json":
try:
catalog_dict = read_json(catalog)
catalog_dict = read_json(catalog,
verify=verify,
timeout=timeout)
except(ValueError, TypeError, IOError) as e:
raise ce.NonParseableCatalog(catalog, str(e))
elif catalog_format == "ckan":
@@ -182,7 +187,8 @@ def _set_default_value(dict_obj, keys, value):
variable[keys[-1]] = value


def read_json(json_path_or_url):
def read_json(json_path_or_url, verify=False,
timeout=constants.REQUESTS_TIMEOUT):
"""Takes the path to a JSON file and returns the dictionary it represents.
The parameter is assumed to be a URL if it starts with 'http' or 'https', or
@@ -200,7 +206,7 @@ def read_json(json_path_or_url):

parsed_url = urlparse(json_path_or_url)
if parsed_url.scheme in ["http", "https"]:
res = requests.get(json_path_or_url, verify=False)
res = requests.get(json_path_or_url, verify=verify, timeout=timeout)
json_dict = json.loads(res.content, encoding='utf-8')

else:
Expand All @@ -218,7 +224,8 @@ def read_json(json_path_or_url):
return json_dict


def read_xlsx_catalog(xlsx_path_or_url, logger=None):
def read_xlsx_catalog(xlsx_path_or_url, logger=None, verify=False,
timeout=constants.REQUESTS_TIMEOUT):
"""Takes the path to a catalog in XLSX format and returns the dictionary
it represents.
@@ -238,7 +245,7 @@ def read_xlsx_catalog(xlsx_path_or_url, logger=None):

parsed_url = urlparse(xlsx_path_or_url)
if parsed_url.scheme in ["http", "https"]:
res = requests.get(xlsx_path_or_url, verify=False)
res = requests.get(xlsx_path_or_url, verify=verify, timeout=timeout)
tmpfilename = ".tmpfile.xlsx"
with io.open(tmpfilename, 'wb') as tmpfile:
tmpfile.write(res.content)
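The readers can also be called directly with the new keyword arguments; verify=False and timeout=constants.REQUESTS_TIMEOUT remain the defaults. A minimal sketch with a placeholder URL and illustrative values:

from pydatajson import readers

catalog_dict = readers.read_catalog(
    "https://example.org/catalog.xlsx",
    catalog_format="xlsx",
    verify=True,   # passed through to requests.get()
    timeout=15,    # seconds; default is constants.REQUESTS_TIMEOUT (30)
)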
