Merge pull request #280 from datosgobar/277-indicadores-status-downloadURL

277 indicadores status download url
lrromero committed Sep 24, 2019
2 parents e2bf018 + 6a10725 commit 841899f
Showing 10 changed files with 397 additions and 74 deletions.
2 changes: 2 additions & 0 deletions pydatajson/constants.py
@@ -1,2 +1,4 @@
REQUESTS_TIMEOUT = 30
DEFAULT_TIMEZONE = "America/Buenos_Aires"

VALID_STATUS_CODES = [200, 203, 302]
File renamed without changes.
12 changes: 12 additions & 0 deletions pydatajson/helpers.py
@@ -15,12 +15,16 @@
import tempfile

from contextlib import contextmanager

import requests
from openpyxl import load_workbook
from requests import RequestException
from six.moves.urllib_parse import urlparse

from six import string_types, iteritems
from unidecode import unidecode

from pydatajson.constants import VALID_STATUS_CODES
from pydatajson.download import download_to_file

logger = logging.getLogger('pydatajson.helpers')
@@ -561,3 +565,11 @@ def fields_to_uppercase(fields):
uppercase_fields[upper_key] = lowercase_counts + uppercase_counts

return uppercase_fields


def is_working_url(url):
try:
response = requests.head(url, timeout=3)
return response.status_code in VALID_STATUS_CODES
except RequestException:
return False
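
A minimal usage sketch of the new helper; the URL is hypothetical and used only for illustration:

from pydatajson.helpers import is_working_url

# Hypothetical URL. is_working_url sends a HEAD request with a 3 second
# timeout; any status code in VALID_STATUS_CODES (200, 203, 302) counts as
# a working URL, and any RequestException (timeout, DNS failure) returns False.
if is_working_url("https://example.com/data.csv"):
    print("downloadURL is reachable")
else:
    print("downloadURL is broken or timed out")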
55 changes: 17 additions & 38 deletions pydatajson/indicators.py
@@ -21,8 +21,8 @@
from pydatajson.helpers import fields_to_uppercase
from . import helpers
from . import readers
from .indicator_generators import FederationIndicatorsGenerator
from .reporting import generate_datasets_summary
from .federation_indicators_generator import FederationIndicatorsGenerator
from pydatajson.status_indicators_generator import StatusIndicatorsGenerator
from .search import get_datasets, get_distributions

CENTRAL_CATALOG = "http://datos.gob.ar/data.json"
@@ -286,49 +286,28 @@ def _generate_status_indicators(catalog, validator=None):
'datasets_con_datos_pct': None
}
try:
summary = generate_datasets_summary(catalog, validator=validator)
generator = StatusIndicatorsGenerator(catalog, validator=validator)
except Exception as e:
msg = u'Error generando resumen del catálogo {}: {}'.format(
catalog['title'], str(e))
logger.warning(msg)
return result

cant_ok = 0
cant_error = 0
cant_data = 0
cant_without_data = 0
cant_distribuciones = 0
datasets_total = len(summary)
for dataset in summary:
cant_distribuciones += dataset['cant_distribuciones']

# check whether the dataset has data
if dataset['tiene_datos'] == "SI":
cant_data += 1
else: # == "ERROR"
cant_without_data += 1

# check the metadata status
if dataset['estado_metadatos'] == "OK":
cant_ok += 1
else: # == "ERROR"
cant_error += 1

datasets_ok_pct = 0
datasets_with_data_pct = 0
if datasets_total:
datasets_ok_pct = round(float(cant_ok) / datasets_total, 4)
datasets_with_data_pct = round(float(cant_data) / datasets_total, 4)

result.update({
'datasets_cant': datasets_total,
'distribuciones_cant': cant_distribuciones,
'datasets_meta_ok_cant': cant_ok,
'datasets_meta_error_cant': cant_error,
'datasets_meta_ok_pct': datasets_ok_pct,
'datasets_con_datos_cant': cant_data,
'datasets_sin_datos_cant': cant_without_data,
'datasets_con_datos_pct': datasets_with_data_pct
'datasets_cant': generator.datasets_cant(),
'distribuciones_cant': generator.distribuciones_cant(),
'datasets_meta_ok_cant': generator.datasets_meta_ok_cant(),
'datasets_meta_error_cant': generator.datasets_meta_error_cant(),
'datasets_meta_ok_pct': generator.datasets_meta_ok_pct(),
'datasets_con_datos_cant': generator.datasets_con_datos_cant(),
'datasets_sin_datos_cant': generator.datasets_sin_datos_cant(),
'datasets_con_datos_pct': generator.datasets_con_datos_pct(),
'distribuciones_download_url_ok_cant':
generator.distribuciones_download_url_ok_cant(),
'distribuciones_download_url_error_cant':
generator.distribuciones_download_url_error_cant(),
'distribuciones_download_url_ok_pct':
generator.distribuciones_download_url_ok_pct(),

})
return result
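
The three downloadURL indicators added above are derived from one another inside StatusIndicatorsGenerator; a short sketch with hypothetical counts shows the arithmetic:

# Hypothetical counts, only to illustrate how the new keys relate.
distribuciones_cant = 56      # total distributions in the catalog
download_url_ok_cant = 54     # distributions whose downloadURL responds

# error count is the complement of the working count
download_url_error_cant = distribuciones_cant - download_url_ok_cant  # 2

# percentage is working / total, rounded to 4 decimals, None for empty catalogs
download_url_ok_pct = (round(float(download_url_ok_cant) / distribuciones_cant, 4)
                       if distribuciones_cant else None)              # 0.9643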
66 changes: 66 additions & 0 deletions pydatajson/status_indicators_generator.py
@@ -0,0 +1,66 @@
from pydatajson.helpers import is_working_url
from pydatajson.readers import read_catalog
from pydatajson.reporting import generate_datasets_summary


class StatusIndicatorsGenerator(object):

def __init__(self, catalog, validator=None):
self.download_url_ok = None
self.catalog = read_catalog(catalog)
self.summary = generate_datasets_summary(self.catalog,
validator=validator)

def datasets_cant(self):
return len(self.summary)

def distribuciones_cant(self):
return sum(ds['cant_distribuciones'] for ds in self.summary)

def datasets_meta_ok_cant(self):
return sum(ds['estado_metadatos'] == 'OK' for ds in self.summary)

def datasets_meta_error_cant(self):
return sum(ds['estado_metadatos'] == 'ERROR' for ds in self.summary)

def datasets_meta_ok_pct(self):
return self._get_dataset_percentage(self.datasets_meta_ok_cant)

def datasets_con_datos_cant(self):
return sum(ds['tiene_datos'] == 'SI' for ds in self.summary)

def datasets_sin_datos_cant(self):
return sum(ds['tiene_datos'] == 'NO' for ds in self.summary)

def datasets_con_datos_pct(self):
return self._get_dataset_percentage(self.datasets_con_datos_cant)

def distribuciones_download_url_ok_cant(self):
return self.download_url_ok or self._validate_download_urls()

def distribuciones_download_url_error_cant(self):
return self.distribuciones_cant() - \
self.distribuciones_download_url_ok_cant()

def distribuciones_download_url_ok_pct(self):
total = self.distribuciones_cant()
if not total:
return None
return \
round(float(self.distribuciones_download_url_ok_cant()) / total, 4)

def _validate_download_urls(self):
result = 0
for dataset in self.catalog.get('dataset', []):
for distribution in dataset.get('distribution', []):
valid = is_working_url(distribution.get('downloadURL', ''))
result += valid
# Cache the result once it has been computed
self.download_url_ok = result
return result

def _get_dataset_percentage(self, indicator):
total = self.datasets_cant()
if not total:
return None
return round(float(indicator()) / total, 4)
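
A usage sketch of the new generator, assuming a local catalog file (the path is hypothetical; the tests below pass file paths the same way). The count and percentage methods read the precomputed datasets summary, while the downloadURL methods issue one HEAD request per distribution on first use and cache the result:

from pydatajson.status_indicators_generator import StatusIndicatorsGenerator

# Hypothetical path to a data.json catalog; read_catalog parses it on init.
generator = StatusIndicatorsGenerator("samples/catalogo_justicia.json")

print(generator.datasets_cant())                        # number of datasets
print(generator.datasets_meta_ok_pct())                 # e.g. 0.9375, or None if empty
print(generator.distribuciones_download_url_ok_cant())  # triggers the HEAD checks
print(generator.distribuciones_download_url_ok_pct())   # working / total, 4 decimals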
1 change: 1 addition & 0 deletions requirements_dev.txt
@@ -15,3 +15,4 @@ pycallgraph
setuptools>=38.6
wheel>=0.31
vcrpy
requests_mock
160 changes: 160 additions & 0 deletions tests/status_indicators_generator_tests.py
@@ -0,0 +1,160 @@
# -*- coding: utf-8 -*-

import os.path
import unittest
import re

import requests_mock

from pydatajson.status_indicators_generator import StatusIndicatorsGenerator

try:
import mock
except ImportError:
from unittest import mock


class TestStatusIndicatorsGeneratorTestCase(unittest.TestCase):
SAMPLES_DIR = os.path.join("tests", "samples")

@classmethod
def get_sample(cls, sample_filename):
return os.path.join(cls.SAMPLES_DIR, sample_filename)

@classmethod
def setUpClass(cls):
cls.gen_justicia = StatusIndicatorsGenerator(
cls.get_sample('catalogo_justicia.json'))
cls.gen_full_data = StatusIndicatorsGenerator(
cls.get_sample('full_data.json'))
cls.gen_empty = StatusIndicatorsGenerator(
cls.get_sample('invalid_catalog_empty.json'))

def test_just_datasets_cant(self):
self.assertEqual(16, self.gen_justicia.datasets_cant())

def test_full_datasets_cant(self):
self.assertEqual(2, self.gen_full_data.datasets_cant())

def test_empty_datasets_cant(self):
self.assertEqual(0, self.gen_empty.datasets_cant())

def test_just_distribuciones_cant(self):
self.assertEqual(56, self.gen_justicia.distribuciones_cant())

def test_full_distribuciones_cant(self):
self.assertEqual(2, self.gen_full_data.distribuciones_cant())

def test_empty_distribuciones_cant(self):
self.assertEqual(0, self.gen_empty.distribuciones_cant())

def test_just_datasets_meta_ok_cant(self):
self.assertEqual(15, self.gen_justicia.datasets_meta_ok_cant())

def test_full_datasets_meta_ok_cant(self):
self.assertEqual(2, self.gen_full_data.datasets_meta_ok_cant())

def test_empty_datasets_meta_ok_cant(self):
self.assertEqual(0, self.gen_empty.datasets_meta_ok_cant())

def test_just_datasets_meta_error_cant(self):
self.assertEqual(1, self.gen_justicia.datasets_meta_error_cant())

def test_full_datasets_meta_error_cant(self):
self.assertEqual(0, self.gen_full_data.datasets_meta_error_cant())

def test_empty_datasets_meta_error_cant(self):
self.assertEqual(0, self.gen_empty.datasets_meta_error_cant())

def test_just_datasets_meta_ok_pct(self):
self.assertEqual(0.9375, self.gen_justicia.datasets_meta_ok_pct())

def test_full_datasets_meta_ok_pct(self):
self.assertEqual(1.0, self.gen_full_data.datasets_meta_ok_pct())

def test_empty_datasets_meta_ok_pct(self):
self.assertEqual(None, self.gen_empty.datasets_meta_ok_pct())

def test_just_datasets_con_datos_cant(self):
self.assertEqual(16, self.gen_justicia.datasets_con_datos_cant())

def test_full_datasets_con_datos_cant(self):
self.assertEqual(1, self.gen_full_data.datasets_con_datos_cant())

def test_empty_datasets_con_datos_cant(self):
self.assertEqual(0, self.gen_empty.datasets_con_datos_cant())

def test_just_datasets_sin_datos_cant(self):
self.assertEqual(0, self.gen_justicia.datasets_sin_datos_cant())

def test_full_datasets_sin_datos_cant(self):
self.assertEqual(1, self.gen_full_data.datasets_sin_datos_cant())

def test_empty_datasets_sin_datos_cant(self):
self.assertEqual(0, self.gen_empty.datasets_sin_datos_cant())

def test_just_datasets_con_datos_pct(self):
self.assertEqual(1, self.gen_justicia.datasets_con_datos_pct())

def test_full_datasets_con_datos_pct(self):
self.assertEqual(0.5, self.gen_full_data.datasets_con_datos_pct())

def test_empty_datasets_con_datos_pct(self):
self.assertEqual(None, self.gen_empty.datasets_con_datos_pct())

@requests_mock.Mocker()
def test_just_distribuciones_download_url_ok_cant(self, req_mock):
req_mock.head(requests_mock.ANY, text='resp')
self.assertEqual(
56, self.gen_justicia.distribuciones_download_url_ok_cant())

@requests_mock.Mocker()
def test_full_distribuciones_download_url_ok_cant(self, req_mock):
req_mock.head(re.compile('/convocatorias-abiertas-anio-2015.pdf'),
status_code=404)
req_mock.head(re.compile('/convocatorias-abiertas-anio-2015.csv'),
status_code=200)
self.assertEqual(
1, self.gen_full_data.distribuciones_download_url_ok_cant())

def test_empty_distribuciones_download_url_ok_cant(self):
self.assertEqual(
0, self.gen_empty.distribuciones_download_url_ok_cant())

@requests_mock.Mocker()
def test_just_distribuciones_download_url_error_cant(self, req_mock):
req_mock.head(requests_mock.ANY, text='resp')
self.assertEqual(
0, self.gen_justicia.distribuciones_download_url_error_cant())

@requests_mock.Mocker()
def test_full_distribuciones_download_url_error_cant(self, req_mock):
req_mock.head(re.compile('/convocatorias-abiertas-anio-2015.pdf'),
status_code=404)
req_mock.head(re.compile('/convocatorias-abiertas-anio-2015.csv'),
status_code=200)
self.assertEqual(
1, self.gen_full_data.distribuciones_download_url_error_cant())

def test_empty_distribuciones_download_url_error_cant(self):
self.assertEqual(
0, self.gen_empty.distribuciones_download_url_error_cant())

@requests_mock.Mocker()
def test_just_distribuciones_download_url_ok_pct(self, req_mock):
req_mock.head(requests_mock.ANY, text='resp')
self.assertEqual(
1, self.gen_justicia.distribuciones_download_url_ok_pct())

@requests_mock.Mocker()
def test_full_distribuciones_download_url_ok_pct(self, req_mock):
req_mock.head(re.compile('/convocatorias-abiertas-anio-2015.pdf'),
status_code=404)
req_mock.head(re.compile('/convocatorias-abiertas-anio-2015.csv'),
status_code=200)
self.assertEqual(
0.5, self.gen_full_data.distribuciones_download_url_ok_pct())

def test_empty_distribuciones_download_url_ok_pct(self):
self.assertEqual(
None, self.gen_empty.distribuciones_download_url_ok_pct())
12 changes: 9 additions & 3 deletions tests/test_catalog_readme.py
@@ -38,15 +38,19 @@ def setUp(cls):
cls.catalog = cls.get_sample("several_datasets_for_harvest.json")

@my_vcr.use_cassette()
def test_generate_readme(self):
@mock.patch('pydatajson.status_indicators_generator.is_working_url',
return_value=True)
def test_generate_readme(self, _mock_check):
with io.open(os.path.join(self.RESULTS_DIR, "catalog_readme.md"), 'r',
encoding='utf-8') as expected_readme_file:
expected_readme = expected_readme_file.read()
readme = generate_readme(self.catalog)
assert_equal(expected_readme, readme)

@my_vcr.use_cassette()
def test_readme_file_write(self):
@mock.patch('pydatajson.status_indicators_generator.is_working_url',
return_value=True)
def test_readme_file_write(self, _mock_check):
actual_filename = os.path.join(self.TEMP_DIR, "catalog_readme.md")
expected_filename = os.path.join(self.RESULTS_DIR, "catalog_readme.md")
generate_readme(self.catalog, export_path=actual_filename)
@@ -61,8 +65,10 @@ def test_readme_file_write(self):
assert_true(comparison)

@my_vcr.use_cassette()
@mock.patch('pydatajson.status_indicators_generator.is_working_url',
return_value=True)
@mock.patch('pydatajson.indicators._federation_indicators')
def test_readme_null_indicators(self, mock_indicators):
def test_readme_null_indicators(self, mock_indicators, _mock_check):
mock_indicators.return_value = {
'datasets_federados_cant': None,
'datasets_federados_pct': None,
