Skip to content

Commit

Permalink
Merge 16637fe into d5fa1ad
Browse files Browse the repository at this point in the history
  • Loading branch information
lrromero committed Jul 4, 2018
2 parents d5fa1ad + 16637fe commit b364698
Show file tree
Hide file tree
Showing 11 changed files with 30,490 additions and 54 deletions.
2 changes: 1 addition & 1 deletion pydatajson/fields/fields.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"accrualPeriodicity": "requerido",
"issued": "requerido",
"modified": "recomendado",
"identifier": "optativo",
"identifier": "requerido",
"language": "optativo",
"spatial": "optativo",
"temporal": "recomendado",
Expand Down
61 changes: 16 additions & 45 deletions pydatajson/indicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@
import json
import os
from datetime import datetime
from collections import Counter

from six import string_types

from . import helpers
from . import readers
from .reporting import generate_datasets_summary
from .search import get_datasets, get_distributions

CENTRAL_CATALOG = "http://datos.gob.ar/data.json"
ABSOLUTE_PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -124,10 +126,17 @@ def _generate_indicators(catalog, validator=None, only_numeric=False):
_generate_date_indicators(catalog, only_numeric=only_numeric))
# Agrego la cuenta de los formatos de las distribuciones
if not only_numeric:
count = _count_distribution_formats(catalog)
if 'dataset' in catalog:
format_count = count_fields(get_distributions(catalog), 'format')
license_count = count_fields(get_datasets(catalog), 'license')
else:
format_count = license_count = {}

result.update({
'distribuciones_formatos_cant': count
'distribuciones_formatos_cant': format_count,
'datasets_licencias_cant': license_count,
})

# Agrego porcentaje de campos recomendados/optativos usados
fields_count = _count_required_and_optional_fields(catalog)
recomendados_pct = 100 * float(fields_count['recomendado']) / \
Expand Down Expand Up @@ -439,49 +448,6 @@ def _generate_date_indicators(catalog, tolerance=0.2, only_numeric=False):
return result


def _count_distribution_formats(catalog):
    """Tally the 'format' values of the distributions found in a catalog.

    Args:
        catalog (str or dict): path to a catalog, or a Python dict. It is
            passed through ``readers.read_catalog`` before being inspected.

    Returns:
        dict: the distribution formats found as keys, with the number of
            distributions declaring each one (summed over every dataset)
            as values.
    """
    parsed_catalog = readers.read_catalog(catalog)

    totals = {}
    # A catalog without a 'dataset' key contributes no formats.
    for dataset in parsed_catalog.get('dataset', []):
        per_dataset = _count_distribution_formats_dataset(dataset)
        # Fold this dataset's tally into the catalog-wide one.
        for format_name, amount in per_dataset.items():
            totals[format_name] = totals.get(format_name, 0) + amount

    return totals


def _count_distribution_formats_dataset(dataset):
    """Count how many distributions of a dataset declare each 'format'.

    Args:
        dataset (dict): dataset metadata. Its 'distribution' entry, when
            present and not null, is iterated as a list of dicts.

    Returns:
        dict: each non-empty format value found as a key, with the number
            of distributions carrying it as the value. Empty when the
            dataset has no distributions.
    """
    formats = {}
    # A missing or null 'distribution' entry means "no distributions"; it
    # must not raise and abort the tally of the remaining datasets.
    for distribution in dataset.get('distribution') or []:
        # 'format' is recommended, not required: it may be absent or empty.
        distribution_format = distribution.get('format')
        if distribution_format:
            formats[distribution_format] = (
                formats.get(distribution_format, 0) + 1)

    return formats


def _days_from_last_update(catalog, date_field="modified"):
"""Calcula días desde la última actualización del catálogo.
Expand Down Expand Up @@ -699,3 +665,8 @@ def _filter_by_likely_publisher(central_datasets, catalog_datasets):
filtered_central_datasets.append(central_dataset)

return filtered_central_datasets


def count_fields(targets, field):
    """Count how often each value of ``field`` occurs in a list of dicts.

    Args:
        targets (iterable of dict): dictionaries to inspect.
        field (str): key whose values are tallied.

    Returns:
        collections.Counter: each value found under ``field`` as a key,
            with its number of occurrences as the value. Dictionaries that
            lack the key, or hold ``None`` or an empty string under it,
            are not counted.
    """
    # A null or empty value is not a real category, so it is left out of
    # the tally instead of producing a None / '' key in the result.
    return Counter(
        target[field] for target in targets
        if field in target and target[field] not in (None, ''))
15,213 changes: 15,213 additions & 0 deletions tests/cassetes/indicators/test_license_indicators.yaml

Large diffs are not rendered by default.

15,213 changes: 15,213 additions & 0 deletions tests/cassetes/indicators/test_no_licenses_indicators.yaml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/results/minimum_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"dataset": [
{
"status": "OK",
"identifier": null,
"identifier": "1",
"list_index": 0,
"errors": [],
"title": "Sistema de contrataciones electrónicas"
Expand Down
1 change: 1 addition & 0 deletions tests/samples/minimum_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"dataset": [
{
"title": "Sistema de contrataciones electrónicas",
"identifier": "1",
"description": "Datos correspondientes al Sistema de Contrataciones Electrónicas (Argentina Compra)",
"publisher": {
"name": "Ministerio de Modernización. Secretaría de Modernización Administrativa. Oficina Nacional de Contrataciones"
Expand Down
1 change: 1 addition & 0 deletions tests/samples/missing_periodicity.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"title": "Datos Argentina",
"dataset": [
{
"identifier": "1",
"publisher": {
"name": "Ministerio de Modernización. Secretaría de Modernización Administrativa. Oficina Nacional de Contrataciones"
},
Expand Down
5 changes: 5 additions & 0 deletions tests/samples/several_datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
"superThemeTaxonomy": "http://datos.gob.ar/superThemeTaxonomy.json",
"dataset": [
{
"identifier": "1",
"license": "Creative Commons Attribution",
"publisher": {},
"description": "Datos correspondientes al Sistema de Contrataciones Electrónicas (Argentina Compra)",
"superTheme": [
Expand Down Expand Up @@ -50,6 +52,8 @@
]
},
{
"identifier": "2",
"license": "Open Data Commons Open Database License (ODbL)",
"publisher": {
"name": "Ministerio de Modernización. Secretaría de Modernización Administrativa. Oficina Nacional de Contrataciones"
},
Expand All @@ -73,6 +77,7 @@
]
},
{
"identifier": "3",
"publisher": {
"name": "Ministerio de Modernización. Secretaría de Modernización Administrativa. Oficina Nacional de Contrataciones"
},
Expand Down
3 changes: 3 additions & 0 deletions tests/samples/several_datasets_for_harvest.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"title": "Cosechando Datos Argentina",
"dataset": [
{
"identifier": "1",
"publisher": {},
"description": "Datos correspondientes al Sistema de Contrataciones Electrónicas (Argentina Compra)",
"superTheme": [
Expand Down Expand Up @@ -47,6 +48,7 @@
]
},
{
"identifier": "2",
"publisher": {
"name": "Ministerio de Modernización. Secretaría de Modernización Administrativa. Oficina Nacional de Contrataciones"
},
Expand All @@ -68,6 +70,7 @@
]
},
{
"identifier": "3",
"publisher": {
"name": "Ministerio de Modernización. Secretaría de Modernización Administrativa. Oficina Nacional de Contrataciones"
},
Expand Down
6 changes: 3 additions & 3 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,19 +553,19 @@ def test_generate_datasets_summary(self):
expected = [
OrderedDict([('indice', 0),
('titulo', 'Sistema de contrataciones electrónicas UNO'),
('identificador', None),
('identificador', '1'),
('estado_metadatos', 'ERROR'),
('cant_errores', 4),
('cant_distribuciones', 4)]),
OrderedDict([('indice', 1),
('titulo', 'Sistema de contrataciones electrónicas DOS'),
('identificador', None),
('identificador', '2'),
('estado_metadatos', 'OK'),
('cant_errores', 0),
('cant_distribuciones', 1)]),
OrderedDict([('indice', 2),
('titulo', 'Sistema de contrataciones electrónicas TRES'),
('identificador', None),
('identificador', '3'),
('estado_metadatos', 'OK'),
('cant_errores', 0),
('cant_distribuciones', 1)])]
Expand Down
37 changes: 33 additions & 4 deletions tests/test_indicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
record_mode='once')


class IndicatorsTestCase(object):
class TestIndicatorsTestCase(object):
SAMPLES_DIR = os.path.join("tests", "samples")
RESULTS_DIR = RESULTS_DIR
TEMP_DIR = os.path.join("tests", "temp")
Expand Down Expand Up @@ -59,7 +59,7 @@ def test_generate_catalog_indicators(self):
}

for k, v in expected.items():
assert_true(indicators[k], v)
assert_equal(indicators[k], v)

@my_vcr.use_cassette()
def test_date_indicators(self):
Expand Down Expand Up @@ -101,6 +101,29 @@ def test_format_indicators(self):
for k, v in expected.items():
assert_equal(indicators[k], v)

@my_vcr.use_cassette()
def test_license_indicators(self):
    # The per-license dataset count must match the licenses declared in
    # the several_datasets sample.
    sample_path = os.path.join(self.SAMPLES_DIR, "several_datasets.json")
    catalog_indicators = self.dj.generate_catalogs_indicators(
        sample_path)[0][0]

    expected_licenses = {
        'Creative Commons Attribution': 1,
        'Open Data Commons Open Database License (ODbL)': 1,
    }
    assert_equal(
        catalog_indicators['datasets_licencias_cant'], expected_licenses)

@my_vcr.use_cassette()
def test_no_licenses_indicators(self):
    # The datasets in this sample have no licenses, so the count is empty.
    sample_path = os.path.join(
        self.SAMPLES_DIR, "several_datasets_for_harvest.json")
    catalog_indicators = self.dj.generate_catalogs_indicators(
        sample_path)[0][0]
    license_counts = catalog_indicators['datasets_licencias_cant']
    assert_equal(license_counts, {})

@my_vcr.use_cassette()
def test_field_indicators_on_min_catalog(self):
catalog = os.path.join(self.SAMPLES_DIR, "minimum_data.json")
Expand Down Expand Up @@ -237,8 +260,13 @@ def test_network_indicators(self):
'XLSX': 1,
'PDF': 2
},
'campos_optativos_pct': 33.33,
'campos_recomendados_pct': 50.72,
'datasets_licencias_cant': {
'Open Data Commons Open Database License 1.0': 2,
'Creative Commons Attribution': 1,
'Open Data Commons Open Database License (ODbL)': 1
},
'campos_optativos_pct': 32.56,
'campos_recomendados_pct': 52.17,
}

for k, v in expected.items():
Expand Down Expand Up @@ -293,6 +321,7 @@ def test_indicators_missing_dataset(self):
'datasets_desactualizados_cant': 0,
'datasets_actualizados_pct': 0,
'distribuciones_formatos_cant': {},
'datasets_licencias_cant': {},
'datasets_frecuencia_cant': {}
}

Expand Down

0 comments on commit b364698

Please sign in to comment.