Skip to content

Commit

Permalink
Merge 81fa141 into 9eec443
Browse files Browse the repository at this point in the history
  • Loading branch information
lrromero committed Apr 1, 2019
2 parents 9eec443 + 81fa141 commit 59c5582
Show file tree
Hide file tree
Showing 8 changed files with 994 additions and 174 deletions.
6 changes: 4 additions & 2 deletions pydatajson/core.py
Expand Up @@ -1063,10 +1063,12 @@ def _extract_datasets_to_harvest(cls, report):
return datasets_to_harvest

def generate_catalogs_indicators(self, catalogs=None,
                                 central_catalog=None,
                                 identifier_search=False):
    """Genera indicadores para uno o varios catálogos.

    Args:
        catalogs: catálogo(s) a analizar; por defecto, esta instancia.
        central_catalog: catálogo central contra el cual calcular los
            indicadores de federación, si corresponde.
        identifier_search (bool): si es True, la búsqueda de datasets
            federados se hace por identifier en lugar de por título.

    Returns:
        El resultado de ``indicators.generate_catalogs_indicators``.
    """
    # Sin catálogos explícitos, se analiza la propia instancia
    catalogs = catalogs or self
    return indicators.generate_catalogs_indicators(
        catalogs, central_catalog, identifier_search=identifier_search,
        validator=self.validator)

def _count_fields_recursive(self, dataset, fields):
"""Cuenta la información de campos optativos/recomendados/requeridos
Expand Down
103 changes: 103 additions & 0 deletions pydatajson/helpers.py
Expand Up @@ -441,3 +441,106 @@ def is_local_andino_resource(catalog, distribution):
if homepage is not None:
return distribution.get('downloadURL', '').startswith(homepage)
return False


def datasets_equal(dataset, other, fields_dataset=None,
                   fields_distribution=None, return_diff=False):
    """Función de igualdad de dos datasets: se consideran iguales si
    los valores de los campos comparados son iguales en ambos.

    Por defecto se comparan 'title' y 'publisher.name'.

    Args:
        dataset (dict): un dataset, generado por la lectura de un catálogo
        other (dict): idem anterior
        fields_dataset (list): campos del dataset a comparar; un campo
            anidado se escribe como lista de claves
            (ej. ['publisher', 'name'])
        fields_distribution (list): campos de las distribuciones a
            comparar; si se pasa, se comparan las distribuciones de a
            pares (en orden)
        return_diff (bool): si es True devuelve la lista de diferencias
            en lugar del booleano
    Returns:
        bool o list: True si son iguales, False en caso contrario; o la
            lista de diferencias si return_diff es True
    """
    dataset_is_equal = True
    dataset_diff = []

    # Campos a comparar. Si es un campo anidado escribirlo como lista
    if not fields_dataset:
        fields_dataset = [
            'title',
            ['publisher', 'name']
        ]

    for field_dataset in fields_dataset:
        if isinstance(field_dataset, list):
            value = traverse_dict(dataset, field_dataset)
            other_value = traverse_dict(other, field_dataset)
        else:
            value = dataset.get(field_dataset)
            other_value = other.get(field_dataset)

        if value != other_value:
            dataset_diff.append({
                "error_location": field_dataset,
                "dataset_value": value,
                "other_value": other_value
            })
            dataset_is_equal = False

    if fields_distribution:
        # Default a lista vacía: un dataset puede no tener distribuciones
        # y len()/zip() fallarían con None
        dataset_distributions = dataset.get("distribution") or []
        other_distributions = other.get("distribution") or []

        if len(dataset_distributions) != len(other_distributions):
            logger.info("{} distribuciones en origen y {} en destino".format(
                len(dataset_distributions), len(other_distributions)))
            dataset_is_equal = False

        distributions_equal = True
        # zip trunca al más corto: solo se comparan pares existentes
        for dataset_distribution, other_distribution in zip(
                dataset_distributions, other_distributions):

            for field_distribution in fields_distribution:
                if isinstance(field_distribution, list):
                    value = traverse_dict(
                        dataset_distribution, field_distribution)
                    other_value = traverse_dict(
                        other_distribution, field_distribution)
                else:
                    value = dataset_distribution.get(field_distribution)
                    other_value = other_distribution.get(field_distribution)

                if value != other_value:
                    dataset_diff.append({
                        "error_location": "{} ({})".format(
                            field_distribution,
                            dataset_distribution.get("title")
                        ),
                        "dataset_value": value,
                        "other_value": other_value
                    })
                    distributions_equal = False

        if not distributions_equal:
            dataset_is_equal = False

    if return_diff:
        return dataset_diff
    else:
        return dataset_is_equal


def filter_by_likely_publisher(central_datasets, catalog_datasets):
    """Filtra los datasets del catálogo central, dejando los que tienen
    un publisher con el mismo nombre que alguno del catálogo analizado.

    Args:
        central_datasets (list): datasets del catálogo central
        catalog_datasets (list): datasets del catálogo analizado
    Returns:
        list: datasets centrales con publisher probable
    """
    # set para membresía O(1) en el loop siguiente
    publisher_names = {
        catalog_dataset["publisher"]["name"]
        for catalog_dataset in catalog_datasets
        if "name" in catalog_dataset.get("publisher", {})
    }

    filtered_central_datasets = []
    for central_dataset in central_datasets:
        # .get evita un KeyError si el dataset no tiene 'publisher'
        publisher = central_dataset.get("publisher", {})
        if "name" in publisher and publisher["name"] in publisher_names:
            filtered_central_datasets.append(central_dataset)

    return filtered_central_datasets


def title_in_dataset_list(dataset, dataset_list):
    """Indica si el par (title, landingPage) del dataset está presente
    en la lista de pares dada."""
    key = (dataset.get('title'), dataset.get('landingPage'))
    return key in dataset_list
154 changes: 154 additions & 0 deletions pydatajson/indicator_generators.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Módulo 'indicators' de Pydatajson
Contiene los métodos para monitorear y generar indicadores de un catálogo o de
una red de catálogos.
"""

from __future__ import print_function, absolute_import
from __future__ import unicode_literals, with_statement

from pydatajson.readers import read_catalog
from pydatajson.helpers import datasets_equal, filter_by_likely_publisher
from pydatajson.helpers import title_in_dataset_list


class FederationIndicatorsGenerator(object):
    """Fachada sobre los calculadores de indicadores de federación.

    Delega cada indicador en un calculador basado en identifiers o en
    títulos, elegido en la construcción.
    """

    def __init__(self, central_catalog, catalog, id_based=False):
        calculator_class = (IdBasedIndicatorCalculator if id_based
                            else TitleBasedIndicatorCalculator)
        self.calculator = calculator_class(central_catalog, catalog)

    def datasets_federados(self):
        """Datasets del catálogo presentes también en el central."""
        return self.calculator.datasets_federados()

    def datasets_federados_cant(self):
        """Cantidad de datasets federados."""
        return len(self.calculator.datasets_federados())

    def datasets_no_federados(self):
        """Datasets del catálogo ausentes del central."""
        return self.calculator.datasets_no_federados()

    def datasets_no_federados_cant(self):
        """Cantidad de datasets no federados."""
        return len(self.calculator.datasets_no_federados())

    def distribuciones_federadas_cant(self):
        """Cantidad de distribuciones de los datasets federados."""
        return self.calculator.distribuciones_federadas_cant()

    def datasets_federados_eliminados(self):
        """Datasets federados que ya no figuran en el catálogo."""
        return self.calculator.datasets_federados_eliminados()

    def datasets_federados_eliminados_cant(self):
        """Cantidad de datasets federados eliminados."""
        return len(self.datasets_federados_eliminados())

    def datasets_federados_pct(self):
        """Proporción (0-1, redondeada a 4 decimales) de federados."""
        federated = self.datasets_federados_cant()
        not_federated = self.datasets_no_federados_cant()
        total = federated + not_federated
        # total == 0 devolvería ZeroDivisionError; en ese caso es 0
        pct = float(federated) / total if total else 0
        return round(pct, 4)


class AbstractCalculator(object):
    """Base común de los calculadores de indicadores de federación.

    Lee ambos catálogos y precalcula los datasets centrales cuyo
    publisher también aparece en el catálogo analizado.
    """

    def __init__(self, central_catalog, catalog):
        self.central_catalog = read_catalog(central_catalog)
        self.catalog = read_catalog(catalog)
        central_datasets = self.central_catalog.get('dataset', [])
        catalog_datasets = self.catalog.get('dataset', [])
        self.filtered_central = filter_by_likely_publisher(
            central_datasets, catalog_datasets)

    def datasets_federados(self):
        """Pares (title, landingPage) de los datasets federados."""
        raise NotImplementedError

    def datasets_no_federados(self):
        """Pares (title, landingPage) de los datasets no federados."""
        raise NotImplementedError

    def datasets_federados_eliminados(self):
        """Pares (title, landingPage) de federados ya eliminados."""
        raise NotImplementedError

    def distribuciones_federadas_cant(self):
        """Cantidad de distribuciones de los datasets federados."""
        raise NotImplementedError


class IdBasedIndicatorCalculator(AbstractCalculator):
    """Calcula indicadores de federación apareando identifiers.

    Un dataset del catálogo se considera federado cuando
    '<catalog_id>_<dataset_id>' coincide con un identifier del
    catálogo central.
    """

    def __init__(self, central_catalog, catalog):
        super(IdBasedIndicatorCalculator, self).__init__(central_catalog,
                                                         catalog)
        # Se usan los catálogos parseados por la clase base (self.catalog,
        # self.central_catalog) y no los argumentos crudos: pueden haber
        # sido pasados como rutas o URLs que read_catalog resuelve.
        self.central_datasets = {ds['identifier'] for ds in
                                 self.central_catalog.get('dataset', [])}
        self.catalog_datasets = {self._federation_id(ds) for ds in
                                 self.catalog.get('dataset', [])}
        self.federated_ids = self.catalog_datasets & self.central_datasets

    def _federation_id(self, dataset):
        """Identifier con el que un dataset del catálogo se federa."""
        return self.catalog['identifier'] + '_' + dataset['identifier']

    def distribuciones_federadas_cant(self):
        """Cantidad de distribuciones (lado central) de los federados."""
        return sum(len(ds.get('distribution', [])) for ds in
                   self.central_catalog.get('dataset', [])
                   if ds['identifier'] in self.federated_ids)

    def datasets_federados_eliminados(self):
        """Datasets centrales de publishers probables ya no federados."""
        return [(ds.get('title'), ds.get('landingPage')) for ds in
                self.filtered_central
                if ds['identifier'] not in self.federated_ids]

    def datasets_no_federados(self):
        """Pares (title, landingPage) de los datasets no federados."""
        return [(ds.get('title'), ds.get('landingPage')) for ds in
                self.catalog.get('dataset', [])
                if self._federation_id(ds) not in self.federated_ids]

    def datasets_federados(self):
        """Pares (title, landingPage) de los datasets federados."""
        return [(ds.get('title'), ds.get('landingPage')) for ds in
                self.catalog.get('dataset', [])
                if self._federation_id(ds) in self.federated_ids]


class TitleBasedIndicatorCalculator(AbstractCalculator):
    """Calcula indicadores de federación comparando metadatos.

    Los datasets se aparean entre catálogos con `datasets_equal`
    (por defecto, title y publisher.name) y se identifican por su par
    (title, landingPage).
    """

    def __init__(self, central_catalog, catalog):
        super(TitleBasedIndicatorCalculator, self).__init__(central_catalog,
                                                            catalog)

    def datasets_federados(self):
        """Pares (title, landingPage) de los datasets del catálogo
        encontrados en el catálogo central."""
        datasets_federados = []
        for dataset in self.catalog.get('dataset', []):
            for central_dataset in self.central_catalog.get('dataset', []):
                # El chequeo de pertenencia evita agregar duplicados
                if (datasets_equal(dataset, central_dataset) and not
                        title_in_dataset_list(dataset, datasets_federados)):
                    datasets_federados.append((dataset.get('title'),
                                               dataset.get('landingPage')))
        return datasets_federados

    def datasets_no_federados(self):
        """Pares (title, landingPage) de los datasets no federados."""
        datasets_federados = self.datasets_federados()
        datasets_no_federados = []
        for dataset in self.catalog.get('dataset', []):
            if not title_in_dataset_list(dataset, datasets_federados):
                datasets_no_federados.append((dataset.get('title'),
                                              dataset.get('landingPage')))
        return datasets_no_federados

    def datasets_federados_eliminados(self):
        """Datasets centrales de publishers probables que ya no figuran
        entre los federados."""
        datasets_federados = self.datasets_federados()
        datasets_federados_eliminados = []
        for central_dataset in self.filtered_central:
            if not title_in_dataset_list(central_dataset, datasets_federados):
                datasets_federados_eliminados.append(
                    (central_dataset.get('title'),
                     central_dataset.get('landingPage'))
                )
        return datasets_federados_eliminados

    def distribuciones_federadas_cant(self):
        """Cantidad de distribuciones de los datasets federados."""
        datasets_federados = self.datasets_federados()
        distribuciones_federadas = 0
        for dataset in self.catalog.get('dataset', []):
            if title_in_dataset_list(dataset, datasets_federados):
                # .get con default: un dataset puede no tener
                # distribuciones (consistente con IdBasedIndicatorCalculator)
                distribuciones_federadas += len(
                    dataset.get('distribution', []))
        return distribuciones_federadas

0 comments on commit 59c5582

Please sign in to comment.