Skip to content
This repository has been archived by the owner on May 26, 2021. It is now read-only.

Commit

Permalink
Feature/improve ckan generator (#34)
Browse files: browse the repository at this point in the history
* Add default publisher for sources without org
* Use default_schema_package for sources
  • Loading branch information
georgiana-b authored and pwalsh committed Aug 12, 2016
1 parent ec8c0a5 commit 21a4340
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 8 deletions.
30 changes: 23 additions & 7 deletions data_quality/generators/ckan.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from __future__ import unicode_literals

import csv
import requests
from os import path
import requests
import jsontableschema
from data_quality import compat, utilities
from .base import BaseGenerator
Expand All @@ -22,6 +22,7 @@ def __init__(self, url=None, datapackage=None):
"""

super(CkanGenerator, self).__init__(url, datapackage)
self.default_publisher = None

def generate_sources(self, sources_filepath, file_types=['csv', 'excel']):
"""Generates sources_file from the url"""
Expand Down Expand Up @@ -55,13 +56,22 @@ def get_sources(self):
response.raise_for_status()
data = response.json()
count = data['result']['count']
all_data = []
all_packages = []
all_sources = []
for start in range(0, count, 500):
payload = {'rows': 500, 'start': start}
response = requests.get(full_url, params=payload)
data = response.json()
all_data += data['result']['results']
return all_data
all_packages += [result['id'] for result in data['result']['results']]

for package_id in all_packages:
ext = 'api/3/action/package_show'
full_package_url = compat.urljoin(self.base_url, ext)
package_payload = {'use_default_schema': True, 'id': package_id}
response = requests.get(full_package_url, params=package_payload)
data = response.json()
all_sources.append(data['result'])
return all_sources

def extract_sources(self, datum, file_types):
"""Extract all sources for one result"""
Expand All @@ -75,8 +85,13 @@ def extract_sources(self, datum, file_types):
file_types = ['excel' if ext in ['xls', 'xlsx'] else ext for ext in file_types]
file_types.append('')
if new_resource['format'] in file_types:
publisher = datum.get('organization', {})
new_resource['publisher_id'] = publisher.get('name')
publisher = datum.get('organization', None)
if publisher:
new_resource['publisher_id'] = publisher.get('name')
else:
self.default_publisher = {'name': 'no_organization',
'display_name': 'No Organization'}
new_resource['publisher_id'] = self.default_publisher['name']
new_resource['id'] = resource['id']
new_resource['created_at'] = resource['created']
title = datum.get('title', '')
Expand All @@ -89,6 +104,8 @@ def generate_publishers(self, publishers_filepath):
"""Generates publisher_file from the url"""

results = self.get_publishers()
if self.default_publisher:
results.append(self.default_publisher)
pub_resource = utilities.get_datapackage_resource(publishers_filepath,
self.datapackage)
pub_schema = jsontableschema.model.SchemaModel(pub_resource.descriptor['schema'])
Expand Down Expand Up @@ -133,4 +150,3 @@ def extract_publisher(self, result):
if key == 'category':
publisher['type'] = extra.get('value')
return publisher

2 changes: 1 addition & 1 deletion data_quality/tasks/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def run(self, generator_name, endpoint, generator_path, file_types, simulate=Fal
if simulate:
return generator

generator.generate_publishers(self.publisher_file)
generator.generate_sources(self.source_file, file_types=file_types)
generator.generate_publishers(self.publisher_file)

def update_datapackage_sources(self):
"""Update the 'sources' property of datapackage with the new sources"""
Expand Down

0 comments on commit 21a4340

Please sign in to comment.