[resources][m]: avoid duplication of original resources - fixes #40

datopian · Mar 14, 2018 · 38b2a2e · 38b2a2e
1 parent 374ce4b
commit 38b2a2e
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 17 deletions.
diff --git a/planner/nodes/basic_nodes.py b/planner/nodes/basic_nodes.py
@@ -57,14 +57,19 @@ def __init__(self, available_artifacts, outputs):
     def get_artifacts(self):
         for artifact in self.available_artifacts:
             if artifact.datahub_type == 'original' or artifact.datahub_type == 'source/tabular':
+                resource_name = artifact.resource_name
+                # Avoid duplication: a resource whose name ends with '_original'
+                # and whose type is 'original' is identical to source/tabular
+                if resource_name.endswith('_original') and artifact.datahub_type == 'original':
+                    continue
                 output = ProcessingArtifact(
                     artifact.datahub_type, artifact.resource_name,
                     [], [artifact],
                     [('assembler.update_resource',
                       {
-                          'name': artifact.resource_name,
+                          'name': resource_name,
                           'update': {
-                              'name': artifact.resource_name,
+                              'name': resource_name,
                               'datahub': {
                                 'type': artifact.datahub_type
                               }

diff --git a/planner/nodes/output_nodes.py b/planner/nodes/output_nodes.py
@@ -19,10 +19,13 @@ def get_artifacts(self):
                 random.randrange(1000), out_file))
             datahub_type = 'derived/{}'.format(self.fmt)
             resource_name = out_file.replace('.', '_')
-
+            # Exclude source/tabular: in the zip it would duplicate derived/csv
+            artifacts = [
+                a for a in self.available_artifacts if a.datahub_type != 'source/tabular'
+            ]
             output = ProcessingArtifact(
                 datahub_type, resource_name,
-                [], self.available_artifacts,
+                [], artifacts,
                 [('assembler.extract_readme', {}),
                  ('assembler.remove_hash', {}),
                  ('dump.to_zip', {

diff --git a/planner/nodes/planner.py b/planner/nodes/planner.py
@@ -38,7 +38,6 @@ def planner(datapackage_input, prefix, processing, outputs, allowed_types=None):
 
     # Add types for all resources
     resource_mapping = parameters.get('resource-mapping', {})
-    tabular_info = []
     for descriptor in resource_info:
         path = descriptor['path']
         name = descriptor['name']
@@ -73,21 +72,14 @@ def planner(datapackage_input, prefix, processing, outputs, allowed_types=None):
                 del descriptor['schema']
                 descriptor['geojsonSchema'] = schema
 
-        if 'schema' in descriptor:
-            tabular_descriptor = deepcopy(descriptor)
-
-            tabular_descriptor['datahub'] = {
-                'type': 'source/tabular'
-            }
-            tabular_info.append(tabular_descriptor)
-
+        descriptor['path'] = os.path.join('archive', '{}.{}'.format(name, extension))
         descriptor['datahub'] = {
             'type': 'original'
         }
-        descriptor['path'] = os.path.join('archive', '{}.{}'.format(name, extension))
-        descriptor['name'] += '_original'
-
-    resource_info.extend(tabular_info)
+        if 'schema' in descriptor:
+            descriptor['datahub'] = {
+                'type': 'source/tabular'
+            }
 
     # Processing on resources
     processed_resources = set(p['input'] for p in processing)
@@ -103,6 +95,8 @@ def planner(datapackage_input, prefix, processing, outputs, allowed_types=None):
 
         for p in processing:
             if p['input'] == ri['name']:
+                # keep original resource without processing steps for zip
+                ri['name'] += '_original'
                 ri_ = deepcopy(ri)
                 if 'tabulator' in p:
                     ri_.update(p['tabulator'])