Skip to content
This repository has been archived by the owner on May 26, 2021. It is now read-only.

Commit

Permalink
Leave only the chosen algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
georgiana-b committed Aug 5, 2016
1 parent 80e922e commit a9e7eb6
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 15 deletions.
22 changes: 9 additions & 13 deletions data_quality/tasks/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(self, config, **kwargs):
self.initialize_file(self.run_file, self.run_schema.headers)
self.run_id = compat.str(uuid.uuid4().hex)
self.timestamp = datetime.now(pytz.utc)
self.all_scores = []
self.assess_timeliness = self.config['assess_timeliness']
self.timeliness_period = self.config['timeliness'].get('timeliness_period', 1)
self.max_score = 100
Expand Down Expand Up @@ -159,12 +160,13 @@ def get_pipeline_score(self, pipeline, source):
score = 0
else:
if self.scoring_algorithm == 1:
score = self.score_by_affected_rows(error_stats, total_no_rows+1)
elif self.scoring_algorithm == 2:
score = self.score_by_error_occurences(error_stats)
elif self.scoring_algorithm == 2:
score = self.score_by_affected_rows(error_stats, total_no_rows+1)
else:
score = self.score_by_affected_rows(error_stats, total_no_rows+1,
weighted=True)
raise ValueError(('The only options for "scoring_algorithm" are'
' 1 and 2.'))

if self.assess_timeliness:
publication_delay = self.get_publication_delay(source)
score -= publication_delay
Expand Down Expand Up @@ -220,26 +222,20 @@ def get_error_stats(self, report):
error['rows'].append(result['row_index'])
return error_stats

def score_by_affected_rows(self, error_stats, total_no_rows):
    """Score a data source based on the percent of rows affected by each error.

    Algorithm: `total score - (error weight * percent_of_affected_rows)`,
    applied once per error type found in the validation report.

    Args:
        error_stats: dict keyed by error type; each value is a dict holding
            the error's `weight` and the list of affected row indices under
            `rows` (indices may repeat if a row triggers the error twice).
        total_no_rows: number of rows the data source has; assumed > 0
            (the caller passes `total_no_rows + 1` — presumably to guard
            against division by zero on empty sources; TODO confirm).

    Returns:
        The score: `self.max_score` reduced by each error's weighted
        percentage of affected rows. May go below zero for very noisy data.
    """
    score = self.max_score
    # Only the per-error stats are needed; the error-type key is unused.
    for stats in error_stats.values():
        # De-duplicate row indices so a row counts once per error type.
        affected_rows = len(set(stats['rows']))
        affected_rows_percent = (affected_rows * 100) / total_no_rows
        score -= stats['weight'] * affected_rows_percent
    return score

Expand Down
4 changes: 2 additions & 2 deletions tests/tasks/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,19 @@ def test_aggregator_assess_timeliness(self):
def test_aggregate_scoring_affected_rows(self):
    """Test Aggregator scoring based on affected_rows."""
    # NOTE(review): no explicit `scoring_algorithm` override here — this
    # assumes algorithm 1 (score_by_affected_rows) is the config default;
    # TODO confirm against the default config fixture.
    aggregator_task = tasks.Aggregator(self.config)
    url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv'
    pipeline_instance = pipeline.Pipeline(data=url, format='csv',
                                          post_task=aggregator_task.run)
    pipeline_instance.run()
    # The aggregator appends one result row per run; take the latest.
    result = self.read_file_contents(aggregator_task.result_file)[-1]

    # Weighted affected-rows scoring drives this heavily-errored fixture to 0.
    self.assertEqual(int(result['score']), 0)

def tests_aggreate_scoring_occurrences(self):
"""Test Aggregator scoring based on error occurences"""

self.config['scoring_algorithm'] = 2
aggregator_task = tasks.Aggregator(self.config)
url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv'
pipeline_instance = pipeline.Pipeline(data=url, format='csv',
Expand Down

0 comments on commit a9e7eb6

Please sign in to comment.