Skip to content
This repository has been archived by the owner on May 26, 2021. It is now read-only.

Commit

Permalink
Leave only the chosen algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
georgiana-b committed Aug 5, 2016
1 parent 80e922e commit a9e7eb6
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 15 deletions.
22 changes: 9 additions & 13 deletions data_quality/tasks/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(self, config, **kwargs):
self.initialize_file(self.run_file, self.run_schema.headers)
self.run_id = compat.str(uuid.uuid4().hex)
self.timestamp = datetime.now(pytz.utc)
self.all_scores = []
self.assess_timeliness = self.config['assess_timeliness']
self.timeliness_period = self.config['timeliness'].get('timeliness_period', 1)
self.max_score = 100
Expand Down Expand Up @@ -159,12 +160,13 @@ def get_pipeline_score(self, pipeline, source):
score = 0
else:
if self.scoring_algorithm == 1:
score = self.score_by_affected_rows(error_stats, total_no_rows+1)
elif self.scoring_algorithm == 2:
score = self.score_by_error_occurences(error_stats)
elif self.scoring_algorithm == 2:
score = self.score_by_affected_rows(error_stats, total_no_rows+1)
else:
score = self.score_by_affected_rows(error_stats, total_no_rows+1,
weighted=True)
raise ValueError(('The only options for "scoring_algorithm" are'
' 1 and 2.'))

if self.assess_timeliness:
publication_delay = self.get_publication_delay(source)
score -= publication_delay
Expand Down Expand Up @@ -220,26 +222,20 @@ def get_error_stats(self, report):
error['rows'].append(result['row_index'])
return error_stats

def score_by_affected_rows(self, error_stats, total_no_rows):
    """Score a data source based on the percent of rows affected by each error.

    Algorithm: `total score - (error weight * percent_of_affected_rows)`,
    applied once per error type found in the validation report.

    Args:
        error_stats: dict keyed by error type; each value is a dict holding
            the error's `weight` and the list of affected row indices under
            `rows` (indices may repeat if a row triggers the error twice).
        total_no_rows: number of rows the data source has; assumed > 0
            (the caller passes `total_no_rows + 1` — presumably to guard
            against division by zero on empty sources; TODO confirm).

    Returns:
        The score: `self.max_score` reduced by each error's weighted
        percentage of affected rows. May go below zero for very noisy data.
    """
    score = self.max_score
    # Only the per-error stats are needed; the error-type key is unused.
    for stats in error_stats.values():
        # De-duplicate row indices so a row counts once per error type.
        affected_rows = len(set(stats['rows']))
        affected_rows_percent = (affected_rows * 100) / total_no_rows
        score -= stats['weight'] * affected_rows_percent
    return score

Expand Down
4 changes: 2 additions & 2 deletions tests/tasks/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,19 @@ def test_aggregator_assess_timeliness(self):
def test_aggregate_scoring_affected_rows(self):
    """Test Aggregator scoring based on affected_rows."""
    # NOTE(review): no explicit `scoring_algorithm` override here — this
    # assumes algorithm 1 (score_by_affected_rows) is the config default;
    # TODO confirm against the default config fixture.
    aggregator_task = tasks.Aggregator(self.config)
    url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv'
    pipeline_instance = pipeline.Pipeline(data=url, format='csv',
                                          post_task=aggregator_task.run)
    pipeline_instance.run()
    # The aggregator appends one result row per run; take the latest.
    result = self.read_file_contents(aggregator_task.result_file)[-1]

    # Weighted affected-rows scoring drives this heavily-errored fixture to 0.
    self.assertEqual(int(result['score']), 0)

def tests_aggreate_scoring_occurrences(self):
"""Test Aggregator scoring based on error occurences"""

self.config['scoring_algorithm'] = 2
aggregator_task = tasks.Aggregator(self.config)
url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv'
pipeline_instance = pipeline.Pipeline(data=url, format='csv',
Expand Down

0 comments on commit a9e7eb6

Please sign in to comment.