Skip to content
This repository has been archived by the owner on Oct 28, 2022. It is now read-only.

Commit

Permalink
Merge pull request #62 from bhagyakjain/Update_column_name
Browse files Browse the repository at this point in the history
Changing name of metric column when summary operator is applied
  • Loading branch information
arijitde-goog committed Aug 14, 2020
2 parents 0c66cb8 + 5e20b24 commit 67ff4e6
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 78 deletions.
8 changes: 7 additions & 1 deletion intents/show.py
Expand Up @@ -32,7 +32,9 @@ def show(table,**kwargs):
If the summary_operator is not None , it groups by dimensions.
If some of the optional args are None (not passed),
it is assumed that we don't have to apply them.
Also, if summary operator is applied, the name of metric column is
renamed to "<summary operator> of metric".
Args:
table: Type-pandas.dataframe
It has the contents of the csv file
Expand Down Expand Up @@ -111,6 +113,7 @@ def show(table,**kwargs):
# To groupby 'Summary Operator' column inserted
dimensions.append('Summary Operator')


after_group_by = aspects.group_by(table, dimensions, summary_operator)

table = after_group_by['table']
Expand All @@ -123,6 +126,9 @@ def show(table,**kwargs):
order = oversights_order.ORDER_IN_SHOW
suggestions = rank_oversights.rank_oversights(suggestions, order)

if summary_operator is not None:
table = aspects.update_metric_column_name(table, summary_operator, metric)

return (table , suggestions)


Expand Down
6 changes: 5 additions & 1 deletion intents/slice_compare.py
Expand Up @@ -33,6 +33,8 @@ def slice_compare(table, metric, all_dimensions, all_metric,
summary_operator, **kwargs):
""" This function returns both the results according to the intent
as well as the debiasing suggestions.
Also, if summary operator is applied, the name of metric column is
renamed to "<summary operator> of metric".
Some of the oversights considered in this intent are-
1. simpson's paradox
Args:
Expand Down Expand Up @@ -146,11 +148,13 @@ def slice_compare(table, metric, all_dimensions, all_metric,
slices = slices)
suggestions += simpsons_paradox_suggestion + top_down_error_suggestion


order = oversights_order.ORDER_IN_SLICE_COMPARE

suggestions = rank_oversights.rank_oversights(suggestions, order)

if summary_operator is not None:
result_table = aspects.update_metric_column_name(result_table, summary_operator, metric)

return (result_table, suggestions)

def _slice_compare_results(table, metric, slice_compare_column,
Expand Down
38 changes: 19 additions & 19 deletions intents/test_show.py
Expand Up @@ -71,11 +71,11 @@ def test_2():
date_column_name='date', date_format='%Y-%m-%d',
summary_operator=SummaryOperators.MEAN)
print(query_result)
expected_result = """ player_of_match win_by_runs
0 KK Nair 7
1 MM Sharma 14
2 SS Iyer 0
3 WP Saha 7"""
expected_result = """ player_of_match MEAN of win_by_runs
0 KK Nair 7
1 MM Sharma 14
2 SS Iyer 0
3 WP Saha 7"""

expected_suggestions = "[]"

Expand Down Expand Up @@ -237,8 +237,8 @@ def test_7():
date_column_name='date', date_format='%Y-%m-%d',
summary_operator=SummaryOperators.SUM)
print(query_result)
expected_result = """ Summary Operator win_by_runs
0 SUM 8702"""
expected_result = """ Summary Operator SUM of win_by_runs
0 SUM 8702"""

expected_suggestions = "[]"

Expand All @@ -259,8 +259,8 @@ def test_8():
date_column_name='date', date_format='%Y-%m-%d',
summary_operator=SummaryOperators.MEAN)
print(query_result)
expected_result = """ Summary Operator win_by_runs
0 MEAN 7"""
expected_result = """ Summary Operator MEAN of win_by_runs
0 MEAN 7"""

expected_suggestions = "[]"

Expand All @@ -277,10 +277,10 @@ def test_9():
dimensions=['Resident City'] ,
summary_operator=SummaryOperators.MEAN)
print(query_result)
expected_result = """ Resident City Salary(in $)
0 Chicago 1.658889e+05
1 Palo Alto 3.033333e+04
2 Washington 2.002740e+07"""
expected_result = """ Resident City MEAN of Salary(in $)
0 Chicago 1.658889e+05
1 Palo Alto 3.033333e+04
2 Washington 2.002740e+07"""

expected_suggestions = "[{'suggestion': 'Median is very different from the Mean', 'oversight': <Oversights.MEAN_VS_MEDIAN: 7>, 'is_row_level_suggestion': True, 'confidence_score': 3.1249999406334665, 'row_list': [{'row': 3, 'confidence_score': 3.1249999406334665}]}]"

Expand All @@ -298,13 +298,13 @@ def test_10():
dimensions=['subject'] ,
summary_operator=SummaryOperators.PROPORTION_OF_SUM)
print(query_result)
expected_result = """ subject marks
0 Social science 0.399558
1 english 0.000000
2 maths 0.200883
3 science 0.399558"""
expected_result = """ subject PROPORTION_OF_SUM of marks
0 Social science 0.399558
1 english 0.000000
2 maths 0.200883
3 science 0.399558"""

expected_suggestions = "[{'suggestion': 'There exists negative values among the values on which proportion is being applied', 'oversight': <Oversights.ATTRIBUTION_WITH_HIDDEN_NEGATIVES: 11>, 'is_row_level_suggestion': True, 'confidence_score': 1, 'row_list': [{'row': 3, 'confidence_score': 1}, {'row': 4, 'confidence_score': 1}]}]"
expected_suggestions = "[{'suggestion': 'There exists negative values among the values on which proportion is being applied', 'oversight': <Oversights.ATTRIBUTION_WITH_HIDDEN_NEGATIVES: 11>, 'is_row_level_suggestion': True, 'confidence_score': 1, 'row_list': [{'row': 2, 'confidence_score': 1}, {'row': 3, 'confidence_score': 1}]}]"

assert(expected_result == query_result[0].to_string())
assert(expected_suggestions == str(query_result[1]))
Expand Down
71 changes: 35 additions & 36 deletions intents/test_slice_compare.py
Expand Up @@ -38,9 +38,9 @@ def test_1():
)
print(query_result)

expected_result = """ season batsman_team total_runs
0 2008 Chennai Super Kings 868
1 2008 Mumbai Indians 346"""
expected_result = """ season batsman_team SUM of total_runs
0 2008 Chennai Super Kings 868
1 2008 Mumbai Indians 346"""
expected_suggestion = """[]"""

assert(expected_result == query_result[0].to_string())
Expand All @@ -60,10 +60,10 @@ def test_2():
dimensions = ['year'])
print(query_result)

expected_result = """ year Person name salary
0 2019 A 10239
1 2019 B 8190"""
expected_suggestion = "[{'suggestion': 'the relation between slices might changed a lot if you will consider month in grouping.', 'oversight': <Oversights.SIMPSONS_PARADOX: 8>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}]}]"
expected_result = """ year Person name SUM of salary
0 2019 A 10239
1 2019 B 8190"""
expected_suggestion = """[{'suggestion': 'the relation between slices might changed a lot if you will consider month in grouping.', 'oversight': <Oversights.SIMPSONS_PARADOX: 8>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}]}]"""

assert(expected_result == query_result[0].to_string())
assert(expected_suggestion == str(query_result[1]))
Expand All @@ -82,29 +82,29 @@ def test_3():
slices = [('innings', Filters.IN, ['1st', '2nd'])])
print(query_result)

expected_output = """ batsman_team innings total_runs
0 Chennai Super Kings 1st 544
1 Chennai Super Kings 2nd 324
2 Deccan Chargers 1st 40
3 Deccan Chargers 2nd 102
4 Delhi Daredevils 1st 248
5 Delhi Daredevils 2nd 342
6 Gujarat Lions 1st 100
7 Gujarat Lions 2nd 4
8 Kings XI Punjab 1st 448
9 Kings XI Punjab 2nd 522
10 Kolkata Knight Riders 1st 338
11 Kolkata Knight Riders 2nd 708
12 Mumbai Indians 1st 330
13 Mumbai Indians 2nd 16
14 Pune Warriors 1st 12
15 Pune Warriors 2nd 158
16 Rajasthan Royals 1st 368
17 Rajasthan Royals 2nd 608
18 Royal Challengers Bangalore 1st 866
19 Royal Challengers Bangalore 2nd 136
20 Sunrisers Hyderabad 1st 63
21 Sunrisers Hyderabad 2nd 331"""
expected_output = """ batsman_team innings SUM of total_runs
0 Chennai Super Kings 1st 544
1 Chennai Super Kings 2nd 324
2 Deccan Chargers 1st 40
3 Deccan Chargers 2nd 102
4 Delhi Daredevils 1st 248
5 Delhi Daredevils 2nd 342
6 Gujarat Lions 1st 100
7 Gujarat Lions 2nd 4
8 Kings XI Punjab 1st 448
9 Kings XI Punjab 2nd 522
10 Kolkata Knight Riders 1st 338
11 Kolkata Knight Riders 2nd 708
12 Mumbai Indians 1st 330
13 Mumbai Indians 2nd 16
14 Pune Warriors 1st 12
15 Pune Warriors 2nd 158
16 Rajasthan Royals 1st 368
17 Rajasthan Royals 2nd 608
18 Royal Challengers Bangalore 1st 866
19 Royal Challengers Bangalore 2nd 136
20 Sunrisers Hyderabad 1st 63
21 Sunrisers Hyderabad 2nd 331"""
expected_suggestion = """[]"""

assert(expected_output == query_result[0].to_string())
Expand Down Expand Up @@ -140,12 +140,11 @@ def test_5():
dimensions = ['class'])
print(query_result)

expected_output = """ class student_name marks
0 7th A 75
1 7th B 75
2 8th A 75
3 8th B 75"""

expected_output = """ class student_name MEAN of marks
0 7th A 75
1 7th B 75
2 8th A 75
3 8th B 75"""
expected_suggestion = "[{'suggestion': 'Median is very different from the Mean', 'oversight': <Oversights.MEAN_VS_MEDIAN: 7>, 'is_row_level_suggestion': True, 'confidence_score': -3.0792014356780038, 'row_list': [{'row': 1, 'confidence_score': -3.0792014356780038}, {'row': 2, 'confidence_score': 3.0792014356780038}, {'row': 3, 'confidence_score': 3.0792014356780038}, {'row': 4, 'confidence_score': -3.0792014356780038}]}, {'suggestion': 'Some values are similar here but will vary if we add subject for grouping ', 'oversight': <Oversights.TOP_DOWN_ERROR: 9>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}, {'row': 3, 'confidence_score': 100}, {'row': 4, 'confidence_score': 100}]}, {'suggestion': 'the relation between slices might changed a lot if you will consider subject in grouping.', 'oversight': <Oversights.SIMPSONS_PARADOX: 8>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}]}]"

assert(expected_output == query_result[0].to_string())
Expand Down
12 changes: 6 additions & 6 deletions intents/test_time_compare.py
Expand Up @@ -40,9 +40,9 @@ def test_1():
)
print(query_result[0])

expected_result = """ team_name date_of_match total_run
0 MI 01/01/2008 - 31/12/2009 776
1 MI 01/01/2010 - 31/12/2011 420"""
expected_result = """ team_name date_of_match SUM of total_run
0 MI 01/01/2008 - 31/12/2009 776
1 MI 01/01/2010 - 31/12/2011 420"""
expected_suggestions = "[]"

assert(expected_result == query_result[0].to_string())
Expand All @@ -67,9 +67,9 @@ def test_2():
dimensions = ['home_team', 'away_team', 'country'])
print(query_result)

expected_result = """ home_team away_team country date tournament
0 England Wales England 1871-11-30 - 1950-12-30 33
1 England Wales England 1950-12-31 - 2020-01-01 19"""
expected_result = """ home_team away_team country date COUNT of tournament
0 England Wales England 1871-11-30 - 1950-12-30 33
1 England Wales England 1950-12-31 - 2020-01-01 19"""
expected_suggestions = "[]"

assert(expected_result == query_result[0].to_string())
Expand Down
27 changes: 14 additions & 13 deletions intents/test_topk.py
Expand Up @@ -63,12 +63,13 @@ def test_2():
date_format='%Y-%m-%d',
summary_operator=enums.SummaryOperators.MEAN)
print(query_result)
expected_result = """ player_of_match win_by_runs
0 MM Sharma 14
1 KK Nair 7
2 WP Saha 7
3 SS Iyer 0"""
expected_result = """ player_of_match MEAN of win_by_runs
0 MM Sharma 14
1 KK Nair 7
2 WP Saha 7
3 SS Iyer 0"""
expected_suggestions = """[{'suggestion': 'Instead of 5 only 4 rows are present in the results', 'oversight': <Oversights.TOPK_WHEN_LESS_THAN_K_PRESENT: 2>}, {'oversight': <Oversights.REGRESSION_TO_THE_MEAN: 4>, 'suggestion': "very few of the top-k in the given date range will be in the previous window's top-k"}]"""

assert(expected_result == query_result[0].to_string())
assert(expected_suggestions == str(query_result[1]))

Expand All @@ -85,8 +86,8 @@ def test_3():
date_format='%Y-%m-%d',
summary_operator=enums.SummaryOperators.COUNT)
print(query_result)
expected_result = """ Creation Department_ID
0 1789 2"""
expected_result = """ Creation COUNT of Department_ID
0 1789 2"""
expected_suggestions = """[{'oversight': <Oversights.TOPK_VS_OTHERS: 6>, 'change_list': {'topKLimit': 14}, 'suggestion': 'The rows NOT in the top-k have a much larger sum over Department_ID than the rows in top-k', 'confidence_score': 0.15384615384615385}]"""
assert(expected_result == query_result[0].to_string())
assert(expected_suggestions == str(query_result[1]))
Expand Down Expand Up @@ -126,12 +127,12 @@ def test_5():
date_format='%Y-%m-%d',
summary_operator=enums.SummaryOperators.MAX)
print(query_result)
expected_result = """ city lat
0 San Francisco 37.804770
1 Redwood City 37.491269
2 Palo Alto 37.448598
3 Mountain View 37.406940
4 San Jose 37.352601"""
expected_result = """ city MAX of lat
0 San Francisco 37.804770
1 Redwood City 37.491269
2 Palo Alto 37.448598
3 Mountain View 37.406940
4 San Jose 37.352601"""
expected_suggestions = """[]"""
assert(expected_result == query_result[0].to_string())
assert(expected_suggestions == str(query_result[1]))
Expand Down
7 changes: 7 additions & 0 deletions intents/time_compare.py
Expand Up @@ -31,6 +31,10 @@ def time_compare(table, metric, all_dimensions, time_compare_column, date_range1

""" This function returns both the results according to the intent
as well as the debiasing suggestions.
Also, if summary operator is applied, the name of metric column is
renamed to "<summary operator> of metric".
Some of the oversights considered in this intent are-
Args:
table: Type-pandas.dataframe
Expand Down Expand Up @@ -126,6 +130,9 @@ def time_compare(table, metric, all_dimensions, time_compare_column, date_range1

suggestions = rank_oversights.rank_oversights(suggestions, order)

if summary_operator is not None:
result_table = aspects.update_metric_column_name(result_table, summary_operator, metric)

return (result_table, suggestions)

def _time_compare_results(table, metric, time_compare_column,
Expand Down
8 changes: 7 additions & 1 deletion intents/topk.py
Expand Up @@ -32,7 +32,10 @@
def topk(table, metric, dimensions, is_asc, k, **kwargs):
""" This function returns both the results according to the intent
as well as the debiasing suggestions.
Also, if summary operator is applied, the name of metric column is
renamed to "<summary operator> of metric".
Oversights that may be detected in top-k
1. Regression to the mean
2. Looking at tails to find causes
Expand Down Expand Up @@ -148,6 +151,9 @@ def topk(table, metric, dimensions, is_asc, k, **kwargs):
order = oversights_order.ORDER_IN_TOPK
suggestions = rank_oversights.rank_oversights(suggestions, order)

if summary_operator is not None:
result_table = aspects.update_metric_column_name(result_table, summary_operator, metric)

return (result_table, suggestions)

def topk_results(table, metric, dimensions, is_asc, k, **kwargs):
Expand Down
33 changes: 32 additions & 1 deletion intents/util/aspects.py
Expand Up @@ -473,4 +473,35 @@ def granular_time(row_date, granularity):
if granularity == enums.Granularities.ANNUALLY:
row_date = row_date.replace(second=0, minute=0, hour=0, day=1, month=1)

return row_date
return row_date

def update_metric_column_name(table, summary_operator, metric):
    """
    Returns a table in which the metric column is renamed to
    '<summary_operator> of <metric>', for example 'MEAN of win_by_runs'.

    The table passed in is never modified; when a rename happens a new
    table is returned. The renamed column is placed as the last column
    of the returned table (if a column with the new name already exists,
    it is overwritten in its current position instead).

    Args:
        table: Type-Pandas.DataFrame
            The table in which the name of metric is to be updated
        summary_operator: Type-SummaryOperators enum members
            It denotes the summary operator, its `.name` is used as the
            prefix of the new column name
        metric: Type-string
            It is the name of the column on which
            summary operator is applied in case of grouping. Metric could
            be a column containing strings, if we are applying count
            operator on it. If it is None, nothing is renamed.
    Returns:
        The table with name of metric column updated. If metric is None
        the table passed in is returned as it is.
    Raises:
        KeyError: if metric is not a column of the table.
    """
    if metric is None:
        return table

    # New name of metric column
    updated_metric_name = '{} of {}'.format(summary_operator.name, metric)

    # Drop the old column first - drop() returns a new DataFrame, so the
    # assignment below never touches the caller's table.
    renamed_table = table.drop([metric], axis=1)

    # Add the metric values back under the new name
    renamed_table[updated_metric_name] = table[metric]

    return renamed_table

0 comments on commit 67ff4e6

Please sign in to comment.