Merge pull request #62 from bhagyakjain/Update_column_name

Changing name of metric column when summary operator is applied
google · Aug 14, 2020 · 67ff4e6 · 67ff4e6
2 parents 0c66cb8 + 5e20b24
commit 67ff4e6
Show file tree

Hide file tree

Showing 9 changed files with 132 additions and 78 deletions.
diff --git a/intents/show.py b/intents/show.py
@@ -32,7 +32,9 @@ def show(table,**kwargs):
     If the summary_operator is not None , it groups by dimensions.
     If some of the optional args are None (not passed),
     it is assumed that we don't have to apply them.
-
+    
+    Also, if a summary operator is applied, the metric column is
+    renamed to "<summary operator> of <metric>".
     Args:
         table: Type-pandas.dataframe
             It has the contents of the csv file
@@ -111,6 +113,7 @@ def show(table,**kwargs):
         # To groupby 'Summary Operator' column inserted
         dimensions.append('Summary Operator')
 
+
     after_group_by = aspects.group_by(table, dimensions, summary_operator)
 
     table = after_group_by['table']
@@ -123,6 +126,9 @@ def show(table,**kwargs):
     order = oversights_order.ORDER_IN_SHOW
     suggestions = rank_oversights.rank_oversights(suggestions, order)
 
+    if summary_operator is not None:
+        table = aspects.update_metric_column_name(table, summary_operator, metric)
+
     return (table , suggestions)
 
 

diff --git a/intents/slice_compare.py b/intents/slice_compare.py
@@ -33,6 +33,8 @@ def slice_compare(table, metric, all_dimensions, all_metric,
                                summary_operator, **kwargs):
     """ This function returns both the results according to the intent
     as well as the debiasing suggestions.
+    Also, if a summary operator is applied, the metric column is
+    renamed to "<summary operator> of <metric>".
     Some of the oversights considered in this intent are-
     1. simpson's paradox
     Args:
@@ -146,11 +148,13 @@ def slice_compare(table, metric, all_dimensions, all_metric,
                                                slices = slices)
     suggestions += simpsons_paradox_suggestion + top_down_error_suggestion
 
-
     order = oversights_order.ORDER_IN_SLICE_COMPARE
 
     suggestions = rank_oversights.rank_oversights(suggestions, order)
 
+    if summary_operator is not None:
+        result_table = aspects.update_metric_column_name(result_table, summary_operator, metric)
+
     return (result_table, suggestions)
 
 def _slice_compare_results(table, metric, slice_compare_column,

diff --git a/intents/test_show.py b/intents/test_show.py
@@ -71,11 +71,11 @@ def test_2():
                                 date_column_name='date', date_format='%Y-%m-%d',
                                 summary_operator=SummaryOperators.MEAN)
     print(query_result)
-    expected_result = """  player_of_match  win_by_runs
-0         KK Nair            7
-1       MM Sharma           14
-2         SS Iyer            0
-3         WP Saha            7"""
+    expected_result = """  player_of_match  MEAN of win_by_runs
+0         KK Nair                    7
+1       MM Sharma                   14
+2         SS Iyer                    0
+3         WP Saha                    7"""
 
     expected_suggestions = "[]"
 
@@ -237,8 +237,8 @@ def test_7():
                                 date_column_name='date', date_format='%Y-%m-%d',
                                 summary_operator=SummaryOperators.SUM)
     print(query_result)
-    expected_result = """  Summary Operator  win_by_runs
-0              SUM         8702"""
+    expected_result = """  Summary Operator  SUM of win_by_runs
+0              SUM                8702"""
 
     expected_suggestions = "[]"
 
@@ -259,8 +259,8 @@ def test_8():
                                 date_column_name='date', date_format='%Y-%m-%d',
                                 summary_operator=SummaryOperators.MEAN)
     print(query_result)
-    expected_result = """  Summary Operator  win_by_runs
-0             MEAN            7"""
+    expected_result = """  Summary Operator  MEAN of win_by_runs
+0             MEAN                    7"""
 
     expected_suggestions = "[]"
 
@@ -277,10 +277,10 @@ def test_9():
                                 dimensions=['Resident City'] ,
                                 summary_operator=SummaryOperators.MEAN)
     print(query_result)
-    expected_result = """  Resident City  Salary(in $)
-0       Chicago  1.658889e+05
-1     Palo Alto  3.033333e+04
-2    Washington  2.002740e+07"""
+    expected_result = """  Resident City  MEAN of Salary(in $)
+0       Chicago          1.658889e+05
+1     Palo Alto          3.033333e+04
+2    Washington          2.002740e+07"""
 
     expected_suggestions = "[{'suggestion': 'Median is very different from the Mean', 'oversight': <Oversights.MEAN_VS_MEDIAN: 7>, 'is_row_level_suggestion': True, 'confidence_score': 3.1249999406334665, 'row_list': [{'row': 3, 'confidence_score': 3.1249999406334665}]}]"
 
@@ -298,13 +298,13 @@ def test_10():
                                 dimensions=['subject'] ,
                                 summary_operator=SummaryOperators.PROPORTION_OF_SUM)
     print(query_result)
-    expected_result = """          subject     marks
-0  Social science  0.399558
-1         english  0.000000
-2           maths  0.200883
-3         science  0.399558"""
+    expected_result = """          subject  PROPORTION_OF_SUM of marks
+0  Social science                    0.399558
+1         english                    0.000000
+2           maths                    0.200883
+3         science                    0.399558"""
 
-    expected_suggestions = "[{'suggestion': 'There exists negative values among the values on which proportion is being applied', 'oversight': <Oversights.ATTRIBUTION_WITH_HIDDEN_NEGATIVES: 11>, 'is_row_level_suggestion': True, 'confidence_score': 1, 'row_list': [{'row': 3, 'confidence_score': 1}, {'row': 4, 'confidence_score': 1}]}]"
+    expected_suggestions = "[{'suggestion': 'There exists negative values among the values on which proportion is being applied', 'oversight': <Oversights.ATTRIBUTION_WITH_HIDDEN_NEGATIVES: 11>, 'is_row_level_suggestion': True, 'confidence_score': 1, 'row_list': [{'row': 2, 'confidence_score': 1}, {'row': 3, 'confidence_score': 1}]}]"
 
     assert(expected_result == query_result[0].to_string())
     assert(expected_suggestions == str(query_result[1]))

diff --git a/intents/test_slice_compare.py b/intents/test_slice_compare.py
@@ -38,9 +38,9 @@ def test_1():
                                                )
     print(query_result)
 
-    expected_result = """   season         batsman_team  total_runs
-0    2008  Chennai Super Kings         868
-1    2008       Mumbai Indians         346"""
+    expected_result = """   season         batsman_team  SUM of total_runs
+0    2008  Chennai Super Kings                868
+1    2008       Mumbai Indians                346"""
     expected_suggestion = """[]"""
 
     assert(expected_result == query_result[0].to_string())
@@ -60,10 +60,10 @@ def test_2():
                                                dimensions = ['year'])
     print(query_result)
 
-    expected_result = """   year Person name  salary
-0  2019           A   10239
-1  2019           B    8190"""
-    expected_suggestion = "[{'suggestion': 'the relation between slices might changed a lot if you will consider month in grouping.', 'oversight': <Oversights.SIMPSONS_PARADOX: 8>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}]}]"
+    expected_result = """   year Person name  SUM of salary
+0  2019           A          10239
+1  2019           B           8190"""
+    expected_suggestion = """[{'suggestion': 'the relation between slices might changed a lot if you will consider month in grouping.', 'oversight': <Oversights.SIMPSONS_PARADOX: 8>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}]}]"""
 
     assert(expected_result == query_result[0].to_string())
     assert(expected_suggestion == str(query_result[1]))
@@ -82,29 +82,29 @@ def test_3():
                                                slices = [('innings', Filters.IN, ['1st', '2nd'])])
     print(query_result)
 
-    expected_output = """                   batsman_team innings  total_runs
-0           Chennai Super Kings     1st         544
-1           Chennai Super Kings     2nd         324
-2               Deccan Chargers     1st          40
-3               Deccan Chargers     2nd         102
-4              Delhi Daredevils     1st         248
-5              Delhi Daredevils     2nd         342
-6                 Gujarat Lions     1st         100
-7                 Gujarat Lions     2nd           4
-8               Kings XI Punjab     1st         448
-9               Kings XI Punjab     2nd         522
-10        Kolkata Knight Riders     1st         338
-11        Kolkata Knight Riders     2nd         708
-12               Mumbai Indians     1st         330
-13               Mumbai Indians     2nd          16
-14                Pune Warriors     1st          12
-15                Pune Warriors     2nd         158
-16             Rajasthan Royals     1st         368
-17             Rajasthan Royals     2nd         608
-18  Royal Challengers Bangalore     1st         866
-19  Royal Challengers Bangalore     2nd         136
-20          Sunrisers Hyderabad     1st          63
-21          Sunrisers Hyderabad     2nd         331"""
+    expected_output = """                   batsman_team innings  SUM of total_runs
+0           Chennai Super Kings     1st                544
+1           Chennai Super Kings     2nd                324
+2               Deccan Chargers     1st                 40
+3               Deccan Chargers     2nd                102
+4              Delhi Daredevils     1st                248
+5              Delhi Daredevils     2nd                342
+6                 Gujarat Lions     1st                100
+7                 Gujarat Lions     2nd                  4
+8               Kings XI Punjab     1st                448
+9               Kings XI Punjab     2nd                522
+10        Kolkata Knight Riders     1st                338
+11        Kolkata Knight Riders     2nd                708
+12               Mumbai Indians     1st                330
+13               Mumbai Indians     2nd                 16
+14                Pune Warriors     1st                 12
+15                Pune Warriors     2nd                158
+16             Rajasthan Royals     1st                368
+17             Rajasthan Royals     2nd                608
+18  Royal Challengers Bangalore     1st                866
+19  Royal Challengers Bangalore     2nd                136
+20          Sunrisers Hyderabad     1st                 63
+21          Sunrisers Hyderabad     2nd                331"""
     expected_suggestion = """[]"""
 
     assert(expected_output == query_result[0].to_string())
@@ -140,12 +140,11 @@ def test_5():
                                                dimensions = ['class'])
     print(query_result)
 
-    expected_output = """  class student_name  marks
-0   7th            A     75
-1   7th            B     75
-2   8th            A     75
-3   8th            B     75"""
-
+    expected_output = """  class student_name  MEAN of marks
+0   7th            A             75
+1   7th            B             75
+2   8th            A             75
+3   8th            B             75"""
     expected_suggestion = "[{'suggestion': 'Median is very different from the Mean', 'oversight': <Oversights.MEAN_VS_MEDIAN: 7>, 'is_row_level_suggestion': True, 'confidence_score': -3.0792014356780038, 'row_list': [{'row': 1, 'confidence_score': -3.0792014356780038}, {'row': 2, 'confidence_score': 3.0792014356780038}, {'row': 3, 'confidence_score': 3.0792014356780038}, {'row': 4, 'confidence_score': -3.0792014356780038}]}, {'suggestion': 'Some values are similar here but will vary if we add subject for grouping ', 'oversight': <Oversights.TOP_DOWN_ERROR: 9>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}, {'row': 3, 'confidence_score': 100}, {'row': 4, 'confidence_score': 100}]}, {'suggestion': 'the relation between slices might changed a lot if you will consider subject in grouping.', 'oversight': <Oversights.SIMPSONS_PARADOX: 8>, 'is_row_level_suggestion': True, 'row_list': [{'row': 1, 'confidence_score': 100}, {'row': 2, 'confidence_score': 100}]}]"
 
     assert(expected_output == query_result[0].to_string())

diff --git a/intents/test_time_compare.py b/intents/test_time_compare.py
@@ -40,9 +40,9 @@ def test_1():
                                              )
     print(query_result[0])
 
-    expected_result = """  team_name            date_of_match  total_run
-0        MI  01/01/2008 - 31/12/2009        776
-1        MI  01/01/2010 - 31/12/2011        420"""
+    expected_result = """  team_name            date_of_match  SUM of total_run
+0        MI  01/01/2008 - 31/12/2009               776
+1        MI  01/01/2010 - 31/12/2011               420"""
     expected_suggestions = "[]"
 
     assert(expected_result == query_result[0].to_string())
@@ -67,9 +67,9 @@ def test_2():
                                              dimensions = ['home_team', 'away_team', 'country'])
     print(query_result)
 
-    expected_result = """  home_team away_team  country                     date  tournament
-0   England     Wales  England  1871-11-30 - 1950-12-30          33
-1   England     Wales  England  1950-12-31 - 2020-01-01          19"""
+    expected_result = """  home_team away_team  country                     date  COUNT of tournament
+0   England     Wales  England  1871-11-30 - 1950-12-30                   33
+1   England     Wales  England  1950-12-31 - 2020-01-01                   19"""
     expected_suggestions = "[]"
 
     assert(expected_result == query_result[0].to_string())

diff --git a/intents/test_topk.py b/intents/test_topk.py
@@ -63,12 +63,13 @@ def test_2():
                                 date_format='%Y-%m-%d',
     	                        summary_operator=enums.SummaryOperators.MEAN)
     print(query_result)
-    expected_result = """  player_of_match  win_by_runs
-0       MM Sharma           14
-1         KK Nair            7
-2         WP Saha            7
-3         SS Iyer            0"""
+    expected_result = """  player_of_match  MEAN of win_by_runs
+0       MM Sharma                   14
+1         KK Nair                    7
+2         WP Saha                    7
+3         SS Iyer                    0"""
     expected_suggestions = """[{'suggestion': 'Instead of 5 only 4 rows are present in the results', 'oversight': <Oversights.TOPK_WHEN_LESS_THAN_K_PRESENT: 2>}, {'oversight': <Oversights.REGRESSION_TO_THE_MEAN: 4>, 'suggestion': "very few of the top-k in the given date range will be in the previous window's top-k"}]"""
+
     assert(expected_result == query_result[0].to_string())
     assert(expected_suggestions == str(query_result[1]))
 
@@ -85,8 +86,8 @@ def test_3():
                                 date_format='%Y-%m-%d',
     	                        summary_operator=enums.SummaryOperators.COUNT)
     print(query_result)
-    expected_result = """  Creation  Department_ID
-0     1789              2"""
+    expected_result = """  Creation  COUNT of Department_ID
+0     1789                       2"""
     expected_suggestions = """[{'oversight': <Oversights.TOPK_VS_OTHERS: 6>, 'change_list': {'topKLimit': 14}, 'suggestion': 'The rows NOT in the top-k have a much larger sum over Department_ID than the rows in top-k', 'confidence_score': 0.15384615384615385}]"""
     assert(expected_result == query_result[0].to_string())
     assert(expected_suggestions == str(query_result[1]))
@@ -126,12 +127,12 @@ def test_5():
     	                        date_format='%Y-%m-%d',
     	                        summary_operator=enums.SummaryOperators.MAX)
     print(query_result)
-    expected_result = """            city        lat
-0  San Francisco  37.804770
-1   Redwood City  37.491269
-2      Palo Alto  37.448598
-3  Mountain View  37.406940
-4       San Jose  37.352601"""
+    expected_result = """            city  MAX of lat
+0  San Francisco   37.804770
+1   Redwood City   37.491269
+2      Palo Alto   37.448598
+3  Mountain View   37.406940
+4       San Jose   37.352601"""
     expected_suggestions = """[]"""
     assert(expected_result == query_result[0].to_string())
     assert(expected_suggestions == str(query_result[1]))

diff --git a/intents/time_compare.py b/intents/time_compare.py
@@ -31,6 +31,10 @@ def time_compare(table, metric, all_dimensions, time_compare_column, date_range1
 
     """ This function returns both the results according to the intent
     as well as the debiasing suggestions.
+
+    Also, if a summary operator is applied, the metric column is
+    renamed to "<summary operator> of <metric>".
+
     Some of the oversights considered in this intent are-
     Args:
         table: Type-pandas.dataframe
@@ -126,6 +130,9 @@ def time_compare(table, metric, all_dimensions, time_compare_column, date_range1
 
     suggestions = rank_oversights.rank_oversights(suggestions, order)
 
+    if summary_operator is not None:
+        result_table = aspects.update_metric_column_name(result_table, summary_operator, metric)
+
     return (result_table, suggestions)
 
 def _time_compare_results(table, metric, time_compare_column, 

diff --git a/intents/topk.py b/intents/topk.py
@@ -32,7 +32,10 @@
 def topk(table, metric, dimensions, is_asc, k, **kwargs):
     """ This function returns both the results according to the intent
     as well as the debiasing suggestions.
-    
+
+    Also, if a summary operator is applied, the metric column is
+    renamed to "<summary operator> of <metric>".
+
     Oversights that may be detected in top-k
     1. Regression to the mean
     2. Looking at tails to find causes
@@ -148,6 +151,9 @@ def topk(table, metric, dimensions, is_asc, k, **kwargs):
     order = oversights_order.ORDER_IN_TOPK
     suggestions = rank_oversights.rank_oversights(suggestions, order)
 
+    if summary_operator is not None:
+        result_table = aspects.update_metric_column_name(result_table, summary_operator, metric)
+
     return (result_table, suggestions)
 
 def topk_results(table, metric, dimensions, is_asc, k, **kwargs):

diff --git a/intents/util/aspects.py b/intents/util/aspects.py
@@ -473,4 +473,35 @@ def granular_time(row_date, granularity):
     if granularity == enums.Granularities.ANNUALLY:
         row_date = row_date.replace(second=0, minute=0, hour=0, day=1, month=1)
 
-    return row_date
+    return row_date
+
+def update_metric_column_name(table, summary_operator, metric):
+    """
+    The function renames the metric column to
+    '<summary_operator.name> of <metric>' (for example 'MEAN of marks').
+
+    Args:
+        table: Type-pandas.DataFrame
+            The table in which the name of metric is to be updated
+        summary_operator: Type-SummaryOperators enum members
+            It denotes the summary operator
+        metric: Type-string
+            It is the name of the column on which
+            summary operator is applied in case of grouping. Metric could be a column
+            containing strings, if we are applying the count operator on it.
+
+    Returns:
+        The table with name of metric column updated.
+    """
+    if metric is None:
+        return table
+    # New name of metric column
+    updated_metric_name = '{} of {}'.format(summary_operator.name, metric)
+
+    # Create new column 
+    table[updated_metric_name] = table[metric]
+
+    # Drop old column
+    table = table.drop([metric], axis=1)
+
+    return table