Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add better logging in the indicator runner #1892

Merged
merged 10 commits into from
Aug 29, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions _delphi_utils_python/delphi_utils/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
validator = validator_fn(params)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The archiver already logs run timing, and each indicator has its own bit of logging which includes that; this adds logging for the flash & validation steps.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the individual indicators' run time logging looks mostly sufficient, except in quidel_covidtest where it happens at program exit, which will include anything that runs after the core indicator function and thus lead to inaccuracy.

you can refactor this so the runner does the timing and logging, with indicator_fn (aka each indicator's run_module()) returning a dict of metrics it wants logged (like csv_export_count and max_lag_in_days)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I can actually think of one disadvantage of that - it means that the summary line will be gone from the logs if the indicator is run individually (e.g. env/bin/python -m delphi_quidel_covidtest as the README suggests).
Are we OK with this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a lot of the documentation in this repo is quite old and should be brought up to date.

is it ever desirable to run an indicator without validation and archiving? perhaps we can answer that in #1895. you should at least fix the timing for the quidel indicator in the meanwhile.

archiver = archiver_fn(params)

start_time = time.time()
melange396 marked this conversation as resolved.
Show resolved Hide resolved
t1 = multiprocessing.Process(target=flash_fn, args=[params])
t1.start()
start = time.time()
Expand All @@ -77,6 +78,10 @@ def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
else:
t1.terminate()
t1.join()
elapsed_time_in_seconds = round(time.time() - start_time, 2)
logger.info("Completed flash step",
melange396 marked this conversation as resolved.
Show resolved Hide resolved
elapsed_time_in_seconds = elapsed_time_in_seconds)

if validator:
validation_report = validator.validate()
validation_report.log(logger)
Expand Down
17 changes: 12 additions & 5 deletions _delphi_utils_python/delphi_utils/validator/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,8 @@ def check_min_allowed_max_date(self, max_date, geo_type, signal_type, report):
ValidationFailure("check_min_max_date",
geo_type=geo_type,
signal=signal_type,
melange396 marked this conversation as resolved.
Show resolved Hide resolved
message="date of most recent generated file seems too long ago"))
message="date of most recent generated file seems too long ago "
f"({max_date} < {self.params.generation_date} - {min_thres})"))

report.increment_total_checks()

Expand All @@ -263,7 +264,8 @@ def check_max_allowed_max_date(self, max_date, geo_type, signal_type, report):
ValidationFailure("check_max_max_date",
geo_type=geo_type,
signal=signal_type,
melange396 marked this conversation as resolved.
Show resolved Hide resolved
message="date of most recent generated file seems too recent"))
message="date of most recent generated file seems too recent "
f"({max_date} > {self.params.generation_date} - {max_thres})"))

report.increment_total_checks()

Expand Down Expand Up @@ -307,7 +309,9 @@ def create_dfs(self, geo_sig_df, api_df_or_error, checking_date, geo_type, signa
signal_type,
"test data for a given checking date-geo type-signal type"
" combination is missing. Source data may be missing"
" for one or more dates"))
" for one or more dates "
f"({checking_date} < {self.params.generation_date} "
f"- {min_thres})"))
return False

# Reference dataframe runs backwards from the recent_cutoff_date
Expand Down Expand Up @@ -418,7 +422,9 @@ def check_max_date_vs_reference(self, df_to_test, df_to_reference, checking_date
checking_date,
geo_type,
signal_type,
"reference df has days beyond the max date in the =df_to_test="))
"reference df has days beyond the max date in the =df_to_test= "
f"{df_to_test['time_value'].max()} < "
f"{df_to_reference['time_value'].max().date()}"))

report.increment_total_checks()

Expand Down Expand Up @@ -459,7 +465,8 @@ def check_rapid_change_num_rows(self, df_to_test, df_to_reference, checking_date
geo_type,
signal_type,
"Number of rows per day seems to have changed rapidly (reference "
"vs test data)"))
"vs test data); "
f"relative difference: {abs(compare_rows)} > 0.35"))
report.increment_total_checks()

def check_positive_negative_spikes(self, source_df, api_frames, geo, sig, report):
Expand Down
13 changes: 11 additions & 2 deletions _delphi_utils_python/delphi_utils/validator/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def __init__(self, errors_to_suppress: List[ValidationFailure],
Warnings raised from validation execution
unsuppressed_errors: List[Exception]
Errors raised from validation failures not found in `self.errors_to_suppress`
elapsed_time_in_seconds: float
Elapsed time of validation run in seconds, rounded to two decimal places
"""
self.errors_to_suppress = errors_to_suppress
self.data_source = data_source
Expand All @@ -44,6 +46,7 @@ def __init__(self, errors_to_suppress: List[ValidationFailure],
self.raised_warnings = []
self.unsuppressed_errors = []
self.dry_run = dry_run
self.elapsed_time_in_seconds = 0
melange396 marked this conversation as resolved.
Show resolved Hide resolved
# pylint: enable=R0902

def add_raised_error(self, error):
Expand All @@ -68,6 +71,10 @@ def increment_total_checks(self):
"""Record a check."""
self.total_checks += 1

def set_elapsed_time_in_seconds(self, time):
    """Set elapsed runtime in seconds for later logging.

    Parameters
    ----------
    time: float
        Elapsed wall-clock time of the validation run, in seconds.
        Stored as-is on the report and emitted alongside the summary
        line when `log()` is called.

    NOTE(review): the parameter name `time` shadows the stdlib `time`
    module name -- harmless in this one-line body, but worth confirming
    against the project's lint configuration.
    """
    self.elapsed_time_in_seconds = time

def add_raised_warning(self, warning):
"""Add a warning to the report.

Expand All @@ -94,15 +101,17 @@ def log(self, logger=None):
checks_failed = len(self.unsuppressed_errors),
checks_suppressed = self.num_suppressed,
warnings = len(self.raised_warnings),
phase = "validation")
phase = "validation",
elapsed_time_in_seconds=self.elapsed_time_in_seconds)
else:
logger.info("Validation run unsuccessful",
data_source = self.data_source,
checks_run = self.total_checks,
checks_failed = len(self.unsuppressed_errors),
checks_suppressed = self.num_suppressed,
warnings = len(self.raised_warnings),
phase="validation")
phase="validation",
elapsed_time_in_seconds=self.elapsed_time_in_seconds)
# Threshold for slack alerts if warnings are excessive,
# Currently extremely strict, set by observation of 1 month's logs
excessive_warnings = self.total_checks > 0 and \
Expand Down
18 changes: 13 additions & 5 deletions _delphi_utils_python/delphi_utils/validator/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def find_all_unexpected_geo_ids(df_to_test, geo_regex, geo_type):
ValidationFailure(
"check_geo_id_type",
filename=nameformat,
message="geo_ids saved as floats; strings preferred"))
message=f"{len(leftover)} geo_ids saved as floats; strings preferred"))
melange396 marked this conversation as resolved.
Show resolved Hide resolved

if geo_type in fill_len.keys():
# Left-pad with zeroes up to expected length. Fixes missing leading zeroes
Expand Down Expand Up @@ -281,29 +281,35 @@ def check_bad_val(self, df_to_test, nameformat, signal_type, report):

if percent_option:
if not df_to_test[(df_to_test['val'] > 100)].empty:
bad_values = df_to_test[(df_to_test['val'] > 100)]['val'].unique()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice touch with the .unique() filter! should we do that for all the messages that print lists of values?

report.add_raised_error(
ValidationFailure(
"check_val_pct_gt_100",
filename=nameformat,
message="val column can't have any cell greater than 100 for percents"))
message="val column can't have any cell greater than 100 for percents; "
f"invalid values: {bad_values}"))

report.increment_total_checks()

if proportion_option:
if not df_to_test[(df_to_test['val'] > 100000)].empty:
bad_values = df_to_test[(df_to_test['val'] > 100000)]['val'].unique()
report.add_raised_error(
ValidationFailure("check_val_prop_gt_100k",
filename=nameformat,
message="val column can't have any cell greater than 100000 "
"for proportions"))
"for proportions; "
f"invalid values: {bad_values}"))

report.increment_total_checks()

if not df_to_test[(df_to_test['val'] < 0)].empty:
bad_values = df_to_test[(df_to_test['val'] < 0)]['val'].unique()
report.add_raised_error(
ValidationFailure("check_val_lt_0",
filename=nameformat,
message="val column can't have any cell smaller than 0"))
message="val column can't have any cell smaller than 0; "
f"invalid values: {bad_values}"))

report.increment_total_checks()

Expand Down Expand Up @@ -346,10 +352,12 @@ def check_bad_se(self, df_to_test, nameformat, report):
report.increment_total_checks()

if df_to_test["se"].isnull().mean() > 0.5:
bad_mean = round(df_to_test["se"].isnull().mean() * 100, 2)
report.add_raised_error(
ValidationFailure("check_se_many_missing",
filename=nameformat,
message='Recent se values are >50% NA'))
message='Many recent se values are missing: '
f'{bad_mean} > 50%'))
melange396 marked this conversation as resolved.
Show resolved Hide resolved

report.increment_total_checks()

Expand Down
3 changes: 3 additions & 0 deletions _delphi_utils_python/delphi_utils/validator/validate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
"""Tools to validate CSV source data, including various check methods."""
import time
from .datafetcher import load_all_files
from .dynamic import DynamicValidator
from .errors import ValidationFailure
Expand Down Expand Up @@ -54,11 +55,13 @@ def validate(self):
Returns:
- ValidationReport collating the validation outcomes
"""
start_time = time.time()
report = ValidationReport(self.suppressed_errors, self.data_source, self.dry_run)
frames_list = load_all_files(self.export_dir, self.time_window.start_date,
self.time_window.end_date)
self.static_validation.validate(frames_list, report)
# Dynamic Validation only performed when frames_list is populated
if len(frames_list) > 0:
self.dynamic_validation.validate(aggregate_frames(frames_list), report)
report.set_elapsed_time_in_seconds(round(time.time() - start_time, 2))
return report
Loading