Skip to content

Commit

Permalink
fix: copy raw reports for parallel experiment (#414)
Browse files — browse the repository at this point in the history
* copy raw reports for parallel

* fix tests
  • Branch information (captured while still loading)
daniel-codecov committed Apr 25, 2024
1 parent bcec347 commit c71ddfd
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 29 deletions.
3 changes: 2 additions & 1 deletion helpers/parallel_upload_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ def save_final_serial_report_results(
report_code,
f"serial/chunks<latest_upload_pk:{latest_upload_pk}>",
)
archive_service.write_parallel_experiment_file(
report_url = archive_service.write_parallel_experiment_file(
commitid,
files_and_sessions,
report_code,
f"serial/files_and_sessions<latest_upload_pk:{latest_upload_pk}>",
)
return report_url
37 changes: 29 additions & 8 deletions services/report/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,7 @@ async def create_new_report_for_commit(self, commit: Commit) -> Report:

@sentry_sdk.trace
def parse_raw_report_from_storage(
self, repo: Repository, upload: Upload, is_parallel=False
self, repo: Repository, upload: Upload, is_parallel=False, is_error_case=False
) -> ParsedRawReport:
"""Pulls the raw uploaded report from storage and parses it do it's
easier to access different parts of the raw upload.
Expand All @@ -841,14 +841,35 @@ def parse_raw_report_from_storage(
archive_service = self.get_archive_service(repo)
archive_url = upload.storage_path

# for the parallel experiment: since the parallel version runs after the old behaviour
# has finished, the current uploads have already been rewritten in a human readable
# format, so we need to use the legacy parser here for the parallel version.
parser = get_proper_parser(upload, use_legacy=is_parallel)
# For the parallel upload verification experiment, we need to make a copy of the raw uploaded reports
# so that the parallel pipeline can use those to parse. The serial pipeline rewrites the raw uploaded
# reports to a human readable version that doesn't include file fixes, so that's why copying is necessary.
if PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
repo_id=repo.repoid, default=False
) and (not is_error_case):
parallel_url = archive_url.removesuffix(".txt") + "_PARALLEL.txt"
if not is_parallel:
archive_file = archive_service.read_file(archive_url)
archive_service.write_file(parallel_url, archive_file)
log.info(
"Copying raw report file for parallel experiment to: "
+ str(parallel_url),
extra=dict(commit=upload.report.commit_id, repoid=repo.repoid),
)
else:
archive_url = parallel_url
archive_file = archive_service.read_file(archive_url)
log.info(
"Reading raw report file for parallel experiment from: "
+ str(archive_url),
extra=dict(commit=upload.report.commit_id, repoid=repo.repoid),
)
else:
archive_file = archive_service.read_file(archive_url)

raw_uploaded_report = parser.parse_raw_report_from_bytes(
archive_service.read_file(archive_url)
)
parser = get_proper_parser(upload)

raw_uploaded_report = parser.parse_raw_report_from_bytes(archive_file)
return raw_uploaded_report

@sentry_sdk.trace
Expand Down
4 changes: 1 addition & 3 deletions services/report/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
from services.report.parser.version_one import VersionOneReportParser


def get_proper_parser(upload: Upload):
    """Pick the parser matching the raw report format of *upload*.

    An upload whose ``upload_extras`` carries ``format_version == "v1"``
    is parsed with the v1 parser; any other upload (including one with
    no ``upload_extras`` at all) falls back to the legacy parser.
    """
    extras = upload.upload_extras
    is_v1 = bool(extras) and extras.get("format_version") == "v1"
    return VersionOneReportParser() if is_v1 else LegacyReportParser()
2 changes: 1 addition & 1 deletion tasks/tests/unit/test_upload_processing_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ def test_upload_task_call_exception_within_individual_upload(
assert upload.state_id == UploadState.ERROR.db_id
assert upload.state == "error"
assert not mocked_3.called
mocked_4.assert_called_with(commit.repository, upload)
mocked_4.assert_called_with(commit.repository, upload, is_error_case=True)
mocked_5.assert_called()

@pytest.mark.django_db(databases={"default"})
Expand Down
31 changes: 15 additions & 16 deletions tasks/upload_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,27 +354,26 @@ def process_impl_within_lock(
# experiment. The report being saved is not necessarily the final
# report for the commit, as more uploads can still be made.
if is_final and (not in_parallel):
save_final_serial_report_results(
final_serial_report_url = save_final_serial_report_results(
report_service, commit, report, report_code, arguments_list
)
log.info(
"Saved final serial report results to storage",
extra=dict(
repoid=repoid,
commit=commitid,
final_serial_result_path=final_serial_report_url,
),
)

for processed_individual_report in processings_so_far:
# We delete and rewrite the artifacts when the serial flow runs first. When
# the parallel flow runs second, it parses the human readable artifacts instead
# since the serial flow already rewrote it
if not (
PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
repo_id=repository.repoid
)
and in_parallel
):
deleted_archive = self._possibly_delete_archive(
deleted_archive = self._possibly_delete_archive(
processed_individual_report, report_service, commit
)
if not deleted_archive:
self._rewrite_raw_report_readable(
processed_individual_report, report_service, commit
)
if not deleted_archive:
self._rewrite_raw_report_readable(
processed_individual_report, report_service, commit
)
processed_individual_report.pop("upload_obj", None)
processed_individual_report.pop("raw_report", None)
log.info(
Expand Down Expand Up @@ -503,7 +502,7 @@ def _attempt_rewrite_raw_report_readable_error_case(
)
try:
raw_report = report_service.parse_raw_report_from_storage(
commit.repository, upload
commit.repository, upload, is_error_case=True
)
self._rewrite_raw_report_readable(
processing_result={"raw_report": raw_report, "upload_object": upload},
Expand Down

0 comments on commit c71ddfd

Please sign in to comment.