crocs-muni · J08nY · Feb 14, 2024 · Feb 7, 2024 · Feb 13, 2024 · Feb 13, 2024
diff --git a/src/sec_certs/dataset/cc.py b/src/sec_certs/dataset/cc.py
@@ -143,6 +143,27 @@
         """
         return self.targets_dir / "txt"
 
+    @property
+    def certificates_dir(self) -> Path:
+        """
+        Returns the path ``<certs_dir>/certificates`` that holds files associated with the certificates (not created here)
+        """
+        return self.certs_dir / "certificates"
+
+    @property
+    def certificates_pdf_dir(self) -> Path:
+        """
+        Returns the path ``<certificates_dir>/pdf`` that holds PDFs associated with certificates (not created here)
+        """
+        return self.certificates_dir / "pdf"
+
+    @property
+    def certificates_txt_dir(self) -> Path:
+        """
+        Returns the path ``<certificates_dir>/txt`` that holds TXTs associated with certificates (not created here)
+        """
+        return self.certificates_dir / "txt"
+
     @property
     def pp_dataset_path(self) -> Path:
         """
@@ -242,7 +263,14 @@
             self.auxiliary_datasets.mu_dset.root_dir = self.mu_dataset_dir
 
         for cert in self:
-            cert.set_local_paths(self.reports_pdf_dir, self.targets_pdf_dir, self.reports_txt_dir, self.targets_txt_dir)
+            cert.set_local_paths(
+                self.reports_pdf_dir,
+                self.targets_pdf_dir,
+                self.certificates_pdf_dir,
+                self.reports_txt_dir,
+                self.targets_txt_dir,
+                self.certificates_txt_dir,
+            )
         # TODO: This forgets to set local paths for other auxiliary datasets
 
     def _merge_certs(self, certs: dict[str, CCCertificate], cert_source: str | None = None) -> None:
@@ -531,11 +559,12 @@
     def _download_all_artifacts_body(self, fresh: bool = True) -> None:
         self._download_reports(fresh)
         self._download_targets(fresh)
+        self._download_certs(fresh)
 
     @staged(logger, "Downloading PDFs of CC certification reports.")
     def _download_reports(self, fresh: bool = True) -> None:
         self.reports_pdf_dir.mkdir(parents=True, exist_ok=True)
-        certs_to_process = [x for x in self if x.state.report_is_ok_to_download(fresh) and x.report_link]
+        certs_to_process = [x for x in self if x.state.report.is_ok_to_download(fresh) and x.report_link]
 
         if not fresh and certs_to_process:
             logger.info(
@@ -551,7 +580,7 @@
     @staged(logger, "Downloading PDFs of CC security targets.")
     def _download_targets(self, fresh: bool = True) -> None:
         self.targets_pdf_dir.mkdir(parents=True, exist_ok=True)
-        certs_to_process = [x for x in self if x.state.report_is_ok_to_download(fresh)]
+        certs_to_process = [x for x in self if x.state.st.is_ok_to_download(fresh)]
 
         if not fresh and certs_to_process:
             logger.info(
@@ -564,10 +593,26 @@
             progress_bar_desc="Downloading PDFs of CC security targets",
         )
 
+    @staged(logger, "Downloading PDFs of CC certificates.")
+    def _download_certs(self, fresh: bool = True) -> None:
+        self.certificates_pdf_dir.mkdir(parents=True, exist_ok=True)  # no error if it already exists
+        certs_to_process = [x for x in self if x.state.cert.is_ok_to_download(fresh)]
+        # NOTE(review): _download_reports also filters on x.report_link; no link check here - confirm intended.
+        if not fresh and certs_to_process:
+            logger.info(
+                f"Downloading {len(certs_to_process)} PDFs of CC certificates for which previous download failed.."
+            )
+
+        cert_processing.process_parallel(
+            CCCertificate.download_pdf_cert,
+            certs_to_process,
+            progress_bar_desc="Downloading PDFs of CC certificates",
+        )
+
     @staged(logger, "Converting PDFs of certification reports to txt.")
     def _convert_reports_to_txt(self, fresh: bool = True) -> None:
         self.reports_txt_dir.mkdir(parents=True, exist_ok=True)
-        certs_to_process = [x for x in self if x.state.report_is_ok_to_convert(fresh)]
+        certs_to_process = [x for x in self if x.state.report.is_ok_to_convert(fresh)]
 
         if not fresh and certs_to_process:
             logger.info(
@@ -583,7 +628,7 @@
     @staged(logger, "Converting PDFs of security targets to txt.")
     def _convert_targets_to_txt(self, fresh: bool = True) -> None:
         self.targets_txt_dir.mkdir(parents=True, exist_ok=True)
-        certs_to_process = [x for x in self if x.state.st_is_ok_to_convert(fresh)]
+        certs_to_process = [x for x in self if x.state.st.is_ok_to_convert(fresh)]
 
         if fresh:
             logger.info("Converting PDFs of security targets to txt.")
@@ -598,13 +643,32 @@
             progress_bar_desc="Converting PDFs of security targets to txt",
         )
 
+    @staged(logger, "Converting PDFs of certificates to txt.")
+    def _convert_certs_to_txt(self, fresh: bool = True) -> None:
+        self.certificates_txt_dir.mkdir(parents=True, exist_ok=True)  # no error if it already exists
+        certs_to_process = [x for x in self if x.state.cert.is_ok_to_convert(fresh)]
+        # Fresh run: log a plain banner. Non-fresh run: log how many failed conversions are retried, if any.
+        if fresh:
+            logger.info("Converting PDFs of certificates to txt.")  # NOTE(review): same text as the @staged label
+        if not fresh and certs_to_process:
+            logger.info(
+                f"Converting {len(certs_to_process)} PDFs of certificates to txt for which previous conversion failed."
+            )
+
+        cert_processing.process_parallel(
+            CCCertificate.convert_cert_pdf,
+            certs_to_process,
+            progress_bar_desc="Converting PDFs of certificates to txt",
+        )
+
     def _convert_all_pdfs_body(self, fresh: bool = True) -> None:
         self._convert_reports_to_txt(fresh)
         self._convert_targets_to_txt(fresh)
+        self._convert_certs_to_txt(fresh)
 
     @staged(logger, "Extracting report metadata")
     def _extract_report_metadata(self) -> None:
-        certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
+        certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
         processed_certs = cert_processing.process_parallel(
             CCCertificate.extract_report_pdf_metadata,
             certs_to_process,
@@ -615,7 +679,7 @@
 
     @staged(logger, "Extracting target metadata")
     def _extract_target_metadata(self) -> None:
-        certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
+        certs_to_process = [x for x in self if x.state.st.is_ok_to_analyze()]
         processed_certs = cert_processing.process_parallel(
             CCCertificate.extract_st_pdf_metadata,
             certs_to_process,
@@ -624,13 +688,25 @@
         )
         self.update_with_certs(processed_certs)
 
+    @staged(logger, "Extracting cert metadata")
+    def _extract_cert_metadata(self) -> None:
+        certs_to_process = [x for x in self if x.state.cert.is_ok_to_analyze()]
+        processed_certs = cert_processing.process_parallel(
+            CCCertificate.extract_cert_pdf_metadata,
+            certs_to_process,
+            use_threading=False,
+            progress_bar_desc="Extracting cert metadata",
+        )
+        self.update_with_certs(processed_certs)
+
     def _extract_pdf_metadata(self) -> None:
         self._extract_report_metadata()
         self._extract_target_metadata()
+        self._extract_cert_metadata()
 
     @staged(logger, "Extracting report frontpages")
     def _extract_report_frontpage(self) -> None:
-        certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
+        certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
         processed_certs = cert_processing.process_parallel(
             CCCertificate.extract_report_pdf_frontpage,
             certs_to_process,
@@ -639,24 +715,13 @@
         )
         self.update_with_certs(processed_certs)
 
-    @staged(logger, "Extracting target frontpages")
-    def _extract_target_frontpage(self) -> None:
-        certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
-        processed_certs = cert_processing.process_parallel(
-            CCCertificate.extract_st_pdf_frontpage,
-            certs_to_process,
-            use_threading=False,
-            progress_bar_desc="Extracting target frontpages",
-        )
-        self.update_with_certs(processed_certs)
-
     def _extract_pdf_frontpage(self) -> None:
         self._extract_report_frontpage()
-        self._extract_target_frontpage()
+        # We have no frontpage extraction for targets or certificates themselves, only for the reports.
 
     @staged(logger, "Extracting report keywords")
     def _extract_report_keywords(self) -> None:
-        certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
+        certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
         processed_certs = cert_processing.process_parallel(
             CCCertificate.extract_report_pdf_keywords,
             certs_to_process,
@@ -667,7 +732,7 @@
 
     @staged(logger, "Extracting target keywords")
     def _extract_target_keywords(self) -> None:
-        certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
+        certs_to_process = [x for x in self if x.state.st.is_ok_to_analyze()]
         processed_certs = cert_processing.process_parallel(
             CCCertificate.extract_st_pdf_keywords,
             certs_to_process,
@@ -676,9 +741,21 @@
         )
         self.update_with_certs(processed_certs)
 
+    @staged(logger, "Extracting cert keywords")
+    def _extract_cert_keywords(self) -> None:
+        certs_to_process = [x for x in self if x.state.cert.is_ok_to_analyze()]
+        processed_certs = cert_processing.process_parallel(
+            CCCertificate.extract_cert_pdf_keywords,
+            certs_to_process,
+            use_threading=False,
+            progress_bar_desc="Extracting cert keywords",
+        )
+        self.update_with_certs(processed_certs)
+
     def _extract_pdf_keywords(self) -> None:
         self._extract_report_keywords()
         self._extract_target_keywords()
+        self._extract_cert_keywords()
 
     def extract_data(self) -> None:
         logger.info("Extracting various data from certification artifacts")
@@ -688,7 +765,7 @@
 
     @staged(logger, "Computing heuristics: Deriving information about laboratories involved in certification.")
     def _compute_cert_labs(self) -> None:
-        certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
+        certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
         for cert in certs_to_process:
             cert.compute_heuristics_cert_lab()
 

diff --git a/src/sec_certs/model/cc_matching.py b/src/sec_certs/model/cc_matching.py
@@ -75,10 +75,10 @@ def match(self, cert: CCCertificate) -> float:
         if self._product == cert.name and self._vendor == cert.manufacturer:
             return 99
         # If we match the report hash, return early.
-        if cert.state.report_pdf_hash == self._report_hash and self._report_hash is not None:
+        if cert.state.report.pdf_hash == self._report_hash and self._report_hash is not None:
             return 95
         # If we match the target hash, return early.
-        if cert.state.st_pdf_hash == self._target_hash and self._target_hash is not None:
+        if cert.state.st.pdf_hash == self._target_hash and self._target_hash is not None:
             return 93
 
         # Fuzzy match at the end with some penalization.

diff --git a/src/sec_certs/model/references_nlp/segment_extractor.py b/src/sec_certs/model/references_nlp/segment_extractor.py
@@ -173,9 +173,9 @@
         - Loads manually annotated samples
         - Combines all of that into single dataframe
         """
-        target_certs = [x for x in certs if x.heuristics.st_references.directly_referencing and x.state.st_txt_path]
+        target_certs = [x for x in certs if x.heuristics.st_references.directly_referencing and x.state.st.txt_path]
         report_certs = [
-            x for x in certs if x.heuristics.report_references.directly_referencing and x.state.report_txt_path
+            x for x in certs if x.heuristics.report_references.directly_referencing and x.state.report.txt_path
         ]
         df_targets = self._build_df(target_certs, "target")
         df_reports = self._build_df(report_certs, "report")
@@ -217,8 +217,8 @@
                 for key, val in actual_references.items()
             ]
 
-        (certs[0].state.report_txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)
-        (certs[0].state.st_txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)
+        (certs[0].state.report.txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)
+        (certs[0].state.st.txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)
         return list(itertools.chain.from_iterable(get_cert_records(cert, source) for cert in certs))
 
     def _build_df(self, certs: list[CCCertificate], source: Literal["target", "report"]) -> pd.DataFrame: