Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add extraction of certificate data. #389

Merged
merged 6 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 100 additions & 23 deletions src/sec_certs/dataset/cc.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,27 @@
"""
return self.targets_dir / "txt"

@property
def certificates_dir(self) -> Path:
    """
    Directory that holds the files associated with the certificates,
    i.e. the ``certificates`` entry directly under ``certs_dir``.
    """
    parent_dir = self.certs_dir
    return parent_dir / "certificates"

@property
def certificates_pdf_dir(self) -> Path:
    """
    Directory that holds the PDFs associated with certificates,
    i.e. the ``pdf`` entry directly under ``certificates_dir``.
    """
    parent_dir = self.certificates_dir
    return parent_dir / "pdf"

@property
def certificates_txt_dir(self) -> Path:
    """
    Directory that holds the TXTs associated with certificates,
    i.e. the ``txt`` entry directly under ``certificates_dir``.
    """
    parent_dir = self.certificates_dir
    return parent_dir / "txt"

@property
def pp_dataset_path(self) -> Path:
"""
Expand Down Expand Up @@ -242,7 +263,14 @@
self.auxiliary_datasets.mu_dset.root_dir = self.mu_dataset_dir

for cert in self:
cert.set_local_paths(self.reports_pdf_dir, self.targets_pdf_dir, self.reports_txt_dir, self.targets_txt_dir)
cert.set_local_paths(
self.reports_pdf_dir,
self.targets_pdf_dir,
self.certificates_pdf_dir,
self.reports_txt_dir,
self.targets_txt_dir,
self.certificates_txt_dir,
)
# TODO: This forgets to set local paths for other auxiliary datasets

def _merge_certs(self, certs: dict[str, CCCertificate], cert_source: str | None = None) -> None:
Expand Down Expand Up @@ -531,11 +559,12 @@
def _download_all_artifacts_body(self, fresh: bool = True) -> None:
self._download_reports(fresh)
self._download_targets(fresh)
self._download_certs(fresh)

@staged(logger, "Downloading PDFs of CC certification reports.")
def _download_reports(self, fresh: bool = True) -> None:
self.reports_pdf_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.report_is_ok_to_download(fresh) and x.report_link]
certs_to_process = [x for x in self if x.state.report.is_ok_to_download(fresh) and x.report_link]

if not fresh and certs_to_process:
logger.info(
Expand All @@ -551,7 +580,7 @@
@staged(logger, "Downloading PDFs of CC security targets.")
def _download_targets(self, fresh: bool = True) -> None:
self.targets_pdf_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.report_is_ok_to_download(fresh)]
certs_to_process = [x for x in self if x.state.st.is_ok_to_download(fresh)]

if not fresh and certs_to_process:
logger.info(
Expand All @@ -564,10 +593,26 @@
progress_bar_desc="Downloading PDFs of CC security targets",
)

@staged(logger, "Downloading PDFs of CC certificates.")
def _download_certs(self, fresh: bool = True) -> None:
    """
    Download the certificate PDFs for every certificate in the dataset
    for which ``state.cert.is_ok_to_download(fresh)`` holds.

    :param fresh: passed to ``is_ok_to_download``; when False and something was selected,
        the number of selected certificates is logged as a retry of failed downloads.
        Defaults to True.
    """
    self.certificates_pdf_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): _download_reports additionally requires a truthy `report_link`;
    # no link check is applied here -- confirm that is intended.
    certs_to_process = [cert for cert in self if cert.state.cert.is_ok_to_download(fresh)]

    is_retry = not fresh
    if is_retry and certs_to_process:
        logger.info(
            f"Downloading {len(certs_to_process)} PDFs of CC certificates for which previous download failed.."
        )

    cert_processing.process_parallel(
        CCCertificate.download_pdf_cert,
        certs_to_process,
        progress_bar_desc="Downloading PDFs of CC certificates",
    )

@staged(logger, "Converting PDFs of certification reports to txt.")
def _convert_reports_to_txt(self, fresh: bool = True) -> None:
self.reports_txt_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.report_is_ok_to_convert(fresh)]
certs_to_process = [x for x in self if x.state.report.is_ok_to_convert(fresh)]

if not fresh and certs_to_process:
logger.info(
Expand All @@ -583,7 +628,7 @@
@staged(logger, "Converting PDFs of security targets to txt.")
def _convert_targets_to_txt(self, fresh: bool = True) -> None:
self.targets_txt_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.st_is_ok_to_convert(fresh)]
certs_to_process = [x for x in self if x.state.st.is_ok_to_convert(fresh)]

if fresh:
logger.info("Converting PDFs of security targets to txt.")
Expand All @@ -598,13 +643,32 @@
progress_bar_desc="Converting PDFs of security targets to txt",
)

@staged(logger, "Converting PDFs of certificates to txt.")
def _convert_certs_to_txt(self, fresh: bool = True) -> None:
self.certificates_txt_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.cert.is_ok_to_convert(fresh)]

if fresh:
logger.info("Converting PDFs of certificates to txt.")
if not fresh and certs_to_process:
logger.info(

Check warning on line 654 in src/sec_certs/dataset/cc.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/dataset/cc.py#L654

Added line #L654 was not covered by tests
f"Converting {len(certs_to_process)} PDFs of certificates to txt for which previous conversion failed."
)

cert_processing.process_parallel(
CCCertificate.convert_cert_pdf,
certs_to_process,
progress_bar_desc="Converting PDFs of certificates to txt",
)

def _convert_all_pdfs_body(self, fresh: bool = True) -> None:
self._convert_reports_to_txt(fresh)
self._convert_targets_to_txt(fresh)
self._convert_certs_to_txt(fresh)

@staged(logger, "Extracting report metadata")
def _extract_report_metadata(self) -> None:
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_report_pdf_metadata,
certs_to_process,
Expand All @@ -615,7 +679,7 @@

@staged(logger, "Extracting target metadata")
def _extract_target_metadata(self) -> None:
certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
certs_to_process = [x for x in self if x.state.st.is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_st_pdf_metadata,
certs_to_process,
Expand All @@ -624,13 +688,25 @@
)
self.update_with_certs(processed_certs)

@staged(logger, "Extracting cert metadata")
def _extract_cert_metadata(self) -> None:
    """
    Apply ``CCCertificate.extract_cert_pdf_metadata`` to every certificate for which
    ``state.cert.is_ok_to_analyze()`` holds and hand the results to ``update_with_certs``.
    """
    analyzable = [cert for cert in self if cert.state.cert.is_ok_to_analyze()]
    self.update_with_certs(
        cert_processing.process_parallel(
            CCCertificate.extract_cert_pdf_metadata,
            analyzable,
            use_threading=False,
            progress_bar_desc="Extracting cert metadata",
        )
    )

def _extract_pdf_metadata(self) -> None:
self._extract_report_metadata()
self._extract_target_metadata()
self._extract_cert_metadata()

@staged(logger, "Extracting report frontpages")
def _extract_report_frontpage(self) -> None:
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_report_pdf_frontpage,
certs_to_process,
Expand All @@ -639,24 +715,13 @@
)
self.update_with_certs(processed_certs)

@staged(logger, "Extracting target frontpages")
def _extract_target_frontpage(self) -> None:
certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_st_pdf_frontpage,
certs_to_process,
use_threading=False,
progress_bar_desc="Extracting target frontpages",
)
self.update_with_certs(processed_certs)

def _extract_pdf_frontpage(self) -> None:
self._extract_report_frontpage()
self._extract_target_frontpage()
# We have no frontpage extraction for targets or certificates themselves, only for the reports.

@staged(logger, "Extracting report keywords")
def _extract_report_keywords(self) -> None:
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_report_pdf_keywords,
certs_to_process,
Expand All @@ -667,7 +732,7 @@

@staged(logger, "Extracting target keywords")
def _extract_target_keywords(self) -> None:
certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
certs_to_process = [x for x in self if x.state.st.is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_st_pdf_keywords,
certs_to_process,
Expand All @@ -676,9 +741,21 @@
)
self.update_with_certs(processed_certs)

@staged(logger, "Extracting cert keywords")
def _extract_cert_keywords(self) -> None:
    """
    Apply ``CCCertificate.extract_cert_pdf_keywords`` to every certificate for which
    ``state.cert.is_ok_to_analyze()`` holds and hand the results to ``update_with_certs``.
    """
    analyzable = [cert for cert in self if cert.state.cert.is_ok_to_analyze()]
    self.update_with_certs(
        cert_processing.process_parallel(
            CCCertificate.extract_cert_pdf_keywords,
            analyzable,
            use_threading=False,
            progress_bar_desc="Extracting cert keywords",
        )
    )

def _extract_pdf_keywords(self) -> None:
self._extract_report_keywords()
self._extract_target_keywords()
self._extract_cert_keywords()

def extract_data(self) -> None:
logger.info("Extracting various data from certification artifacts")
Expand All @@ -688,7 +765,7 @@

@staged(logger, "Computing heuristics: Deriving information about laboratories involved in certification.")
def _compute_cert_labs(self) -> None:
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
certs_to_process = [x for x in self if x.state.report.is_ok_to_analyze()]
for cert in certs_to_process:
cert.compute_heuristics_cert_lab()

Expand Down
4 changes: 2 additions & 2 deletions src/sec_certs/model/cc_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ def match(self, cert: CCCertificate) -> float:
if self._product == cert.name and self._vendor == cert.manufacturer:
return 99
# If we match the report hash, return early.
if cert.state.report_pdf_hash == self._report_hash and self._report_hash is not None:
if cert.state.report.pdf_hash == self._report_hash and self._report_hash is not None:
return 95
# If we match the target hash, return early.
if cert.state.st_pdf_hash == self._target_hash and self._target_hash is not None:
if cert.state.st.pdf_hash == self._target_hash and self._target_hash is not None:
return 93

# Fuzzy match at the end with some penalization.
Expand Down
8 changes: 4 additions & 4 deletions src/sec_certs/model/references_nlp/segment_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,9 @@
- Loads manually annotated samples
- Combines all of that into single dataframe
"""
target_certs = [x for x in certs if x.heuristics.st_references.directly_referencing and x.state.st_txt_path]
target_certs = [x for x in certs if x.heuristics.st_references.directly_referencing and x.state.st.txt_path]

Check warning on line 176 in src/sec_certs/model/references_nlp/segment_extractor.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/model/references_nlp/segment_extractor.py#L176

Added line #L176 was not covered by tests
report_certs = [
x for x in certs if x.heuristics.report_references.directly_referencing and x.state.report_txt_path
x for x in certs if x.heuristics.report_references.directly_referencing and x.state.report.txt_path
]
df_targets = self._build_df(target_certs, "target")
df_reports = self._build_df(report_certs, "report")
Expand Down Expand Up @@ -217,8 +217,8 @@
for key, val in actual_references.items()
]

(certs[0].state.report_txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)
(certs[0].state.st_txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)
(certs[0].state.report.txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)
(certs[0].state.st.txt_path.parent.parent / "txt_processed").mkdir(exist_ok=True, parents=True)

Check warning on line 221 in src/sec_certs/model/references_nlp/segment_extractor.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/model/references_nlp/segment_extractor.py#L220-L221

Added lines #L220 - L221 were not covered by tests
return list(itertools.chain.from_iterable(get_cert_records(cert, source) for cert in certs))

def _build_df(self, certs: list[CCCertificate], source: Literal["target", "report"]) -> pd.DataFrame:
Expand Down
Loading
Loading