Skip to content

Commit

Permalink
Read pdf hash feature (#32387)
Browse files Browse the repository at this point in the history
* Started to implement

* Extracted hashes

* Added handling of hashes

* Added RNs

* Updated yml and readme

* Ran pre-commit

* Updated docker image

* Fixed comments from CR

* Enhanced documentation

* Added to secrets ignore
  • Loading branch information
anas-yousef committed Jan 25, 2024
1 parent faf6725 commit e332fb2
Show file tree
Hide file tree
Showing 9 changed files with 146 additions and 20 deletions.
4 changes: 3 additions & 1 deletion Packs/CommonScripts/.secrets-ignore
Expand Up @@ -116,4 +116,6 @@ testing@gmail.com
a853a1b0-1ffe-4e37-d9a9-a27c6bc0bd5b@gmail.com
cipher=ECDHE-RSA-AES128-GCM-SHA256
07.23.05.38
multipart/signed
multipart/signed
123.123.123.123
107.66.225.91
7 changes: 7 additions & 0 deletions Packs/CommonScripts/ReleaseNotes/1_13_24.md
@@ -0,0 +1,7 @@

#### Scripts

##### ReadPDFFileV2

- Added support for extracting hashes.
- Updated the Docker image to: *demisto/readpdf:1.0.0.85832*.
4 changes: 3 additions & 1 deletion Packs/CommonScripts/Scripts/ReadPDFFileV2/README.md
@@ -1,4 +1,4 @@
Loads a PDF file's content and metadata into context.
Loads a PDF file's content and metadata into context. Supports extraction of hashes, URLs, and emails when available.


## Script Data
Expand Down Expand Up @@ -55,3 +55,5 @@ Loads a PDF file's content and metadata into context.
| File.UserProperties | Indicates the presence of the structure elements that contain user properties attributes. | String |
| File.Extension | The file's extension. | String |
| Account.Email | The email address of the account. | String |
| Hashes.type | The hash type extracted from the PDF file. | String |
| Hashes.value | The hash value extracted from the PDF file. | String |
88 changes: 73 additions & 15 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/ReadPDFFileV2.py
Expand Up @@ -9,7 +9,6 @@
import re
import shutil
import json
from typing import List, Set, Tuple
from pikepdf import Pdf, PasswordError
import contextlib
import io
Expand All @@ -27,7 +26,6 @@ class PdfPermissionsException(Exception):
Every exception class that is in charge of catching errors that occur when trying to
extract data from the PDF must inherit this class
"""
pass


class PdfCopyingProtectedException(PdfPermissionsException):
Expand All @@ -36,15 +34,13 @@ class PdfCopyingProtectedException(PdfPermissionsException):
a `copy-protected` file (Copy-protected files are files that prevent us from copy its content)
This is relevant since we run a command that copies the content of the pdf file into a text file.
"""
pass


class PdfInvalidCredentialsException(PdfPermissionsException):
"""
This class is in charge of catching errors that occur when we try to decrypt an encrypted
pdf file with the wrong password.
"""
pass


# Error class for shell errors
Expand Down Expand Up @@ -120,7 +116,7 @@ def run_shell_command(command: str, *args) -> bytes:
"""Runs shell command and returns the result if not encountered an error"""
cmd = [command] + list(args)
demisto.debug(f'Running the shell command {cmd=}')
completed_process = subprocess.run(
completed_process = subprocess.run( # noqa: UP022
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
exit_codes = completed_process.returncode
Expand Down Expand Up @@ -159,9 +155,9 @@ def get_files_names_in_path(path: str, name_of_file: str, full_path: bool = Fals
return res


def get_images_paths_in_path(path: str) -> List[str]:
def get_images_paths_in_path(path: str) -> list[str]:
"""Gets images paths from path"""
res: List[str] = []
res: list[str] = []
for img_type in IMG_FORMATS:
img_format = f"*.{img_type}"
res.extend(get_files_names_in_path(path, img_format, True))
Expand Down Expand Up @@ -250,14 +246,15 @@ def get_pdf_htmls_content(pdf_path: str, output_folder: str) -> str:


def build_readpdf_entry_object(entry_id: str, metadata: dict, text: str, urls: list, emails: list, images: list[str],
max_images: int) -> list[dict[str, Any]]:
max_images: int,
hash_contexts: list[dict[str, Any]] | None = None) -> list[dict[str, Any]]:
"""Builds an entry object for the main script flow"""
pdf_file = {"EntryID": entry_id}
# Add Text to file entity
pdf_file["Text"] = text

# Add Metadata to file entity
for k in metadata.keys():
for k in metadata:
pdf_file[k] = metadata[k]

md = "### Metadata\n"
Expand Down Expand Up @@ -306,6 +303,8 @@ def build_readpdf_entry_object(entry_id: str, metadata: dict, text: str, urls: l
indicators_map = json.loads(indicators_map)
if emails:
indicators_map["Email"] = emails
if hash_contexts:
indicators_map['Hashes'] = hash_contexts
except json.JSONDecodeError:
pass
ec = build_readpdf_entry_context(indicators_map)
Expand Down Expand Up @@ -334,6 +333,8 @@ def build_readpdf_entry_context(indicators_map: Any) -> dict:
for email in indicators_map["Email"]:
ec_email.append({"Email": email})
ec["Account"] = ec_email
if 'Hashes' in indicators_map:
ec['Hashes'] = indicators_map['Hashes']
return ec


Expand All @@ -351,7 +352,7 @@ def get_urls_from_binary_file(file_path: str) -> set:
return binary_file_urls


def get_urls_and_emails_from_pdf_html_content(cpy_file_path: str, output_folder: str) -> Tuple[set, set]:
def get_urls_and_emails_from_pdf_html_content(cpy_file_path: str, output_folder: str) -> tuple[set, set]:
"""
Extract the URLs and emails from the pdf html content.
Expand Down Expand Up @@ -386,6 +387,8 @@ def extract_url_from_annot_object(annot_object: Any):
if isinstance(url, PyPDF2.generic.IndirectObject):
url = url.get_object()
return url
return None
return None


def extract_url(extracted_object: Any):
Expand Down Expand Up @@ -461,16 +464,16 @@ def extract_urls_and_emails_from_annot_objects(annot_objects: list | Any):
return urls, emails


def get_urls_and_emails_from_pdf_annots(file_path: str) -> Tuple[set, set]:
def get_urls_and_emails_from_pdf_annots(file_path: str) -> tuple[set, set]:
"""
Extracts the URLs and Emails from the pdf's Annots (Annotations and Commenting) using PyPDF2 package.
Args:
file_path (str): The path of the PDF file.
Returns:
Tuple[set, set]: A set includes the URLs that were found, A set includes the Emails that were found.
"""
all_urls: Set[str] = set()
all_emails: Set[str] = set()
all_urls: set[str] = set()
all_emails: set[str] = set()
output_capture = io.StringIO()
with open(file_path, 'rb') as pdf_file:
# The following context manager was added so we could redirect error messages to the server logs since
Expand Down Expand Up @@ -510,7 +513,7 @@ def get_urls_and_emails_from_pdf_annots(file_path: str) -> Tuple[set, set]:
return all_urls, all_emails


def extract_urls_and_emails_from_pdf_file(file_path: str, output_folder: str) -> Tuple[list, list]:
def extract_urls_and_emails_from_pdf_file(file_path: str, output_folder: str) -> tuple[list, list]:
"""
Extract URLs and Emails from the PDF file.
Args:
Expand Down Expand Up @@ -545,6 +548,57 @@ def extract_urls_and_emails_from_pdf_file(file_path: str, output_folder: str) ->
return urls_ec, emails_ec


def extract_hash_contexts_from_pdf_file(file_text: str) -> list[dict[str, Any]]:
    """Extract hashes from the file's text and convert them into hash context entries.

    Args:
        file_text (str): The text extracted from the PDF.

    Returns:
        list[dict[str, Any]]: A list of hash contexts ({'type': ..., 'value': ...} dicts).
    """
    contexts: list[dict[str, Any]] = []
    # get_hashes_from_file maps each hash type to the set of hashes found for it;
    # skip types for which nothing was found.
    for hash_type, found_hashes in get_hashes_from_file(file_text).items():
        if not found_hashes:
            continue
        contexts.extend(convert_hash_to_context(hash_type, found_hashes))
    return contexts


def convert_hash_to_context(hash_type: str, hashes: set[Any]) -> list[dict[str, Any]]:
    """Converts the given hashes to hash context entries.

    Args:
        hash_type (str): The hash type of the given hashes (e.g. 'SHA1', 'MD5').
        hashes (set[Any]): The set of hashes of that type.

    Returns:
        list[dict[str, Any]]: A list of hash contexts ({'type': ..., 'value': ...}),
        all sharing the same hash type.
    """
    # Note: `hash_value` (not `hash`) to avoid shadowing the `hash` builtin.
    return [{'type': hash_type, 'value': hash_value} for hash_value in hashes]


def get_hashes_from_file(file_text: str) -> dict[str, set[Any]]:
    """Extract all the hashes found in the file's text.

    Args:
        file_text (str): The file's text.

    Returns:
        dict[str, set[Any]]: A dictionary keyed by hash type, where each key holds
        the set of hashes of that type found in the text.
    """
    demisto.debug('Extracting hashes from file')
    # The regex globals (sha1Regex, sha256Regex, ...) come from CommonServerPython.
    type_to_pattern = {
        'SHA1': sha1Regex,
        'SHA256': sha256Regex,
        'SHA512': sha512Regex,
        'MD5': md5Regex,
    }
    return {hash_type: set(re.findall(pattern, file_text))
            for hash_type, pattern in type_to_pattern.items()}


def handling_pdf_credentials(cpy_file_path: str, dec_file_path: str, encrypted: str = '',
user_password: str = '') -> str:
"""
Expand Down Expand Up @@ -577,6 +631,9 @@ def extract_data_from_pdf(path: str, user_password: str, entry_id: str, max_imag
pdf_text_output_path = f"{working_dir}/PDFText.txt"
text = get_pdf_text(cpy_file_path, pdf_text_output_path)

# Get hash contexts
hash_contexts = extract_hash_contexts_from_pdf_file(text)

# Get URLS + emails:
urls_ec, emails_ec = extract_urls_and_emails_from_pdf_file(cpy_file_path, working_dir)

Expand All @@ -588,7 +645,8 @@ def extract_data_from_pdf(path: str, user_password: str, entry_id: str, max_imag
urls_ec,
emails_ec,
images,
max_images=max_images)
max_images=max_images,
hash_contexts=hash_contexts)

return_results(readpdf_entry_object)
else:
Expand Down
10 changes: 8 additions & 2 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/ReadPDFFileV2.yml
Expand Up @@ -9,13 +9,19 @@ args:
description: Maximum number of images to extract from the PDF file.
name: maxImages
comment: |-
Load a PDF file's content and metadata into context.
Load a PDF file's content and metadata into context. Supports extraction of hashes, URLs, and emails when available.
commonfields:
id: ReadPDFFileV2
version: -1
enabled: true
name: ReadPDFFileV2
outputs:
- contextPath: Hashes.type
description: The hash type extracted from the PDF file.
type: String
- contextPath: Hashes.value
description: The hash value extracted from the PDF file.
type: String
- contextPath: URL.Data
description: A list of URLs that were extracted from the PDF file.
type: String
Expand Down Expand Up @@ -116,7 +122,7 @@ tags:
- ingestion
timeout: "0"
type: python
dockerimage: demisto/readpdf:1.0.0.83506
dockerimage: demisto/readpdf:1.0.0.85832
runas: DBotRole
tests:
- Extract Indicators From File - Generic v2 - Test
Expand Down
40 changes: 40 additions & 0 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/ReadPDFFileV2_test.py
Expand Up @@ -13,6 +13,46 @@ def open_html_file(file):
return f.read()


def test_extract_hash_contexts():
    """
    Given
        - A PDF file that contains hashes.
    When
        - Trying to extract the hashes from the file.
    Then
        - Validate that the hashes were extracted successfully.
    """
    from ReadPDFFileV2 import extract_hash_contexts_from_pdf_file, get_pdf_text

    expected_hash_contexts = [
        {'type': 'SHA1', 'value': 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'},
        {'type': 'SHA256', 'value': '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'},
        {'type': 'SHA256', 'value': '8732331accf45f86a00ca823cb24d9806ec1380846a337ac86b4fe6f9d06f1f5'},
        {'type': 'MD5', 'value': '5d41402abc4b2a76b9719d911017c592'},
    ]
    # Extract the file's text first, then pull the hashes out of that text.
    text_output_path = f"{CWD}/PDFText.txt"
    extracted_text = get_pdf_text(f'{CWD}/pdf-with-hashes.pdf', text_output_path)
    actual_hash_contexts = extract_hash_contexts_from_pdf_file(extracted_text)
    # Order is not guaranteed (hashes come from sets), so compare as an unordered collection.
    assert len(actual_hash_contexts) == len(expected_hash_contexts)
    for actual_context in actual_hash_contexts:
        assert actual_context in expected_hash_contexts


def test_hash_contexts_in_return_results():
    """
    Given
        - A hash context to add to the entry context.
    When
        - Building the entry context.
    Then
        - Validate that the hash context was added.
    """
    from ReadPDFFileV2 import build_readpdf_entry_context

    indicators_map = {
        'Hashes': [
            {'type': 'SHA1', 'value': 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'},
            {'type': 'MD5', 'value': '5d41402abc4b2a76b9719d911017c592'},
        ]
    }
    # The Hashes entry should be carried into the entry context unchanged.
    assert build_readpdf_entry_context(indicators_map) == indicators_map


def test_urls_are_found_correctly(mocker):
"""
Given
Expand Down
11 changes: 11 additions & 0 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/test_data/PDFText.txt
@@ -0,0 +1,11 @@
Dummy PDF
SHA256: 2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824
MD5: 5d41402abc4b2a76b9719d911017c592
SHA1: aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d
SHA1: aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d
SHA256: 8732331accf45f86a00ca823cb24d9806ec1380846a337ac86b4fe6f9d06f1f5
some@example.com
IPs: 123.123.123.123, 107.66.225.91
URLs: google.com, https://example.com/


Binary file not shown.
2 changes: 1 addition & 1 deletion Packs/CommonScripts/pack_metadata.json
Expand Up @@ -2,7 +2,7 @@
"name": "Common Scripts",
"description": "Frequently used scripts pack.",
"support": "xsoar",
"currentVersion": "1.13.23",
"currentVersion": "1.13.24",
"author": "Cortex XSOAR",
"url": "https://www.paloaltonetworks.com/cortex",
"email": "",
Expand Down

0 comments on commit e332fb2

Please sign in to comment.