Skip to content

Commit

Permalink
Read pdf hash feature (#32387)
Browse files Browse the repository at this point in the history
* Started to implement

* Extracted hashes

* Added handling of hashes

* Added RNs

* Updated yml and readme

* Ran pre-commit

* Updated docker image

* Fixed comments from CR

* Enhanced documentation

* Added to secrets ignore
  • Loading branch information
anas-yousef committed Jan 25, 2024
1 parent faf6725 commit e332fb2
Show file tree
Hide file tree
Showing 9 changed files with 146 additions and 20 deletions.
4 changes: 3 additions & 1 deletion Packs/CommonScripts/.secrets-ignore
Expand Up @@ -116,4 +116,6 @@ testing@gmail.com
a853a1b0-1ffe-4e37-d9a9-a27c6bc0bd5b@gmail.com
cipher=ECDHE-RSA-AES128-GCM-SHA256
07.23.05.38
multipart/signed
multipart/signed
123.123.123.123
107.66.225.91
7 changes: 7 additions & 0 deletions Packs/CommonScripts/ReleaseNotes/1_13_24.md
@@ -0,0 +1,7 @@

#### Scripts

##### ReadPDFFileV2

- Added support for extracting hashes.
- Updated the Docker image to: *demisto/readpdf:1.0.0.85832*.
4 changes: 3 additions & 1 deletion Packs/CommonScripts/Scripts/ReadPDFFileV2/README.md
@@ -1,4 +1,4 @@
Loads a PDF file's content and metadata into context.
Loads a PDF file's content and metadata into context. Supports extraction of hashes, URLs, and emails when available.


## Script Data
Expand Down Expand Up @@ -55,3 +55,5 @@ Loads a PDF file's content and metadata into context.
| File.UserProperties | Indicates the presence of the structure elements that contain user properties attributes. | String |
| File.Extension | The file's extension. | String |
| Account.Email | The email address of the account. | String |
| Hashes.type | The hash type extracted from the PDF file. | String |
| Hashes.value | The hash value extracted from the PDF file. | String |
88 changes: 73 additions & 15 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/ReadPDFFileV2.py
Expand Up @@ -9,7 +9,6 @@
import re
import shutil
import json
from typing import List, Set, Tuple
from pikepdf import Pdf, PasswordError
import contextlib
import io
Expand All @@ -27,7 +26,6 @@ class PdfPermissionsException(Exception):
Every exception class that is in charge of catching errors that occur when trying to
extract data from the PDF must inherit this class
"""
pass


class PdfCopyingProtectedException(PdfPermissionsException):
Expand All @@ -36,15 +34,13 @@ class PdfCopyingProtectedException(PdfPermissionsException):
a `copy-protected` file (Copy-protected files are files that prevent us from copy its content)
This is relevant since we run a command that copies the content of the pdf file into a text file.
"""
pass


class PdfInvalidCredentialsException(PdfPermissionsException):
"""
This class is in charge of catching errors that occur when we try to decrypt an encrypted
pdf file with the wrong password.
"""
pass


# Error class for shell errors
Expand Down Expand Up @@ -120,7 +116,7 @@ def run_shell_command(command: str, *args) -> bytes:
"""Runs shell command and returns the result if not encountered an error"""
cmd = [command] + list(args)
demisto.debug(f'Running the shell command {cmd=}')
completed_process = subprocess.run(
completed_process = subprocess.run( # noqa: UP022
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
exit_codes = completed_process.returncode
Expand Down Expand Up @@ -159,9 +155,9 @@ def get_files_names_in_path(path: str, name_of_file: str, full_path: bool = Fals
return res


def get_images_paths_in_path(path: str) -> List[str]:
def get_images_paths_in_path(path: str) -> list[str]:
"""Gets images paths from path"""
res: List[str] = []
res: list[str] = []
for img_type in IMG_FORMATS:
img_format = f"*.{img_type}"
res.extend(get_files_names_in_path(path, img_format, True))
Expand Down Expand Up @@ -250,14 +246,15 @@ def get_pdf_htmls_content(pdf_path: str, output_folder: str) -> str:


def build_readpdf_entry_object(entry_id: str, metadata: dict, text: str, urls: list, emails: list, images: list[str],
max_images: int) -> list[dict[str, Any]]:
max_images: int,
hash_contexts: list[dict[str, Any]] | None = None) -> list[dict[str, Any]]:
"""Builds an entry object for the main script flow"""
pdf_file = {"EntryID": entry_id}
# Add Text to file entity
pdf_file["Text"] = text

# Add Metadata to file entity
for k in metadata.keys():
for k in metadata:
pdf_file[k] = metadata[k]

md = "### Metadata\n"
Expand Down Expand Up @@ -306,6 +303,8 @@ def build_readpdf_entry_object(entry_id: str, metadata: dict, text: str, urls: l
indicators_map = json.loads(indicators_map)
if emails:
indicators_map["Email"] = emails
if hash_contexts:
indicators_map['Hashes'] = hash_contexts
except json.JSONDecodeError:
pass
ec = build_readpdf_entry_context(indicators_map)
Expand Down Expand Up @@ -334,6 +333,8 @@ def build_readpdf_entry_context(indicators_map: Any) -> dict:
for email in indicators_map["Email"]:
ec_email.append({"Email": email})
ec["Account"] = ec_email
if 'Hashes' in indicators_map:
ec['Hashes'] = indicators_map['Hashes']
return ec


Expand All @@ -351,7 +352,7 @@ def get_urls_from_binary_file(file_path: str) -> set:
return binary_file_urls


def get_urls_and_emails_from_pdf_html_content(cpy_file_path: str, output_folder: str) -> Tuple[set, set]:
def get_urls_and_emails_from_pdf_html_content(cpy_file_path: str, output_folder: str) -> tuple[set, set]:
"""
Extract the URLs and emails from the pdf html content.
Expand Down Expand Up @@ -386,6 +387,8 @@ def extract_url_from_annot_object(annot_object: Any):
if isinstance(url, PyPDF2.generic.IndirectObject):
url = url.get_object()
return url
return None
return None


def extract_url(extracted_object: Any):
Expand Down Expand Up @@ -461,16 +464,16 @@ def extract_urls_and_emails_from_annot_objects(annot_objects: list | Any):
return urls, emails


def get_urls_and_emails_from_pdf_annots(file_path: str) -> Tuple[set, set]:
def get_urls_and_emails_from_pdf_annots(file_path: str) -> tuple[set, set]:
"""
Extracts the URLs and Emails from the pdf's Annots (Annotations and Commenting) using PyPDF2 package.
Args:
file_path (str): The path of the PDF file.
Returns:
Tuple[set, set]: A set includes the URLs that were found, A set includes the Emails that were found.
"""
all_urls: Set[str] = set()
all_emails: Set[str] = set()
all_urls: set[str] = set()
all_emails: set[str] = set()
output_capture = io.StringIO()
with open(file_path, 'rb') as pdf_file:
# The following context manager was added so we could redirect error messages to the server logs since
Expand Down Expand Up @@ -510,7 +513,7 @@ def get_urls_and_emails_from_pdf_annots(file_path: str) -> Tuple[set, set]:
return all_urls, all_emails


def extract_urls_and_emails_from_pdf_file(file_path: str, output_folder: str) -> Tuple[list, list]:
def extract_urls_and_emails_from_pdf_file(file_path: str, output_folder: str) -> tuple[list, list]:
"""
Extract URLs and Emails from the PDF file.
Args:
Expand Down Expand Up @@ -545,6 +548,57 @@ def extract_urls_and_emails_from_pdf_file(file_path: str, output_folder: str) ->
return urls_ec, emails_ec


def extract_hash_contexts_from_pdf_file(file_text: str) -> list[dict[str, Any]]:
    """Extract hashes from the file's text and convert them into hash context entries.

    Args:
        file_text (str): The text extracted from the PDF.

    Returns:
        list[dict[str, Any]]: A list of hash contexts ({'type': ..., 'value': ...} dicts).
    """
    contexts: list[dict[str, Any]] = []
    # get_hashes_from_file maps each hash type to the set of hashes found for it;
    # skip types for which nothing was found.
    for hash_type, found_hashes in get_hashes_from_file(file_text).items():
        if not found_hashes:
            continue
        contexts.extend(convert_hash_to_context(hash_type, found_hashes))
    return contexts


def convert_hash_to_context(hash_type: str, hashes: set[Any]) -> list[dict[str, Any]]:
    """Converts the given hashes to hash context entries.

    Args:
        hash_type (str): The hash type of the given hashes (e.g. 'SHA1', 'MD5').
        hashes (set[Any]): The set of hashes of that type.

    Returns:
        list[dict[str, Any]]: A list of hash contexts ({'type': ..., 'value': ...}),
        all sharing the same hash type.
    """
    # Note: `hash_value` (not `hash`) to avoid shadowing the `hash` builtin.
    return [{'type': hash_type, 'value': hash_value} for hash_value in hashes]


def get_hashes_from_file(file_text: str) -> dict[str, set[Any]]:
    """Extract all the hashes found in the file's text.

    Args:
        file_text (str): The file's text.

    Returns:
        dict[str, set[Any]]: A dictionary keyed by hash type, where each key holds
        the set of hashes of that type found in the text.
    """
    demisto.debug('Extracting hashes from file')
    # The regex globals (sha1Regex, sha256Regex, ...) come from CommonServerPython.
    type_to_pattern = {
        'SHA1': sha1Regex,
        'SHA256': sha256Regex,
        'SHA512': sha512Regex,
        'MD5': md5Regex,
    }
    return {hash_type: set(re.findall(pattern, file_text))
            for hash_type, pattern in type_to_pattern.items()}


def handling_pdf_credentials(cpy_file_path: str, dec_file_path: str, encrypted: str = '',
user_password: str = '') -> str:
"""
Expand Down Expand Up @@ -577,6 +631,9 @@ def extract_data_from_pdf(path: str, user_password: str, entry_id: str, max_imag
pdf_text_output_path = f"{working_dir}/PDFText.txt"
text = get_pdf_text(cpy_file_path, pdf_text_output_path)

# Get hash contexts
hash_contexts = extract_hash_contexts_from_pdf_file(text)

# Get URLS + emails:
urls_ec, emails_ec = extract_urls_and_emails_from_pdf_file(cpy_file_path, working_dir)

Expand All @@ -588,7 +645,8 @@ def extract_data_from_pdf(path: str, user_password: str, entry_id: str, max_imag
urls_ec,
emails_ec,
images,
max_images=max_images)
max_images=max_images,
hash_contexts=hash_contexts)

return_results(readpdf_entry_object)
else:
Expand Down
10 changes: 8 additions & 2 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/ReadPDFFileV2.yml
Expand Up @@ -9,13 +9,19 @@ args:
description: Maximum number of images to extract from the PDF file.
name: maxImages
comment: |-
Load a PDF file's content and metadata into context.
Load a PDF file's content and metadata into context. Supports extraction of hashes, URLs, and emails when available.
commonfields:
id: ReadPDFFileV2
version: -1
enabled: true
name: ReadPDFFileV2
outputs:
- contextPath: Hashes.type
description: The hash type extracted from the PDF file.
type: String
- contextPath: Hashes.value
description: The hash value extracted from the PDF file.
type: String
- contextPath: URL.Data
description: A list of URLs that were extracted from the PDF file.
type: String
Expand Down Expand Up @@ -116,7 +122,7 @@ tags:
- ingestion
timeout: "0"
type: python
dockerimage: demisto/readpdf:1.0.0.83506
dockerimage: demisto/readpdf:1.0.0.85832
runas: DBotRole
tests:
- Extract Indicators From File - Generic v2 - Test
Expand Down
40 changes: 40 additions & 0 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/ReadPDFFileV2_test.py
Expand Up @@ -13,6 +13,46 @@ def open_html_file(file):
return f.read()


def test_extract_hash_contexts():
    """
    Given
        - A PDF file that contains hashes.
    When
        - Trying to extract the hashes from the file.
    Then
        - Validate that the hashes were extracted successfully.
    """
    from ReadPDFFileV2 import extract_hash_contexts_from_pdf_file, get_pdf_text

    expected_hash_contexts = [
        {'type': 'SHA1', 'value': 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'},
        {'type': 'SHA256', 'value': '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'},
        {'type': 'SHA256', 'value': '8732331accf45f86a00ca823cb24d9806ec1380846a337ac86b4fe6f9d06f1f5'},
        {'type': 'MD5', 'value': '5d41402abc4b2a76b9719d911017c592'},
    ]
    # Extract the file's text first, then pull the hashes out of that text.
    text_output_path = f"{CWD}/PDFText.txt"
    extracted_text = get_pdf_text(f'{CWD}/pdf-with-hashes.pdf', text_output_path)
    actual_hash_contexts = extract_hash_contexts_from_pdf_file(extracted_text)
    # Order is not guaranteed (hashes come from sets), so compare as an unordered collection.
    assert len(actual_hash_contexts) == len(expected_hash_contexts)
    for actual_context in actual_hash_contexts:
        assert actual_context in expected_hash_contexts


def test_hash_contexts_in_return_results():
    """
    Given
        - A hash context to add to the entry context.
    When
        - Building the entry context.
    Then
        - Validate that the hash context was added.
    """
    from ReadPDFFileV2 import build_readpdf_entry_context

    indicators_map = {
        'Hashes': [
            {'type': 'SHA1', 'value': 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'},
            {'type': 'MD5', 'value': '5d41402abc4b2a76b9719d911017c592'},
        ]
    }
    # The Hashes entry should be carried into the entry context unchanged.
    assert build_readpdf_entry_context(indicators_map) == indicators_map


def test_urls_are_found_correctly(mocker):
"""
Given
Expand Down
11 changes: 11 additions & 0 deletions Packs/CommonScripts/Scripts/ReadPDFFileV2/test_data/PDFText.txt
@@ -0,0 +1,11 @@
Dummy PDF
SHA256: 2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824
MD5: 5d41402abc4b2a76b9719d911017c592
SHA1: aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d
SHA1: aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d
SHA256: 8732331accf45f86a00ca823cb24d9806ec1380846a337ac86b4fe6f9d06f1f5
some@example.com
IPs: 123.123.123.123, 107.66.225.91
URLs: google.com, https://example.com/


Binary file not shown.
2 changes: 1 addition & 1 deletion Packs/CommonScripts/pack_metadata.json
Expand Up @@ -2,7 +2,7 @@
"name": "Common Scripts",
"description": "Frequently used scripts pack.",
"support": "xsoar",
"currentVersion": "1.13.23",
"currentVersion": "1.13.24",
"author": "Cortex XSOAR",
"url": "https://www.paloaltonetworks.com/cortex",
"email": "",
Expand Down

0 comments on commit e332fb2

Please sign in to comment.