Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Marketplace Contribution] Common Scripts - Content Pack Update #27790

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions Packs/CommonScripts/ReleaseNotes/1_11_90.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#### Scripts
##### TextFromHTML

- Added the optional ***html_tag*** argument, which specifies the HTML tag to extract the text from (defaults to *body*).
- Updated the Docker image to: *demisto/python3:3.10.12.63474*.
24 changes: 15 additions & 9 deletions Packs/CommonScripts/Scripts/TextFromHTML/TextFromHTML.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
import demistomock as demisto # noqa: F401
from CommonServerPython import * # noqa: F401


import re


# Named entities decoded to plain-text equivalents once the tags are stripped.
_ENTITIES = {'quot': '"', 'amp': '&', 'apos': "'", 'lt': '<', 'gt': '>', 'nbsp': ' ',
             'copy': '(C)', 'reg': '(R)', 'tilde': '~', 'ldquo': '"', 'rdquo': '"', 'hellip': '...'}
_ENTITY_RE = re.compile('&(' + '|'.join(map(re.escape, _ENTITIES)) + ');')


def get_plain_text(html_regex):
    """Strip the markup from a matched HTML fragment.

    Args:
        html_regex: The ``re.Match`` produced by searching for an element, or
            ``None`` when the search found nothing.

    Returns:
        The matched text with every ``<...>`` tag removed and the supported
        named entities decoded; an empty string when there is no match.
    """
    if not html_regex or not html_regex.group(0):
        return ''
    data = re.sub(r'<.*?>', '', html_regex.group(0))
    # Decode in a single pass so that an escaped entity such as "&amp;lt;"
    # becomes "&lt;" instead of being decoded a second time into "<".
    return _ENTITY_RE.sub(lambda match: _ENTITIES[match.group(1)], data)


def text_from_html(args):
    """Extract the plain text found inside one HTML element.

    Args:
        args: Script arguments. ``html`` (required) is the markup to process;
            ``html_tag`` (optional, defaults to ``body``) names the element
            whose content is extracted.

    Returns:
        The text between the first opening and the last closing occurrence of
        the tag with the markup removed, or ``'Could not extract text'`` when
        the tag is absent or holds no text.
    """
    html = args['html']
    # An empty or None value falls back to the default tag; escaping keeps any
    # regex metacharacters in the argument literal.
    html_tag = re.escape(args.get('html_tag') or 'body')

    # The lookahead stops a short tag name such as "p" from matching "<pre>".
    body = re.search(fr'<{html_tag}(?=[\s/>]).*/{html_tag}>', html, re.M | re.S | re.I | re.U)
    data = get_plain_text(body)
    return data or 'Could not extract text'


if __name__ in ["__builtin__", "builtins"]:
Expand Down
11 changes: 10 additions & 1 deletion Packs/CommonScripts/Scripts/TextFromHTML/TextFromHTML.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,18 @@ args:
required: true
default: true
description: The HTML to strip tags from
# Optional tag selector. Not flagged `default: true`: the `html` argument is
# already the script's default argument.
- name: html_tag
  required: false
  defaultValue: body
  description: Specify HTML tag to extract the text from within.
scripttarget: 0
runonce: false
fromversion: 5.0.0
dockerimage: demisto/python3:3.10.6.33415
dockerimage: demisto/python3:3.10.12.63474
tests:
- TextFromHTML_test_playbook
contentitemexportablefields:
contentitemfields:
fromServerVersion: ''
runas: DBotWeakRole
23 changes: 23 additions & 0 deletions Packs/CommonScripts/Scripts/TextFromHTML/TextFromHTML_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
'html': html
}
res = TextFromHTML.text_from_html(args)

Check failure on line 72 in Packs/CommonScripts/Scripts/TextFromHTML/TextFromHTML_test.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (RUF001)

Packs/CommonScripts/Scripts/TextFromHTML/TextFromHTML_test.py:72:25: RUF001 String contains ambiguous `ט` (HEBREW LETTER TET). Did you mean `v` (LATIN SMALL LETTER V)?
assert res == '\nמשפט בעברית לבדיקה\n'


Expand Down Expand Up @@ -155,3 +155,26 @@
res = TextFromHTML.text_from_html(args)

assert res == '\n\nHTML Links\nHTML links are defined with the a tag:\n\nThis is a link\n\n'


def test_extract_text_from_specific_tag():
    """
    Given
        - an HTML string made of a single <p> element
    When
        - calling text_from_html with html_tag set to 'p'
    Then
        - ensure only the text inside the element is returned, without the tags
    """
    import TextFromHTML

    paragraph = """<p>HTML links are defined with the a tag:</p>"""
    extracted = TextFromHTML.text_from_html({'html': paragraph, 'html_tag': 'p'})

    assert extracted == 'HTML links are defined with the a tag:'
2 changes: 1 addition & 1 deletion Packs/CommonScripts/pack_metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "Common Scripts",
"description": "Frequently used scripts pack.",
"support": "xsoar",
"currentVersion": "1.11.89",
"currentVersion": "1.11.90",
"author": "Cortex XSOAR",
"url": "https://www.paloaltonetworks.com/cortex",
"email": "",
Expand Down