Skip to content

Commit

Permalink
Merge branch 'truls-master' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
firecat53 committed Oct 25, 2018
2 parents d5f4ef4 + 242ff86 commit 4e9770e
Showing 1 changed file with 31 additions and 10 deletions.
41 changes: 31 additions & 10 deletions urlscan/urlscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,18 @@ def handle_startendtag(self, tag, attrs):

def handle_endtag(self, tag):
if tag == 'a':
del self.anchor_stack[-1]
if len(self.anchor_stack) > 1:
del self.anchor_stack[-1]
elif tag in HTMLChunker.tag_styles:
del self.style_stack[-1]
if len(self.style_stack) > 1:
del self.style_stack[-1]
elif tag in ('ul', 'ol'):
del self.list_stack[-1]
if len(self.list_stack) > 0:
del self.list_stack[-1]
self.end_para()
elif isheadertag(tag):
del self.style_stack[-1]
if len(self.style_stack) > 1:
del self.style_stack[-1]
self.end_para()
elif tag in ('style', 'script'):
self.in_style_or_script = False
Expand Down Expand Up @@ -456,6 +460,23 @@ def decode_bytes(byt, enc='utf-8'):
return strg


def decode_msg(msg, enc='utf-8'):
    """
    Return the payload of a single message fragment as decoded text.
        Args: msg - A Message object representing the fragment
              enc - The encoding handed to decode_bytes for the payload
        Returns: whatever decode_bytes produces for the extracted payload
    """
    # For the raw transfer encodings (7bit, 8bit, binary) the payload is
    # requested undecoded: the get_payload decoding machinery is reported to
    # encode such content using raw-unicode-escape, which seems to prevent
    # the subsequent utf-8 decoding. Any other value -- including a missing
    # header, which is read as '' -- goes through get_payload's own decoder.
    raw_encodings = ("8bit", "7bit", "binary")
    transfer_encoding = str(msg.get('content-transfer-encoding', '')).lower()
    if transfer_encoding in raw_encodings:
        payload = msg.get_payload(decode=False)
    else:
        payload = msg.get_payload(decode=True)
    return decode_bytes(payload, enc)


def msgurls(msg, urlidx=1):
"""Main entry function for urlscan.py
Expand All @@ -470,13 +491,13 @@ def msgurls(msg, urlidx=1):
for chunk in msgurls(part, urlidx):
urlidx += 1
yield chunk
elif msg.get_content_type() == 'text/plain':
msg = decode_bytes(msg.get_payload(decode=True), enc)
for chunk in extracturls(msg):
elif msg.get_content_type() == "text/plain":
decoded = decode_msg(msg, enc)
for chunk in extracturls(decoded):
urlidx += 1
yield chunk
elif msg.get_content_type() == 'text/html':
msg = decode_bytes(msg.get_payload(decode=True), enc)
for chunk in extracthtmlurls(msg):
elif msg.get_content_type() == "text/html":
decoded = decode_msg(msg, enc)
for chunk in extracthtmlurls(decoded):
urlidx += 1
yield chunk

0 comments on commit 4e9770e

Please sign in to comment.