Skip to content
Permalink
Browse files

Fix unicode vs utf8 problem with magesec ruleset (#203)

  • Loading branch information...
gwillem committed May 10, 2018
1 parent 5fc173c commit dc270812bc9a64cfc9394589809c108b07ee3094
Showing with 23 additions and 6 deletions.
  1. +1 −0 .gitignore
  2. +22 −6 mwscan/ruleset.py
@@ -1,3 +1,4 @@
.vscode/
*~
.idea/
.DS_Store
@@ -9,7 +9,8 @@
from requests.exceptions import RequestException
from mwscan import settings

# For very old installs, eg CentOS: https://github.com/magesec/magesec/issues/60
# For very old installs, eg CentOS:
# https://github.com/magesec/magesec/issues/60
try:
requests.packages.urllib3.disable_warnings()
except AttributeError:
@@ -36,14 +37,23 @@ def __init__(self, **kwargs):

def find_whitelist_in_rawrules(self, rawrules):
    """Extract the whitelist embedded in a comment of the raw rules.

    Whitelist hashes are read from a ``/* ... WHITELIST = {...} */``
    comment, because yara whitelist hashing is too slow. See
    https://github.com/VirusTotal/yara/issues/592

    :param rawrules: text of the rules file to search.
    :return: a set built from the decoded JSON object (i.e. its keys),
        or an empty set when no WHITELIST comment is present.
    :raises ValueError: if the captured text is not valid JSON.
    """
    # Raw string literal: the pattern is full of regex escapes (\*, \{, \s)
    # that are invalid *string* escapes and trigger a warning otherwise.
    m = re.search(
        r'/\*[^*]*WHITELIST = (\{.*?\})\s*\*/', rawrules, flags=re.DOTALL)
    if not m:
        return set()
    return set(json.loads(m.group(1)))

def get_rules(self):
    """Fetch the rules from ``self.rules_url`` and return them.

    The text comes from ``self._recursive_fetch``. On Python 2 a
    ``unicode`` result is converted to an ASCII byte string, silently
    dropping non-ASCII characters; on Python 3 the fetched value is
    returned unchanged.

    :return: the fetched rules (ASCII-encoded if they were py2 unicode).
    """
    rawrules = self._recursive_fetch(self.rules_url)

    # Only the name lookup is guarded, so an unrelated NameError raised
    # elsewhere is never swallowed.
    try:
        text_type = unicode  # noqa: F821 -- exists on py2 only
    except NameError:
        return rawrules  # py3: nothing to convert

    if isinstance(rawrules, text_type):
        return rawrules.encode('ascii', errors='ignore')
    return rawrules


def get_whitelist(self):
if not self.whitelist_url:
@@ -81,7 +91,7 @@ def _get_cache_timestamp_content(self, cachefile):
return mtime, cachedcontent

def _httpget(self, url):
""" Fetch URL and use if-modified-since header, store in cache,
""" Fetch URL and use if-modified-since header, store in cache,
fail if upstream fails """

filename = last_url_path(url)
@@ -106,10 +116,15 @@ def _httpget(self, url):
with open(cachefile, 'wb') as fh:
fh.write(resp.content)

return resp.content.decode()
# py3 vs py2
if type(resp.content) is bytes:
return resp.content.decode('utf-8', errors='ignore')
else:
return resp.content

if resp.status_code == 304:
logging.debug('Upstream {0} is the same as our cache (HTTP 304)'.format(url))
logging.debug(
'Upstream {0} is the same as our cache (HTTP 304)'.format(url))

# Upstream hasn't changed (304) or has err'd
if cachedcontent is not None:
@@ -151,6 +166,7 @@ def include(match):
class Files(RulesProvider):

# initialize with Files(args)

def get_rules(self):
path = self._args.rules
logging.info("Loading {0}".format(self._args.rules))

0 comments on commit dc27081

Please sign in to comment.
You can’t perform that action at this time.