Skip to content
Browse files

Implemented small-element counting

  • Loading branch information...
1 parent 2398d8c commit e47d98965c2519cc0eb17e9e0584ed46b597c980 @eromba eromba committed Nov 25, 2012
Showing with 44 additions and 2 deletions.
  1. +43 −2 htmlanalyzer.py
  2. +1 −0 tests.py
View
45 htmlanalyzer.py
@@ -13,6 +13,17 @@ class HTMLAnalyzer:
# Matches inline styles that hide an element (display:none / visibility:hidden).
findHiddenStyle = re.compile(r'(?:display\s*:\s*none)|(?:visibility\s*:\s*hidden)')

# These regular expressions match the values of the width and height
# CSS rules when specified in "px" or without a unit.
# The (?<![\w-]) lookbehind keeps the property name from matching inside
# longer property names such as "border-width", "max-width" or "line-height".
# Group 1 captures only a well-formed (optionally negative) decimal number,
# so the captured text can always be converted with int() or float().
findCssPropValue = {
    'width': re.compile(r'(?<![\w-])width\s*:\s*(-?(?:\d+(?:\.\d*)?|\.\d+))(?:px)?\s*(?:;|$)'),
    'height': re.compile(r'(?<![\w-])height\s*:\s*(-?(?:\d+(?:\.\d*)?|\.\d+))(?:px)?\s*(?:;|$)'),
}

# Matches the first well-formed (optionally negative) decimal number in a string.
findNumber = re.compile(r'(-?(?:\d+(?:\.\d*)?|\.\d+))')

# Elements whose explicit width or height is at most this value count as "small".
smallElementThreshold = 5
+
def __init__(self, html):
self.load(html)
@@ -36,7 +47,8 @@ def analyze(self):
'numObjects': self.countElems('object'),
'numHyperlinks': self.countElems('a'),
'numMetaRefresh': self.countElems('meta', self.isRefresh),
- 'numHiddenElements': self.countElems('*', self.isHidden)
+ 'numHiddenElements': self.countElems('*', self.isHidden),
+ 'numSmallElements': self.countElems('*', self.isSmall)
}
##
@@ -51,7 +63,7 @@ def countElems(self, tagName, f = None):
##
# Returns true if the PyQuery element (this)
- # has an "http-quiv" attribute with a value of "refresh"
+ # has an "http-equiv" attribute with a value of "refresh"
##
def isRefresh(self):
httpEquiv = PyQuery(this).attr['http-equiv']
@@ -72,3 +84,32 @@ def hasWrongExtension(self):
def isHidden(self):
    """Report whether the current element's inline style hides it.

    Reads the "style" attribute of PyQuery(this) and searches it with
    findHiddenStyle. A missing or empty style is returned unchanged (a
    falsy value); otherwise the result is True or False.

    NOTE(review): `this` is not defined in the visible code; presumably
    PyQuery supplies it when this method is used as a filter callback --
    confirm against countElems.
    """
    inlineStyle = PyQuery(this).attr['style']
    if not inlineStyle:
        # Hand the falsy value back as-is, exactly like `style and ...` would.
        return inlineStyle
    return self.findHiddenStyle.search(inlineStyle) is not None
+
+ ##
+ # Returns the width or height of the given PyQuery element
+ # (when set explicitly via the style, width, or height attributes)
+ ##
def getDimension(self, elem, dim):
    """Return the explicitly declared width or height of an element.

    elem -- object whose ``attr`` mapping yields the "style" attribute and
            the attribute named by ``dim`` (a string, or a falsy value when
            the attribute is absent).
    dim  -- 'width' or 'height'; used both as the key into findCssPropValue
            and as the HTML attribute name.

    Returns an int, or a float when the matched text contains a '.', or
    None when neither source yields a usable number.
    """
    # Try the CSS property first, since it takes precedence over the
    # width and height HTML attributes.
    candidates = (
        (elem.attr['style'], self.findCssPropValue[dim]),
        (elem.attr[dim], self.findNumber),
    )
    for text, pattern in candidates:
        if not text:
            continue
        match = pattern.search(text)
        if not match:
            continue
        value = match.group(1)
        try:
            # Cast to float or int appropriately
            return float(value) if '.' in value else int(value)
        except ValueError:
            # The pattern may capture text that is not a number (for
            # example "-" or "1.2.3"); treat that source as unspecified
            # and fall through to the next one instead of raising.
            continue
    return None
+
+ ##
+ # Returns true if either the width or height of the PyQuery element (this)
+ # is less than or equal to smallElementThreshold
+ ##
def isSmall(self):
    """Report whether the current element has a small explicit dimension.

    Looks up the width and the height of PyQuery(this) via getDimension
    and returns True when at least one of them is known (not None) and is
    less than or equal to smallElementThreshold; otherwise returns False.

    NOTE(review): `this` is not defined in the visible code; presumably
    PyQuery supplies it when this method is used as a filter callback --
    confirm against countElems.
    """
    elem = PyQuery(this)
    # Resolve both dimensions up front (width, then height).
    sizes = [self.getDimension(elem, dim) for dim in ('width', 'height')]
    limit = self.smallElementThreshold
    return any(size is not None and size <= limit for size in sizes)
View
1 tests.py
@@ -21,6 +21,7 @@ def test_countElements(self):
self.assertEqual(self.result['numHyperlinks'], 1)
self.assertEqual(self.result['numMetaRefresh'], 1)
self.assertEqual(self.result['numHiddenElements'], 2)
+ self.assertEqual(self.result['numSmallElements'], 2)
if __name__ == '__main__':
unittest.main()

0 comments on commit e47d989

Please sign in to comment.
Something went wrong with that request. Please try again.