Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Implemented double-document detection

  • Loading branch information...
commit 04da1b49346025189a57265b23cdf7b729f116bd 1 parent 621b6a9
@eromba eromba authored
Showing with 13 additions and 1 deletion.
  1. +10 −1 htmlanalyzer.py
  2. +2 −0  test.html
  3. +1 −0  tests.py
View
11 htmlanalyzer.py
@@ -49,7 +49,8 @@ def analyze(self):
'numHyperlinks': self.countElems('a'),
'numMetaRefresh': self.countElems('meta', self.isRefresh),
'numHiddenElements': self.countElems('*', self.isHidden),
- 'numSmallElements': self.countElems('*', self.isSmall)
+ 'numSmallElements': self.countElems('*', self.isSmall),
+ 'hasDoubleDocuments': self.hasDoubleDocuments()
}
##
@@ -114,3 +115,11 @@ def isSmall(self):
width = self.getDimension(elem, 'width')
height = self.getDimension(elem, 'height')
return ( (width != None and (width <= self.smallElementThreshold)) or (height != None and (height <= self.smallElementThreshold)) )
+
+ ##
+ # Returns True if the document contains more than one <html>, <head>, <title>, or <body> element
+ ##
+ def hasDoubleDocuments(self):
+ for tagName in ['html', 'head', 'title', 'body']:
+ if (len( self.doc(tagName) ) > 1) : return True
+ return False
View
2  test.html
@@ -8,6 +8,7 @@
<div style=" visibility: hidden;">Hidden text</div>
<div style="height: 1px; width: 1px;">Small div</div>
<div style="height: 0; width: 0;">Small div</div>
+ <div style="height: 1em; width: 1em;">Not-so-small div</div>
<iframe></iframe>
<iframe></iframe>
<embed>
@@ -22,4 +23,5 @@
alert("Message");
</script>
</body>
+ <body>Double document</body>
</html>
View
1  tests.py
@@ -22,6 +22,7 @@ def test_countElements(self):
self.assertEqual(self.result['numMetaRefresh'], 1)
self.assertEqual(self.result['numHiddenElements'], 2)
self.assertEqual(self.result['numSmallElements'], 2)
+ self.assertEqual(self.result['hasDoubleDocuments'], True)
if __name__ == '__main__':
unittest.main()
Please sign in to comment.
Something went wrong with that request. Please try again.