Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Implemented double-document detection

  • Loading branch information...
commit 04da1b49346025189a57265b23cdf7b729f116bd (1 parent: 621b6a9)
Authored by Ethan Romba (eromba)

Showing 3 changed files with 13 additions and 1 deletion.

  1. +10 1 htmlanalyzer.py
  2. +2 0  test.html
  3. +1 0  tests.py
11 htmlanalyzer.py
@@ -49,7 +49,8 @@ def analyze(self):
49 49 'numHyperlinks': self.countElems('a'),
50 50 'numMetaRefresh': self.countElems('meta', self.isRefresh),
51 51 'numHiddenElements': self.countElems('*', self.isHidden),
52   - 'numSmallElements': self.countElems('*', self.isSmall)
  52 + 'numSmallElements': self.countElems('*', self.isSmall),
  53 + 'hasDoubleDocuments': self.hasDoubleDocuments()
53 54 }
54 55
55 56 ##
@@ -114,3 +115,11 @@ def isSmall(self):
114 115 width = self.getDimension(elem, 'width')
115 116 height = self.getDimension(elem, 'height')
116 117 return ( (width != None and (width <= self.smallElementThreshold)) or (height != None and (height <= self.smallElementThreshold)) )
  118 +
  119 + ##
  120 + # Returns true if the document has more than one html, head, title, or body element
  121 + ##
  122 + def hasDoubleDocuments(self):
  123 + for tagName in ['html', 'head', 'title', 'body']:
  124 + if (len( self.doc(tagName) ) > 1) : return True
  125 + return False
2  test.html
@@ -8,6 +8,7 @@
8 8 <div style=" visibility: hidden;">Hidden text</div>
9 9 <div style="height: 1px; width: 1px;">Small div</div>
10 10 <div style="height: 0; width: 0;">Small div</div>
  11 + <div style="height: 1em; width: 1em;">Not-so-small div</div>
11 12 <iframe></iframe>
12 13 <iframe></iframe>
13 14 <embed>
@@ -22,4 +23,5 @@
22 23 alert("Message");
23 24 </script>
24 25 </body>
  26 + <body>Double document</body>
25 27 </html>
1  tests.py
@@ -22,6 +22,7 @@ def test_countElements(self):
22 22 self.assertEqual(self.result['numMetaRefresh'], 1)
23 23 self.assertEqual(self.result['numHiddenElements'], 2)
24 24 self.assertEqual(self.result['numSmallElements'], 2)
  25 + self.assertEqual(self.result['hasDoubleDocuments'], True)
25 26
26 27 if __name__ == '__main__':
27 28 unittest.main()

0 comments on commit 04da1b4

Please sign in to comment.
Something went wrong with that request. Please try again.