Skip to content
Browse files

Implemented included-URL counting

  • Loading branch information...
1 parent 04da1b4 commit 2273d475b3e2729439230548ce6a3d8c00914a84 @eromba eromba committed Nov 25, 2012
Showing with 27 additions and 4 deletions.
  1. +17 −1 htmlanalyzer.py
  2. +7 −1 test.html
  3. +3 −2 tests.py
View
18 htmlanalyzer.py
@@ -50,7 +50,8 @@ def analyze(self):
'numMetaRefresh': self.countElems('meta', self.isRefresh),
'numHiddenElements': self.countElems('*', self.isHidden),
'numSmallElements': self.countElems('*', self.isSmall),
- 'hasDoubleDocuments': self.hasDoubleDocuments()
+ 'hasDoubleDocuments': self.hasDoubleDocuments(),
+ 'numIncludedUrls': self.numIncludedUrls()
}
##
@@ -123,3 +124,18 @@ def hasDoubleDocuments(self):
for tagName in ['html', 'head', 'title', 'body']:
if (len( self.doc(tagName) ) > 1) : return True
return False
+
+ ##
+ # Returns true if the given attribute is defined and non-empty for the PyQuery element (this)
+ ##
+ def hasAttr(self, attr):
+ value = PyQuery(this).attr[attr]
+ return ( (value != None) and (len(value) > 0) )
+
+ ##
+ # Counts the number of elements that include external content on the web page
+ ##
+ def numIncludedUrls(self):
+ return ( self.countElems('script, iframe, frame, embed', lambda: self.hasAttr('src')) +
+ self.countElems('form', lambda: self.hasAttr('action')) +
+ self.countElems('object', lambda: self.hasAttr('data')) )
View
8 test.html
@@ -10,15 +10,21 @@
<div style="height: 0; width: 0;">Small div</div>
<div style="height: 1em; width: 1em;">Not-so-small div</div>
<iframe></iframe>
- <iframe></iframe>
+ <iframe src="http://www.google.com"></iframe>
<embed>
+ <embed src="http://www.google.com"></embed>
+ <frame>
+ <frame src="http://www.google.com"></frame>
<object></object>
+ <object data="http://www.google.com"></object>
<a href="http://www.google.com">Google</a>
<script src="http://code.jquery.com/jquery-1.8.2.min.js"></script>
<script src="http://code.jquery.com/jquery-1.8.2.min"></script>
<script src="http://code.jquery.com/jquery#hash"></script>
<script src="http://code.jquery.com/jquery?param=value"></script>
<script src="http://code.jquery.com/jquery/"></script>
+ <form action="">Empty form</form>
+ <form action="http://www.google.com">Non-empty form</form>
<script>
alert("Message");
</script>
View
5 tests.py
@@ -16,13 +16,14 @@ def test_countElements(self):
self.assertEqual(self.result['numIframes'], 2)
self.assertEqual(self.result['numScripts'], 6)
self.assertEqual(self.result['numScriptsWithWrongExtension'], 4)
- self.assertEqual(self.result['numEmbeds'], 1)
- self.assertEqual(self.result['numObjects'], 1)
+ self.assertEqual(self.result['numEmbeds'], 2)
+ self.assertEqual(self.result['numObjects'], 2)
self.assertEqual(self.result['numHyperlinks'], 1)
self.assertEqual(self.result['numMetaRefresh'], 1)
self.assertEqual(self.result['numHiddenElements'], 2)
self.assertEqual(self.result['numSmallElements'], 2)
self.assertEqual(self.result['hasDoubleDocuments'], True)
+ self.assertEqual(self.result['numIncludedUrls'], 10)
if __name__ == '__main__':
unittest.main()

0 comments on commit 2273d47

Please sign in to comment.
Something went wrong with that request. Please try again.