Skip to content

Commit

Permalink
first!
Browse files Browse the repository at this point in the history
  • Loading branch information
davedash committed Jul 31, 2009
0 parents commit 4256027
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
settings.py
settings.pyc
Empty file added README
Empty file.
39 changes: 39 additions & 0 deletions measure.py
@@ -0,0 +1,39 @@
import sys
import settings
import urllib
import json

def get_titles(domain):
    """Return the page titles the BOSS web search reports for *domain*.

    Builds the host-relative path of a ``site:<domain>`` query (JSON
    format, 50 results per request) using ``settings.BOSS_API_KEY`` and
    passes it to ``_extract_titles``, whose list of titles is returned.
    """
    path_template = '/ysearch/web/v1/site:%s?appid=%s&format=json&count=50'
    first_page = path_template % (domain, settings.BOSS_API_KEY)
    return _extract_titles(first_page)

def _extract_titles(url):
    """Collect result titles from a BOSS query, following ``nextpage`` links.

    ``url`` is the path (appended to ``settings.BOSS_HOST``) of the first
    results page. Each page is fetched and parsed as JSON, the ``title`` of
    every entry in ``resultset_web`` is collected, and the ``nextpage`` path
    is followed until it is absent or points back at the page just fetched.

    Returns a list of titles in the order the service returned them.
    """
    titles = []
    # Iterate rather than recurse so a long result set cannot exhaust the
    # interpreter's recursion limit.
    while True:
        f = urllib.urlopen(settings.BOSS_HOST + url)
        try:
            # Close the response even when the body is not valid JSON.
            response = json.load(f)['ysearchresponse']
        finally:
            f.close()

        # NOTE(review): presumably 'resultset_web' can be missing when a
        # page has no hits; treat that as an empty page instead of raising.
        titles.extend(
            result['title'] for result in response.get('resultset_web', []))

        nextpage = response.get('nextpage')
        if not nextpage or nextpage == url:
            break
        if settings.DEBUG:
            print(nextpage)
        url = nextpage
    return titles

def calculate_variety(url):
    """Print how many of the titles found for *url* are unique.

    Fetches the titles with ``get_titles(url)`` and prints the total count,
    the number of distinct titles, and the distinct titles as a percentage
    of the total. When no titles are found the percentage line is skipped,
    since it would otherwise divide by zero.

    Returns None; all output goes to stdout.
    """
    # Use the argument rather than sys.argv so the function also works when
    # called from other code, not only from the command line.
    titles = get_titles(url)
    num_titles = len(titles)
    print("%d titles found for %s" % (num_titles, url))

    unique_titles = len(set(titles))
    print("%d unique titles found for %s" % (unique_titles, url))

    if num_titles == 0:
        # Nothing to compare; the two counts above already say so.
        return

    percentage = float(unique_titles) / num_titles * 100
    print("%d%% of the pages on %s have unique titles" % (percentage, url))

if __name__ == "__main__":
    # Command-line entry point: the first argument is the domain to measure.
    args = sys.argv
    if len(args) < 2:
        print("Usage: %s domain.com" % args[0])
        sys.exit(1)

    calculate_variety(args[1])
3 changes: 3 additions & 0 deletions settings.py-dist
@@ -0,0 +1,3 @@
DEBUG=True
# measure.py reads settings.BOSS_HOST; BOSS_DOMAIN is kept as an alias so
# existing copies of this template that reference it keep working.
BOSS_HOST='http://boss.yahooapis.com'
BOSS_DOMAIN=BOSS_HOST
BOSS_API_KEY='' # get your key at: http://developer.yahoo.com/search/boss/

0 comments on commit 4256027

Please sign in to comment.