Skip to content

Commit

Permalink
get reader ids
Browse files Browse the repository at this point in the history
  • Loading branch information
gigix committed Mar 3, 2012
1 parent 048cfad commit 09a7344
Showing 1 changed file with 10 additions and 8 deletions.
18 changes: 10 additions & 8 deletions crawler/model/book_scraper.py
@@ -1,31 +1,33 @@
import re

import mechanize

from lxml import etree

class BookScraper:
    """Collect the ids of Douban users listed on a book's collections pages.

    The scraper walks the paginated ``/collections`` listing of one book on
    book.douban.com and pulls the reader id out of every profile link found
    in the collections tab.
    """

    # The listing is requested in steps of 20 via the ``start`` query
    # parameter (presumably the site's page size -- confirm against the site).
    PAGE_SIZE = 20

    # Anchors inside the collections tab that point at reader profiles.
    _READER_LINKS_XPATH = (
        "//div[@id='collections_tab']//div[@class='sub_ins']//div[@class='pl2']//a"
    )

    # Group 1 captures the reader id: the single path segment after /people/.
    _READER_URL_PATTERN = re.compile(r"http://book\.douban\.com/people/([^/]+)/")

    def __init__(self, id):
        """Remember the Douban subject id of the book to scrape.

        Args:
            id: Subject id of the book; it is converted with ``str()`` when
                the listing URL is built.
        """
        self.id = id

    def _page_url(self, start):
        """Return the collections listing URL beginning at offset *start*."""
        return (
            "http://book.douban.com/subject/" + str(self.id)
            + "/collections?start=" + str(start)
        )

    @staticmethod
    def _reader_id(url):
        """Return the reader id embedded in a profile *url*, or ``None``.

        ``None`` is returned when *url* is missing/empty or does not look
        like a ``http://book.douban.com/people/<id>/`` profile link.
        """
        if not url:
            return None
        match = BookScraper._READER_URL_PATTERN.search(url)
        # group(1) is the id itself; group(0) would be the whole matched URL.
        return match.group(1) if match else None

    def readers(self):
        """Return the ids of all readers listed for this book.

        Pages are fetched one after another, advancing ``start`` by
        ``PAGE_SIZE``, until a page yields no reader ids.

        Returns:
            A list of reader id strings in page order. Ids are not
            de-duplicated.

        Raises:
            Whatever ``mechanize.Browser.open`` or ``lxml`` raise on network
            or parse failures; nothing is caught here.
        """
        browser = mechanize.Browser()
        result = []
        start = 0

        while True:
            response = browser.open(self._page_url(start))
            page_dom = etree.HTML(response.read())
            links = page_dom.xpath(self._READER_LINKS_XPATH)

            # Skip anchors that are not profile links instead of crashing
            # on a failed regex match.
            candidates = (self._reader_id(link.get("href")) for link in links)
            reader_ids = [rid for rid in candidates if rid is not None]

            # An empty page marks the end of the listing.
            if not reader_ids:
                break

            result.extend(reader_ids)
            start += self.PAGE_SIZE

        return result

0 comments on commit 09a7344

Please sign in to comment.