import re

import mechanize
from lxml import etree

# Captures the reader id from a profile URL of the form
# "http://book.douban.com/people/<reader_id>/".  The id is everything up to
# the next slash, so trailing path segments are never swallowed.
_READER_URL_RE = re.compile(r"http://book\.douban\.com/people/([^/]+)/")

# Amount by which the ``start`` query parameter advances between requests.
_PAGE_STEP = 20

# Anchors inside the collections tab that point at reader profiles.
_READER_LINK_XPATH = (
    "//div[@id='collections_tab']//div[@class='sub_ins']//div[@class='pl2']//a"
)


class BookScraper:
    """Scrape the readers listed on a Douban book's "collections" pages."""

    def __init__(self, id):
        """Store the Douban subject id used to build the book's URLs.

        Args:
            id: Subject id of the book; it is converted with ``str()`` when
                the request URL is assembled.
        """
        self.id = id

    @staticmethod
    def _reader_id(url):
        """Return the reader id embedded in *url*, or ``None`` if absent.

        ``None`` is also returned for a missing ``href`` so that callers can
        filter out anchors that are not reader profile links.
        """
        if not url:
            return None
        match = _READER_URL_RE.search(url)
        # group(1) is the captured id; group(0) would be the whole URL.
        return match.group(1) if match else None

    def readers(self):
        """Return the ids of every reader found on the collections pages.

        Pages are requested with ``start`` = 0, 20, 40, ... and fetching
        stops at the first page that yields no reader ids.

        Returns:
            A list of reader id strings in the order they were encountered.
        """
        br = mechanize.Browser()
        start = 0
        result = []

        while True:
            response = br.open(
                "http://book.douban.com/subject/" + str(self.id)
                + "/collections?start=" + str(start)
            )
            page_dom = etree.HTML(response.read())
            # NOTE(review): lxml may hand back None for a body it cannot
            # parse into a tree; treat that like a page without readers.
            if page_dom is None:
                break

            candidates = (
                self._reader_id(link.get("href"))
                for link in page_dom.xpath(_READER_LINK_XPATH)
            )
            # Built as a real list so the emptiness check works whether or
            # not the interpreter's map/filter return lazy iterators.
            reader_ids = [rid for rid in candidates if rid is not None]
            if not reader_ids:
                break

            result.extend(reader_ids)
            start += _PAGE_STEP

        return result