Skip to content

Commit

Permalink
crawl book readers
Browse files Browse the repository at this point in the history
  • Loading branch information
gigix committed Mar 3, 2012
1 parent f96f5f1 commit 048cfad
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 12 deletions.
29 changes: 27 additions & 2 deletions crawler/model/book_scraper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
import mechanize

from lxml import etree

class BookScraper:
    """Scrapes the list of readers ("collections") of a Douban book.

    The book is identified by its Douban subject id; readers() pages
    through the book's collections pages and gathers reader profile URLs.
    """

    def __init__(self, id):
        # Douban subject id of the book, e.g. 1766670.
        self.id = id

    def readers(self):
        """Return the profile URLs of every user who collected this book.

        Pages through http://book.douban.com/subject/<id>/collections
        20 entries at a time, stopping at the first page that yields no
        reader links. Performs network I/O via mechanize on every call.
        """
        browser = mechanize.Browser()
        result = []
        start = 0
        while True:
            response = browser.open(
                "http://book.douban.com/subject/" + str(self.id)
                + "/collections?start=" + str(start))
            page_dom = etree.HTML(response.read())
            links = page_dom.xpath(
                "//div[@id='collections_tab']//div[@class='sub_ins']"
                "//div[@class='pl2']//a")
            urls = [link.get("href") for link in links]
            if not urls:
                # An empty page means we've walked past the last reader.
                break
            result.extend(urls)
            start += 20
        return result
4 changes: 2 additions & 2 deletions crawler/test/book_scraper_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from ..model.book_scraper import BookScraper

def test_scrape_reader_ids_of_give_book():
    """Integration test: scrape the readers of Douban book 1766670.

    NOTE(review): hits the live site and asserts an exact count (32),
    which can change whenever another user collects the book -- brittle.
    """
    scraper = BookScraper(1766670)
    assert len(scraper.readers()) == 32
Binary file modified crawler/test/book_scraper_test.pyc
Binary file not shown.
25 changes: 17 additions & 8 deletions mechanise_sample.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
import mechanize
import cookielib
from lxml import etree

br = mechanize.Browser()
response = br.open("http://book.douban.com/subject/2042269/collections")

for link in br.links(url_regex="http://book.douban.com/people/(.+)/"):
print link.url

# print response.read()
def get_url(element):
return element.get("href")

br.open("http://book.douban.com/people/keeplazy/collect")
for link in br.links(url_regex="http://book.douban.com/subject/(.+)/"):
print link.url
page_content = response.read()
page_dom = etree.HTML(page_content)
links = page_dom.xpath("//div[@id='collections_tab']//div[@class='sub_ins']//div[@class='pl2']//a")
urls = map(get_url, links)
print urls

# for link in br.links(url_regex="http://book.douban.com/people/(.+)/"):
# print link.url
#
# # print response.read()
#
# br.open("http://book.douban.com/people/keeplazy/collect")
# for link in br.links(url_regex="http://book.douban.com/subject/(.+)/"):
# print link.url

0 comments on commit 048cfad

Please sign in to comment.