crawl book readers

commit 048cfad9a132322f3952623261b1ea95bb12968d 1 parent f96f5f1
@gigix authored
crawler/model/book_scraper.py (29 lines changed)
@@ -1,6 +1,31 @@
+import mechanize
+
+from lxml import etree
+
 class BookScraper:
     def __init__(self, id):
-        pass
+        self.id = id
     def readers(self):
-        return range(2000)
+        br = mechanize.Browser()
+
+        has_more_readers = True
+        start = 0
+        result = []
+
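+        # Douban's collections list shows 20 readers per page; fetch
+        # successive pages until one yields no reader links.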
+        while has_more_readers:
+            has_more_readers = False
+            response = br.open("http://book.douban.com/subject/" + str(self.id) + "/collections?start=" + str(start))
+
+            page_content = response.read()
+            page_dom = etree.HTML(page_content)
+            links = page_dom.xpath("//div[@id='collections_tab']//div[@class='sub_ins']//div[@class='pl2']//a")
+            urls = map(lambda link: link.get("href"), links)
+            print urls
+
+            for url in urls:
+                has_more_readers = True
+                result.append(url)
+            start += 20
+
+        return result
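For reference, a minimal driver sketch for the class above (assumptions: Python 2 with mechanize and lxml installed, network access to book.douban.com, and that crawler/ and crawler/model/ are importable as packages; the subject id is the one the updated test uses):

    # hypothetical driver script, run from the repository root
    from crawler.model.book_scraper import BookScraper

    scraper = BookScraper(1766670)
    readers = scraper.readers()
    print "collected %d reader URLs" % len(readers)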
crawler/test/book_scraper_test.py (4 lines changed)
@@ -1,5 +1,5 @@
 from ..model.book_scraper import BookScraper
 
 def test_scrape_reader_ids_of_give_book():
-    scraper = BookScraper(1140457)
-    assert len(scraper.readers()) >= 1064
+    scraper = BookScraper(1766670)
+    assert len(scraper.readers()) == 32
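This test hits the live site, so the expected count of 32 holds only as long as exactly 32 readers have collected subject 1766670. A network-free way to exercise the same XPath extraction is to parse a hand-written fragment; the markup below is a hypothetical stand-in for Douban's collections page, not a captured response:

    from lxml import etree

    snippet = """
    <div id='collections_tab'><div class='sub_ins'>
      <div class='pl2'><a href='http://book.douban.com/people/alice/'>alice</a></div>
      <div class='pl2'><a href='http://book.douban.com/people/bob/'>bob</a></div>
    </div></div>
    """

    dom = etree.HTML(snippet)
    links = dom.xpath("//div[@id='collections_tab']//div[@class='sub_ins']//div[@class='pl2']//a")
    print [link.get("href") for link in links]
    # ['http://book.douban.com/people/alice/', 'http://book.douban.com/people/bob/']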
crawler/test/book_scraper_test.pyc (binary file, not shown)
mechanise_sample.py (25 lines changed)
@@ -1,14 +1,23 @@
 import mechanize
-import cookielib
+from lxml import etree
 
 br = mechanize.Browser()
 response = br.open("http://book.douban.com/subject/2042269/collections")
-for link in br.links(url_regex="http://book.douban.com/people/(.+)/"):
-    print link.url
-
-# print response.read()
+def get_url(element):
+    return element.get("href")
 
-br.open("http://book.douban.com/people/keeplazy/collect")
-for link in br.links(url_regex="http://book.douban.com/subject/(.+)/"):
-    print link.url
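+# Extract reader links with an XPath over the parsed page
+# (div#collections_tab .sub_ins .pl2 a), the same expression used in
+# crawler/model/book_scraper.py.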
+page_content = response.read()
+page_dom = etree.HTML(page_content)
+links = page_dom.xpath("//div[@id='collections_tab']//div[@class='sub_ins']//div[@class='pl2']//a")
+urls = map(get_url, links)
+print urls
+
+# for link in br.links(url_regex="http://book.douban.com/people/(.+)/"):
+#     print link.url
+#
+# # print response.read()
+#
+# br.open("http://book.douban.com/people/keeplazy/collect")
+# for link in br.links(url_regex="http://book.douban.com/subject/(.+)/"):
+#     print link.url