get reader ids

gigix · Mar 3, 2012 · 09a7344 · 09a7344
1 parent 048cfad
commit 09a7344
Showing 1 changed file with 10 additions and 8 deletions.
diff --git a/crawler/model/book_scraper.py b/crawler/model/book_scraper.py
@@ -1,31 +1,33 @@
+import re
+
 import mechanize
 
 from lxml import etree
 
 class BookScraper:
+	"""Collect reader ids from the Douban "collections" pages of a single book."""
 	def __init__(self, id):
 		self.id = id
 
 	def readers(self):
 		br = mechanize.Browser()
-
 		has_more_readers = True
 		start = 0
 		result = []
 
 		while(has_more_readers):
-			has_more_readers = False
 			response = br.open("http://book.douban.com/subject/" + str(self.id) + "/collections?start=" + str(start))
-
 			page_content = response.read()
 			page_dom = etree.HTML(page_content)
 			links = page_dom.xpath("//div[@id='collections_tab']//div[@class='sub_ins']//div[@class='pl2']//a")
-			urls = map(lambda link: link.get("href"), links)
-			print urls
+			# Keep only the captured id (group(1)); group(0) would be the whole matched URL. Non-matching links are skipped.
+			matches = [re.search(r"http://book\.douban\.com/people/([^/]+)/", link.get("href") or "") for link in links]
+			reader_ids = [match.group(1) for match in matches if match]
+
+			# Stop paging once a page yields no reader ids.
+			has_more_readers = len(reader_ids) > 0
+			result.extend(reader_ids)
 
-			for url in urls:
-				has_more_readers = True
-				result.append(url)
 			start += 20
 
 		return result