Tagging GScraper 0.2.2.

commit e1ae096b386eb9b5ceec71cfc1dac970f7f431b0 1 parent 4f6d6b7
@postmodern postmodern authored
26 History.txt
@@ -1,9 +1,13 @@
-== 0.2.1 / 2008-08-27
+=== 0.2.2 / 2009-01-14
+
+* Updated GScraper::Search::WebQuery to use Nokogiri properly.
+
+=== 0.2.1 / 2008-08-27
* Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
Search Result HTML schema.
-== 0.2.0 / 2008-05-10
+=== 0.2.0 / 2008-05-10
* Removed GScraper::WebAgent.
* Added GScraper::Page and GScraper::HasPages.
@@ -18,13 +22,13 @@
* Added GScraper::Search::AJAXQuery.
* Replaced Unit Tests with Rspec specifications.
-== 0.1.8 / 2008-04-30
+=== 0.1.8 / 2008-04-30
* Added the GScraper.user_agent_alias=(name) method.
* Added URI::HTTP::QueryParams module.
* Changed license from MIT to GPL-2.
-== 0.1.7 / 2008-04-28
+=== 0.1.7 / 2008-04-28
* Added support for specifing Search modifiers.
@@ -32,7 +36,7 @@
* Added the Search::Result#page method.
-== 0.1.6 / 2008-03-15
+=== 0.1.6 / 2008-03-15
* Renamed GScraper.http_agent to GScraper.web_agent.
* Added GScraper.proxy for global proxy configuration.
@@ -43,12 +47,12 @@
* Added the methods Query#sponsored_links and Query#top_sponsored_link.
* Added examples to README.txt.
-== 0.1.5 / 2007-12-29
+=== 0.1.5 / 2007-12-29
* Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
sanitybit.
-== 0.1.4 / 2007-12-23
+=== 0.1.4 / 2007-12-23
* Added Search::Query#result_at for easier access to a single result at
  a given index.
@@ -63,22 +67,22 @@
* Fixed various bugs in Search::Query uncovered during unit-testing.
* Fixed typos in Search::Page's documentation.
-== 0.1.3 / 2007-12-22
+=== 0.1.3 / 2007-12-22
* Added the Search::Page class, which contains many convenience methods
  for searching through the results within a Page.
-== 0.1.2 / 2007-12-22
+=== 0.1.2 / 2007-12-22
* Fixed a bug related to extracting the correct content-rights from search
query URLs.
* Added GScraper.user_agent_aliases.
-== 0.1.1 / 2007-12-21
+=== 0.1.1 / 2007-12-21
* Forgot to include lib/gscraper/version.rb.
-== 0.1.0 / 2007-12-20
+=== 0.1.0 / 2007-12-20
* Initial release.
* Supports the Google Search service.
3  README.txt
@@ -17,8 +17,7 @@ GScraper is a web-scraping interface to various Google Services.
== REQUIREMENTS:
-* Hpricot
-* WWW::Mechanize
+* mechanize >= 0.9.0
== INSTALL:
2  Rakefile
@@ -10,7 +10,7 @@ Hoe.new('gscraper', GScraper::VERSION) do |p|
p.rubyforge_name = 'gscraper'
p.developer('Postmodern', 'postmodern.mod3@gmail.com')
p.remote_rdoc_dir = ''
- p.extra_deps = ['hpricot', 'mechanize']
+ p.extra_deps = [['mechanize', '>=0.9.0']]
end
# vim: syntax=Ruby
10 lib/gscraper/gscraper.rb
@@ -48,10 +48,12 @@ def GScraper.proxy
#
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
if GScraper.proxy[:host]
- return URI::HTTP.build(:host => GScraper.proxy[:host],
- :port => GScraper.proxy[:port],
- :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
- :path => '/')
+ return URI::HTTP.build(
+ :host => GScraper.proxy[:host],
+ :port => GScraper.proxy[:port],
+ :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
+ :path => '/'
+ )
end
end
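The proxy_uri change above is purely a readability reformat of the same URI::HTTP.build call. A minimal standalone sketch of that stdlib call, using made-up host, port, and credentials:

```ruby
require 'uri'

# Build a proxy URI from its components, as GScraper.proxy_uri does.
# The host, port, and credentials below are illustrative values only.
proxy = URI::HTTP.build(
  :host     => 'proxy.example.com',
  :port     => 8080,
  :userinfo => 'user:secret',
  :path     => '/'
)

proxy.to_s  # => "http://user:secret@proxy.example.com:8080/"
```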
3  lib/gscraper/has_pages.rb
@@ -107,6 +107,9 @@ def result_index_of(rank)
((rank.to_i - 1) % results_per_page.to_i)
end
+ #
+ # The cache of previously requested pages.
+ #
def page_cache
@page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
end
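The newly documented page_cache relies on Hash.new with a default block: the block runs only on a cache miss, stores the fetched page, and later lookups return it directly. A self-contained sketch of that memoization pattern, where the fetch_page lambda stands in for the real HTTP request:

```ruby
# Memoizing cache via Hash.new's default block: the block runs once per
# missing key, stores the result, and subsequent lookups hit the hash.
fetches = 0
fetch_page = lambda do |index|
  fetches += 1
  "contents of page #{index}"   # stand-in for a real HTTP request
end

page_cache = Hash.new { |hash, key| hash[key] = fetch_page.call(key.to_i) }

page_cache[1]  # miss: fetch_page runs and the result is stored
page_cache[1]  # hit: served straight from the hash
fetches        # => 1
```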
33 lib/gscraper/search/web_query.rb
@@ -30,8 +30,6 @@
require 'gscraper/licenses'
require 'gscraper/gscraper'
-require 'hpricot'
-
module GScraper
module Search
class WebQuery < Query
@@ -164,7 +162,11 @@ def initialize(options={},&block)
def self.from_url(url,options={},&block)
url = URI(url.to_s)
- options[:results_per_page] = url.query_params['num'].to_i
+ if url.query_params['num']
+ options[:results_per_page] = url.query_params['num'].to_i
+ else
+ options[:results_per_page] = RESULTS_PER_PAGE
+ end
options[:query] = url.query_params['q']
options[:exact_phrase] = url.query_params['as_epq']
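The fix above guards against search URLs that omit the num parameter, falling back to the default page size instead of coercing nil to 0. The same guard can be sketched with only the standard library (query_params is GScraper's own URI extension, so CGI.parse is used here instead, and the default of 10 results per page is an assumption mirroring RESULTS_PER_PAGE):

```ruby
require 'uri'
require 'cgi'

# Assumed default page size, standing in for WebQuery::RESULTS_PER_PAGE.
DEFAULT_RESULTS_PER_PAGE = 10

# Return the requested page size, or the default when `num` is absent.
def results_per_page(url)
  params = CGI.parse(URI(url).query.to_s)
  num    = params['num'].first
  num ? num.to_i : DEFAULT_RESULTS_PER_PAGE
end

results_per_page('http://www.google.com/search?q=ruby&num=50')  # => 50
results_per_page('http://www.google.com/search?q=ruby')         # => 10
```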
@@ -338,33 +340,36 @@ def page_url(page_index)
def page(page_index)
Page.new do |new_page|
doc = @agent.get(page_url(page_index))
- results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
+ results = doc.search('li.g','li/div.g')
rank_offset = result_offset_of(page_index)
- results.each_with_index do |result,index|
+ (0...@results_per_page).each do |index|
+ result = results[index]
+
rank = rank_offset + (index + 1)
- link = result.at('//a.l')
+ link = result.at('a.l')
title = link.inner_text
url = URI(link.get_attribute('href'))
summary_text = ''
cached_url = nil
similar_url = nil
- if (content = (result.at('//div.s|//td.j//font')))
+ if (content = (result.at('div.s','td.j//font')))
content.children.each do |elem|
break if (!(elem.text?) && elem.name=='br')
summary_text << elem.inner_text
end
- if (cached_link = result.at('span.gl/a:first'))
- cached_url = URI(cached_link.get_attribute('href'))
- end
+ end
- if (similar_link = result.at('span.gl/a:last'))
- similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
- end
+ if (cached_link = result.at('span.gl/a:first'))
+ cached_url = URI(cached_link.get_attribute('href'))
+ end
+
+ if (similar_link = result.at('span.gl/a:last'))
+ similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
end
new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
@@ -395,7 +400,7 @@ def sponsored_links
doc = @agent.get(search_url)
# top and side ads
- doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
+ doc.search('#pa1', 'a[@id^="an"]').each do |link|
title = link.inner_text
url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
2  lib/gscraper/version.rb
@@ -21,5 +21,5 @@
#
module GScraper
- VERSION = '0.2.1'
+ VERSION = '0.2.2'
end