From fdc685284889ad4caf4adb30343b62d0919ad59e Mon Sep 17 00:00:00 2001 From: Postmodern Date: Thu, 28 Aug 2008 00:53:49 +0000 Subject: [PATCH] * Tagging GScraper version 0.2.1. --- History.txt | 5 ++ README.txt | 2 +- lib/gscraper/search/ajax_query.rb | 4 +- lib/gscraper/search/query.rb | 32 +++++++++--- lib/gscraper/search/web_query.rb | 64 +++--------------------- lib/gscraper/sponsored_ad.rb | 4 +- lib/gscraper/version.rb | 2 +- spec/extensions/uri/http_spec.rb | 3 +- spec/extensions/uri/query_params_spec.rb | 3 +- spec/gscraper_spec.rb | 3 +- spec/has_pages_examples.rb | 3 +- spec/has_sponsored_links_examples.rb | 19 ++----- spec/helpers/uri.rb | 3 +- spec/page_has_results_examples.rb | 3 +- spec/search/ajax_query_spec.rb | 9 ++-- spec/search/page_has_results_examples.rb | 17 ++----- spec/search/query_spec.rb | 3 +- spec/search/web_query_spec.rb | 15 +++--- spec/spec_helper.rb | 5 +- tasks/spec.rb | 2 +- 20 files changed, 71 insertions(+), 130 deletions(-) diff --git a/History.txt b/History.txt index 593025d..5cdcc08 100644 --- a/History.txt +++ b/History.txt @@ -1,3 +1,8 @@ +== 0.2.1 / 2008-08-27 + +* Updated XPath queries in GScraper::Search::WebQuery for new Google (tm) + Search Result HTML schema. + == 0.2.0 / 2008-05-10 * Removed GScraper::WebAgent. diff --git a/README.txt b/README.txt index e184368..aa2eb74 100644 --- a/README.txt +++ b/README.txt @@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services. q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29') - q.query # =>; "ruby" + q.query # => "ruby" q.with_words # => "rails" q.occurrs_within # => :title q.rights # => :cc_by_nc diff --git a/lib/gscraper/search/ajax_query.rb b/lib/gscraper/search/ajax_query.rb index 2b7d7da..b1f1f57 100644 --- a/lib/gscraper/search/ajax_query.rb +++ b/lib/gscraper/search/ajax_query.rb @@ -161,9 +161,9 @@ def page(page_index) hash['results'].each_with_index do |result,index| rank = rank_offset + (index + 1) title = Hpricot(result['title']).inner_text - url = result['unescapedUrl'] + url = URI(result['unescapedUrl']) summary = Hpricot(result['content']).inner_text - cached_url = result['cacheUrl'] + cached_url = URI(result['cacheUrl']) new_page << Result.new(rank,title,url,summary,cached_url) end diff --git a/lib/gscraper/search/query.rb b/lib/gscraper/search/query.rb index 47c4e2f..714cb24 100644 --- a/lib/gscraper/search/query.rb +++ b/lib/gscraper/search/query.rb @@ -119,19 +119,15 @@ def expression expr = [] append_modifier = lambda { |name| - modifier = instance_variable_get("@#{name}") + modifier = format_modifier(instance_variable_get("@#{name}")) - expr << "#{name}:#{modifier}" if modifier + expr << "#{name}:#{modifier}" unless modifier.empty? } append_options = lambda { |name| - ops = instance_variable_get("@#{name}") + ops = format_options(instance_variable_get("@#{name}")) - if ops.kind_of?(Array) - expr << "#{name}:#{ops.join(' ')}" - elsif ops - expr << "#{name}:#{ops}" - end + expr << "#{name}:#{ops}" unless ops.empty? } expr << @query if @query @@ -168,6 +164,26 @@ def expression return expr.join(' ') end + protected + + def format_modifier(value) + if value.kind_of?(Regexp) + return value.source + else + return value.to_s + end + end + + def format_options(value) + if value.kind_of?(Array) + return value.map { |element| + format_modifier(element) + }.join(' ') + else + return format_modifier(value) + end + end + end end end diff --git a/lib/gscraper/search/web_query.rb b/lib/gscraper/search/web_query.rb index eec72a4..6f67498 100644 --- a/lib/gscraper/search/web_query.rb +++ b/lib/gscraper/search/web_query.rb @@ -50,51 +50,6 @@ class WebQuery < Query # Results per-page attr_accessor :results_per_page - # Search query - attr_accessor :query - - # Search 'link' modifier - attr_accessor :link - - # Search 'related' modifier - attr_accessor :related - - # Search 'info' modifier - attr_accessor :info - - # Search 'site' modifier - attr_accessor :site - - # Search 'filetype' modifier - attr_accessor :filetype - - # Search 'allintitle' modifier - attr_accessor :allintitle - - # Search 'intitle' modifier - attr_accessor :intitle - - # Search 'allinurl' modifier - attr_accessor :allinurl - - # Search 'inurl' modifier - attr_accessor :inurl - - # Search 'allintext' modifier - attr_accessor :allintext - - # Search 'intext' modifier - attr_accessor :intext - - # Search for results containing the exact phrase - attr_accessor :exact_phrase - - # Search for results with the words - attr_accessor :with_words - - # Search for results with-out the words - attr_accessor :without_words - # Search for results written in the language attr_accessor :language @@ -119,9 +74,6 @@ class WebQuery < Query # Search for results within the past year attr_accessor :within_past_year - # Search for results containing numbers between the range - attr_accessor :numeric_range - # Search for results where the query ocurrs within the area attr_accessor :occurrs_within @@ -386,7 +338,7 @@ def page_url(page_index) def page(page_index) Page.new do |new_page| doc = @agent.get(page_url(page_index)) - results = doc.search('//div.g')[0...@results_per_page.to_i] + results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i] rank_offset = result_offset_of(page_index) @@ -394,24 +346,24 @@ def page(page_index) rank = rank_offset + (index + 1) link = result.at('//a.l') title = link.inner_text - url = link.get_attribute('href') + url = URI(link.get_attribute('href')) summary_text = '' cached_url = nil similar_url = nil - if (content = (result.at('//td.j//font|//td.j/div'))) + if (content = (result.at('//div.s|//td.j//font'))) content.children.each do |elem| break if (!(elem.text?) && elem.name=='br') summary_text << elem.inner_text end - if (cached_link = result.at('nobr/a:first')) - cached_url = cached_link.get_attribute('href') + if (cached_link = result.at('span.gl/a:first')) + cached_url = URI(cached_link.get_attribute('href')) end - if (similar_link = result.at('nobr/a:last')) - similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href') + if (similar_link = result.at('span.gl/a:last')) + similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href')) end end @@ -445,7 +397,7 @@ def sponsored_links # top and side ads doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link| title = link.inner_text - url = "http://#{SEARCH_HOST}" + link.get_attribute('href') + url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href')) links << SponsoredAd.new(title,url) end diff --git a/lib/gscraper/sponsored_ad.rb b/lib/gscraper/sponsored_ad.rb index 0d2f7bc..e804131 100644 --- a/lib/gscraper/sponsored_ad.rb +++ b/lib/gscraper/sponsored_ad.rb @@ -43,9 +43,7 @@ def initialize(title,url) # Returns the direct URL of the ad. # def direct_url - uri = URI(@url) - - return (uri.query_params['adurl'] || uri.query_params['q']) + URI(@url.query_params['adurl'] || @url.query_params['q']) end # diff --git a/lib/gscraper/version.rb b/lib/gscraper/version.rb index 7820c40..f27cdc6 100644 --- a/lib/gscraper/version.rb +++ b/lib/gscraper/version.rb @@ -21,5 +21,5 @@ # module GScraper - VERSION = '0.2.0' + VERSION = '0.2.1' end diff --git a/spec/extensions/uri/http_spec.rb b/spec/extensions/uri/http_spec.rb index 9206412..7aa7c69 100644 --- a/spec/extensions/uri/http_spec.rb +++ b/spec/extensions/uri/http_spec.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('..','..','spec_helper').expand_path +require 'spec_helper' require 'gscraper/extensions/uri' diff --git a/spec/extensions/uri/query_params_spec.rb b/spec/extensions/uri/query_params_spec.rb index b69e3c2..1367d7a 100644 --- a/spec/extensions/uri/query_params_spec.rb +++ b/spec/extensions/uri/query_params_spec.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('..','..','spec_helper').expand_path +require 'spec_helper' require 'gscraper/extensions/uri' diff --git a/spec/gscraper_spec.rb b/spec/gscraper_spec.rb index bf7ccc6..19f242f 100644 --- a/spec/gscraper_spec.rb +++ b/spec/gscraper_spec.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('spec_helper').expand_path +require 'spec_helper' require 'gscraper/gscraper' diff --git a/spec/has_pages_examples.rb b/spec/has_pages_examples.rb index 1a68b31..af2daaa 100644 --- a/spec/has_pages_examples.rb +++ b/spec/has_pages_examples.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('spec_helper').expand_path +require 'spec_helper' shared_examples_for "has Pages" do diff --git a/spec/has_sponsored_links_examples.rb b/spec/has_sponsored_links_examples.rb index 33d1ddb..442a474 100644 --- a/spec/has_sponsored_links_examples.rb +++ b/spec/has_sponsored_links_examples.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('spec_helper').expand_path +require 'spec_helper' shared_examples_for "has Sponsored Links" do @@ -25,15 +24,9 @@ end end - it "should have non-empty URLs" do - @links.each_url do |url| - url.length.should_not == 0 - end - end - it "should have valid URLs" do @links.each_url do |url| - url_should_be_valid(url) + uri_should_be_valid(url) end end @@ -43,15 +36,9 @@ end end - it "should have non-empty direct URLs" do - @links.each_direct_url do |url| - url.length.should_not == 0 - end - end - it "should have valid direct URLs" do @links.each_direct_url do |url| - url_should_be_valid(url) + uri_should_be_valid(url) end end diff --git a/spec/helpers/uri.rb b/spec/helpers/uri.rb index ac9c017..d7144f9 100644 --- a/spec/helpers/uri.rb +++ b/spec/helpers/uri.rb @@ -1,7 +1,6 @@ require 'uri' -def url_should_be_valid(url) - uri = URI(url) +def uri_should_be_valid(uri) uri.scheme.should_not be_nil uri.host.should_not be_nil uri.path.should_not be_nil diff --git a/spec/page_has_results_examples.rb b/spec/page_has_results_examples.rb index 53f5970..cffa374 100644 --- a/spec/page_has_results_examples.rb +++ b/spec/page_has_results_examples.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('spec_helper').expand_path +require 'spec_helper' shared_examples_for "Page has Results" do diff --git a/spec/search/ajax_query_spec.rb b/spec/search/ajax_query_spec.rb index be59711..ba8dc6f 100644 --- a/spec/search/ajax_query_spec.rb +++ b/spec/search/ajax_query_spec.rb @@ -1,8 +1,7 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path -require Pathname(__FILE__).dirname.join('..','has_pages_examples').expand_path -require Pathname(__FILE__).dirname.join('..','page_has_results_examples').expand_path -require Pathname(__FILE__).dirname.join('page_has_results_examples').expand_path +require 'spec_helper' +require 'has_pages_examples' +require 'page_has_results_examples' +require 'search/page_has_results_examples' require 'gscraper/search/ajax_query' diff --git a/spec/search/page_has_results_examples.rb b/spec/search/page_has_results_examples.rb index 6411450..bed9585 100644 --- a/spec/search/page_has_results_examples.rb +++ b/spec/search/page_has_results_examples.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path +require 'spec_helper' shared_examples_for "Page has Search Results" do @@ -29,24 +28,14 @@ end end - it "should have non-empty URLs" do - @page.each_url do |url| - url.length.should_not == 0 - end - end - it "should have valid URLs" do @page.each_url do |url| - url_should_be_valid(url) + uri_should_be_valid(url) end end it "should have atleast one cached URL" do - @page.cached_urls.should_not == 0 - end - - it "should have atleast one similar query URL" do - @page.similar_urls.should_not == 0 + @page.cached_urls.length.should_not == 0 end end diff --git a/spec/search/query_spec.rb b/spec/search/query_spec.rb index 669f663..47acfb1 100644 --- a/spec/search/query_spec.rb +++ b/spec/search/query_spec.rb @@ -1,5 +1,4 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path +require 'spec_helper' require 'gscraper/search/query' diff --git a/spec/search/web_query_spec.rb b/spec/search/web_query_spec.rb index 1f0fe55..f883ab7 100644 --- a/spec/search/web_query_spec.rb +++ b/spec/search/web_query_spec.rb @@ -1,9 +1,8 @@ -require 'pathname' -require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path -require Pathname(__FILE__).dirname.join('..','has_pages_examples').expand_path -require Pathname(__FILE__).dirname.join('..','page_has_results_examples').expand_path -require Pathname(__FILE__).dirname.join('..','has_sponsored_links_examples').expand_path -require Pathname(__FILE__).dirname.join('page_has_results_examples').expand_path +require 'spec_helper' +require 'has_pages_examples' +require 'page_has_results_examples' +require 'has_sponsored_links_examples' +require 'search/page_has_results_examples' require 'gscraper/search/web_query' @@ -72,4 +71,8 @@ end + it "should have atleast one similar query URL" do + @page.similar_urls.length.should_not == 0 + end + end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 67bbd08..cf548f0 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,7 +1,6 @@ require 'rubygems' gem 'rspec', '>=1.1.3' require 'spec' -require 'pathname' -require Pathname(__FILE__).dirname.join('helpers','query').expand_path -require Pathname(__FILE__).dirname.join('helpers','uri').expand_path +require 'helpers/query' +require 'helpers/uri' diff --git a/tasks/spec.rb b/tasks/spec.rb index c97455f..8273b81 100644 --- a/tasks/spec.rb +++ b/tasks/spec.rb @@ -2,6 +2,6 @@ desc "Run all specifications" Spec::Rake::SpecTask.new(:spec) do |t| - t.libs += [File.expand_path('lib'), File.expand_path('spec')] + t.libs += ['lib', 'spec'] t.spec_opts = ['--colour', '--format', 'specdoc'] end