Browse files

* Tagging GScraper version 0.2.1.

  • Loading branch information...
1 parent 57094a7 commit fdc685284889ad4caf4adb30343b62d0919ad59e @postmodern postmodern committed Aug 28, 2008
View
5 History.txt
@@ -1,3 +1,8 @@
+== 0.2.1 / 2008-08-27
+
+* Updated XPath queries in GScraper::Search::WebQuery for the new Google (tm)
+ Search Result HTML schema.
+
== 0.2.0 / 2008-05-10
* Removed GScraper::WebAgent.
View
2 README.txt
@@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services.
q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
- q.query # =>; "ruby"
+ q.query # => "ruby"
q.with_words # => "rails"
q.occurrs_within # => :title
q.rights # => :cc_by_nc
View
4 lib/gscraper/search/ajax_query.rb
@@ -161,9 +161,9 @@ def page(page_index)
hash['results'].each_with_index do |result,index|
rank = rank_offset + (index + 1)
title = Hpricot(result['title']).inner_text
- url = result['unescapedUrl']
+ url = URI(result['unescapedUrl'])
summary = Hpricot(result['content']).inner_text
- cached_url = result['cacheUrl']
+ cached_url = URI(result['cacheUrl'])
new_page << Result.new(rank,title,url,summary,cached_url)
end
View
32 lib/gscraper/search/query.rb
@@ -119,19 +119,15 @@ def expression
expr = []
append_modifier = lambda { |name|
- modifier = instance_variable_get("@#{name}")
+ modifier = format_modifier(instance_variable_get("@#{name}"))
- expr << "#{name}:#{modifier}" if modifier
+ expr << "#{name}:#{modifier}" unless modifier.empty?
}
append_options = lambda { |name|
- ops = instance_variable_get("@#{name}")
+ ops = format_options(instance_variable_get("@#{name}"))
- if ops.kind_of?(Array)
- expr << "#{name}:#{ops.join(' ')}"
- elsif ops
- expr << "#{name}:#{ops}"
- end
+ expr << "#{name}:#{ops}" unless ops.empty?
}
expr << @query if @query
@@ -168,6 +164,26 @@ def expression
return expr.join(' ')
end
+ protected
+
+ def format_modifier(value)
+ if value.kind_of?(Regexp)
+ return value.source
+ else
+ return value.to_s
+ end
+ end
+
+ def format_options(value)
+ if value.kind_of?(Array)
+ return value.map { |element|
+ format_modifier(element)
+ }.join(' ')
+ else
+ return format_modifier(value)
+ end
+ end
+
end
end
end
View
64 lib/gscraper/search/web_query.rb
@@ -50,51 +50,6 @@ class WebQuery < Query
# Results per-page
attr_accessor :results_per_page
- # Search query
- attr_accessor :query
-
- # Search 'link' modifier
- attr_accessor :link
-
- # Search 'related' modifier
- attr_accessor :related
-
- # Search 'info' modifier
- attr_accessor :info
-
- # Search 'site' modifier
- attr_accessor :site
-
- # Search 'filetype' modifier
- attr_accessor :filetype
-
- # Search 'allintitle' modifier
- attr_accessor :allintitle
-
- # Search 'intitle' modifier
- attr_accessor :intitle
-
- # Search 'allinurl' modifier
- attr_accessor :allinurl
-
- # Search 'inurl' modifier
- attr_accessor :inurl
-
- # Search 'allintext' modifier
- attr_accessor :allintext
-
- # Search 'intext' modifier
- attr_accessor :intext
-
- # Search for results containing the exact phrase
- attr_accessor :exact_phrase
-
- # Search for results with the words
- attr_accessor :with_words
-
- # Search for results with-out the words
- attr_accessor :without_words
-
# Search for results written in the language
attr_accessor :language
@@ -119,9 +74,6 @@ class WebQuery < Query
# Search for results within the past year
attr_accessor :within_past_year
- # Search for results containing numbers between the range
- attr_accessor :numeric_range
-
# Search for results where the query occurs within the area
attr_accessor :occurrs_within
@@ -386,32 +338,32 @@ def page_url(page_index)
def page(page_index)
Page.new do |new_page|
doc = @agent.get(page_url(page_index))
- results = doc.search('//div.g')[0...@results_per_page.to_i]
+ results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
rank_offset = result_offset_of(page_index)
results.each_with_index do |result,index|
rank = rank_offset + (index + 1)
link = result.at('//a.l')
title = link.inner_text
- url = link.get_attribute('href')
+ url = URI(link.get_attribute('href'))
summary_text = ''
cached_url = nil
similar_url = nil
- if (content = (result.at('//td.j//font|//td.j/div')))
+ if (content = (result.at('//div.s|//td.j//font')))
content.children.each do |elem|
break if (!(elem.text?) && elem.name=='br')
summary_text << elem.inner_text
end
- if (cached_link = result.at('nobr/a:first'))
- cached_url = cached_link.get_attribute('href')
+ if (cached_link = result.at('span.gl/a:first'))
+ cached_url = URI(cached_link.get_attribute('href'))
end
- if (similar_link = result.at('nobr/a:last'))
- similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
+ if (similar_link = result.at('span.gl/a:last'))
+ similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
end
end
@@ -445,7 +397,7 @@ def sponsored_links
# top and side ads
doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
title = link.inner_text
- url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
+ url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
links << SponsoredAd.new(title,url)
end
View
4 lib/gscraper/sponsored_ad.rb
@@ -43,9 +43,7 @@ def initialize(title,url)
# Returns the direct URL of the ad.
#
def direct_url
- uri = URI(@url)
-
- return (uri.query_params['adurl'] || uri.query_params['q'])
+ URI(@url.query_params['adurl'] || @url.query_params['q'])
end
#
View
2 lib/gscraper/version.rb
@@ -21,5 +21,5 @@
#
module GScraper
- VERSION = '0.2.0'
+ VERSION = '0.2.1'
end
View
3 spec/extensions/uri/http_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','..','spec_helper').expand_path
+require 'spec_helper'
require 'gscraper/extensions/uri'
View
3 spec/extensions/uri/query_params_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','..','spec_helper').expand_path
+require 'spec_helper'
require 'gscraper/extensions/uri'
View
3 spec/gscraper_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
require 'gscraper/gscraper'
View
3 spec/has_pages_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
shared_examples_for "has Pages" do
View
19 spec/has_sponsored_links_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
shared_examples_for "has Sponsored Links" do
@@ -25,15 +24,9 @@
end
end
- it "should have non-empty URLs" do
- @links.each_url do |url|
- url.length.should_not == 0
- end
- end
-
it "should have valid URLs" do
@links.each_url do |url|
- url_should_be_valid(url)
+ uri_should_be_valid(url)
end
end
@@ -43,15 +36,9 @@
end
end
- it "should have non-empty direct URLs" do
- @links.each_direct_url do |url|
- url.length.should_not == 0
- end
- end
-
it "should have valid direct URLs" do
@links.each_direct_url do |url|
- url_should_be_valid(url)
+ uri_should_be_valid(url)
end
end
View
3 spec/helpers/uri.rb
@@ -1,7 +1,6 @@
require 'uri'
-def url_should_be_valid(url)
- uri = URI(url)
+def uri_should_be_valid(uri)
uri.scheme.should_not be_nil
uri.host.should_not be_nil
uri.path.should_not be_nil
View
3 spec/page_has_results_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
shared_examples_for "Page has Results" do
View
9 spec/search/ajax_query_spec.rb
@@ -1,8 +1,7 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
-require Pathname(__FILE__).dirname.join('..','has_pages_examples').expand_path
-require Pathname(__FILE__).dirname.join('..','page_has_results_examples').expand_path
-require Pathname(__FILE__).dirname.join('page_has_results_examples').expand_path
+require 'spec_helper'
+require 'has_pages_examples'
+require 'page_has_results_examples'
+require 'search/page_has_results_examples'
require 'gscraper/search/ajax_query'
View
17 spec/search/page_has_results_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
+require 'spec_helper'
shared_examples_for "Page has Search Results" do
@@ -29,24 +28,14 @@
end
end
- it "should have non-empty URLs" do
- @page.each_url do |url|
- url.length.should_not == 0
- end
- end
-
it "should have valid URLs" do
@page.each_url do |url|
- url_should_be_valid(url)
+ uri_should_be_valid(url)
end
end
it "should have atleast one cached URL" do
- @page.cached_urls.should_not == 0
- end
-
- it "should have atleast one similar query URL" do
- @page.similar_urls.should_not == 0
+ @page.cached_urls.length.should_not == 0
end
end
View
3 spec/search/query_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
+require 'spec_helper'
require 'gscraper/search/query'
View
15 spec/search/web_query_spec.rb
@@ -1,9 +1,8 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
-require Pathname(__FILE__).dirname.join('..','has_pages_examples').expand_path
-require Pathname(__FILE__).dirname.join('..','page_has_results_examples').expand_path
-require Pathname(__FILE__).dirname.join('..','has_sponsored_links_examples').expand_path
-require Pathname(__FILE__).dirname.join('page_has_results_examples').expand_path
+require 'spec_helper'
+require 'has_pages_examples'
+require 'page_has_results_examples'
+require 'has_sponsored_links_examples'
+require 'search/page_has_results_examples'
require 'gscraper/search/web_query'
@@ -72,4 +71,8 @@
end
+ it "should have atleast one similar query URL" do
+ @page.similar_urls.length.should_not == 0
+ end
+
end
View
5 spec/spec_helper.rb
@@ -1,7 +1,6 @@
require 'rubygems'
gem 'rspec', '>=1.1.3'
require 'spec'
-require 'pathname'
-require Pathname(__FILE__).dirname.join('helpers','query').expand_path
-require Pathname(__FILE__).dirname.join('helpers','uri').expand_path
+require 'helpers/query'
+require 'helpers/uri'
View
2 tasks/spec.rb
@@ -2,6 +2,6 @@
desc "Run all specifications"
Spec::Rake::SpecTask.new(:spec) do |t|
- t.libs += [File.expand_path('lib'), File.expand_path('spec')]
+ t.libs += ['lib', 'spec']
t.spec_opts = ['--colour', '--format', 'specdoc']
end

0 comments on commit fdc6852

Please sign in to comment.