Tagging GScraper version 0.2.1.
postmodern committed Aug 28, 2008
1 parent 57094a7 commit fdc6852
Showing 20 changed files with 71 additions and 130 deletions.
5 changes: 5 additions & 0 deletions History.txt
@@ -1,3 +1,8 @@
+== 0.2.1 / 2008-08-27
+
+* Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
+  Search Result HTML schema.
+
 == 0.2.0 / 2008-05-10
 
 * Removed GScraper::WebAgent.
2 changes: 1 addition & 1 deletion README.txt
@@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services.
 
   q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
 
-  q.query # =>; "ruby"
+  q.query # => "ruby"
   q.with_words # => "rails"
   q.occurrs_within # => :title
   q.rights # => :cc_by_nc
4 changes: 2 additions & 2 deletions lib/gscraper/search/ajax_query.rb
@@ -161,9 +161,9 @@ def page(page_index)
         hash['results'].each_with_index do |result,index|
           rank = rank_offset + (index + 1)
           title = Hpricot(result['title']).inner_text
-          url = result['unescapedUrl']
+          url = URI(result['unescapedUrl'])
           summary = Hpricot(result['content']).inner_text
-          cached_url = result['cacheUrl']
+          cached_url = URI(result['cacheUrl'])
 
           new_page << Result.new(rank,title,url,summary,cached_url)
         end
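Note: the `url` and `cached_url` values handed to `Result.new` above are now `URI` objects instead of raw strings, so callers can inspect URL components without re-parsing. A minimal standard-library sketch of the difference (this is also what the renamed `uri_should_be_valid` spec helper further below asserts):

    require 'uri'

    # Before this commit the result URL was a plain String; as a URI object its
    # components are available directly.
    url = URI('http://www.example.com/docs/ruby?lang=en')

    url.scheme # => "http"
    url.host   # => "www.example.com"
    url.path   # => "/docs/ruby"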
32 changes: 24 additions & 8 deletions lib/gscraper/search/query.rb
@@ -119,19 +119,15 @@ def expression
         expr = []
 
         append_modifier = lambda { |name|
-          modifier = instance_variable_get("@#{name}")
+          modifier = format_modifier(instance_variable_get("@#{name}"))
 
-          expr << "#{name}:#{modifier}" if modifier
+          expr << "#{name}:#{modifier}" unless modifier.empty?
         }
 
         append_options = lambda { |name|
-          ops = instance_variable_get("@#{name}")
+          ops = format_options(instance_variable_get("@#{name}"))
 
-          if ops.kind_of?(Array)
-            expr << "#{name}:#{ops.join(' ')}"
-          elsif ops
-            expr << "#{name}:#{ops}"
-          end
+          expr << "#{name}:#{ops}" unless ops.empty?
         }
 
         expr << @query if @query
@@ -168,6 +164,26 @@ def expression
         return expr.join(' ')
       end
 
+      protected
+
+      def format_modifier(value)
+        if value.kind_of?(Regexp)
+          return value.source
+        else
+          return value.to_s
+        end
+      end
+
+      def format_options(value)
+        if value.kind_of?(Array)
+          return value.map { |element|
+            format_modifier(element)
+          }.join(' ')
+        else
+          return format_modifier(value)
+        end
+      end
+
     end
   end
 end
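Note: the new `format_modifier`/`format_options` helpers let modifiers be given as Regexps, Symbols, Strings, Arrays, or left as nil, and still serialize them into the `name:value` expression syntax; empty results are skipped by the `unless ... empty?` guards above. A standalone sketch of the same formatting logic, pulled out of the class so it runs on its own (the sample values are hypothetical):

    # Mirrors the protected helpers added in this commit.
    def format_modifier(value)
      value.kind_of?(Regexp) ? value.source : value.to_s
    end

    def format_options(value)
      if value.kind_of?(Array)
        value.map { |element| format_modifier(element) }.join(' ')
      else
        format_modifier(value)
      end
    end

    format_modifier(/ruby/)          # => "ruby"
    format_modifier(:rails)          # => "rails"
    format_options([:ruby, /rails/]) # => "ruby rails"
    format_options(nil)              # => ""  (empty, so the guard skips the term)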
64 changes: 8 additions & 56 deletions lib/gscraper/search/web_query.rb
@@ -50,51 +50,6 @@ class WebQuery < Query
       # Results per-page
       attr_accessor :results_per_page
 
-      # Search query
-      attr_accessor :query
-
-      # Search 'link' modifier
-      attr_accessor :link
-
-      # Search 'related' modifier
-      attr_accessor :related
-
-      # Search 'info' modifier
-      attr_accessor :info
-
-      # Search 'site' modifier
-      attr_accessor :site
-
-      # Search 'filetype' modifier
-      attr_accessor :filetype
-
-      # Search 'allintitle' modifier
-      attr_accessor :allintitle
-
-      # Search 'intitle' modifier
-      attr_accessor :intitle
-
-      # Search 'allinurl' modifier
-      attr_accessor :allinurl
-
-      # Search 'inurl' modifier
-      attr_accessor :inurl
-
-      # Search 'allintext' modifier
-      attr_accessor :allintext
-
-      # Search 'intext' modifier
-      attr_accessor :intext
-
-      # Search for results containing the exact phrase
-      attr_accessor :exact_phrase
-
-      # Search for results with the words
-      attr_accessor :with_words
-
-      # Search for results with-out the words
-      attr_accessor :without_words
-
       # Search for results written in the language
       attr_accessor :language
 
@@ -119,9 +74,6 @@ class WebQuery < Query
       # Search for results within the past year
       attr_accessor :within_past_year
 
-      # Search for results containing numbers between the range
-      attr_accessor :numeric_range
-
       # Search for results where the query ocurrs within the area
      attr_accessor :occurrs_within
 
@@ -386,32 +338,32 @@ def page_url(page_index)
      def page(page_index)
        Page.new do |new_page|
          doc = @agent.get(page_url(page_index))
-          results = doc.search('//div.g')[0...@results_per_page.to_i]
+          results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
 
          rank_offset = result_offset_of(page_index)
 
          results.each_with_index do |result,index|
            rank = rank_offset + (index + 1)
            link = result.at('//a.l')
            title = link.inner_text
-            url = link.get_attribute('href')
+            url = URI(link.get_attribute('href'))
            summary_text = ''
            cached_url = nil
            similar_url = nil
 
-            if (content = (result.at('//td.j//font|//td.j/div')))
+            if (content = (result.at('//div.s|//td.j//font')))
              content.children.each do |elem|
                break if (!(elem.text?) && elem.name=='br')
 
                summary_text << elem.inner_text
              end
 
-              if (cached_link = result.at('nobr/a:first'))
-                cached_url = cached_link.get_attribute('href')
+              if (cached_link = result.at('span.gl/a:first'))
+                cached_url = URI(cached_link.get_attribute('href'))
              end
 
-              if (similar_link = result.at('nobr/a:last'))
-                similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
+              if (similar_link = result.at('span.gl/a:last'))
+                similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
              end
            end
 
@@ -445,7 +397,7 @@ def sponsored_links
        # top and side ads
        doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
          title = link.inner_text
-          url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
+          url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
 
          links << SponsoredAd.new(title,url)
        end
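Note: these selector changes are what the 0.2.1 changelog entry refers to: results are now matched under `li.g` (or `li/div.g`), summaries under `div.s`, cached/similar links under `span.gl`, and every extracted URL is wrapped in `URI()`. A minimal Hpricot sketch against a hand-written fragment in that shape (the fragment is illustrative, not captured from Google):

    require 'hpricot'
    require 'uri'

    # A made-up fragment mimicking the markup the new selectors expect.
    html = <<-HTML
    <li class="g">
      <a class="l" href="http://www.example.com/ruby">Example Ruby page</a>
      <div class="s">A short summary of the result.</div>
    </li>
    HTML

    doc = Hpricot(html)

    doc.search('//li.g').each do |result|
      link    = result.at('//a.l')
      title   = link.inner_text                  # => "Example Ruby page"
      url     = URI(link.get_attribute('href'))  # => URI for http://www.example.com/ruby
      summary = result.at('//div.s').inner_text  # => "A short summary of the result."
    end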
4 changes: 1 addition & 3 deletions lib/gscraper/sponsored_ad.rb
@@ -43,9 +43,7 @@ def initialize(title,url)
     # Returns the direct URL of the ad.
     #
     def direct_url
-      uri = URI(@url)
-
-      return (uri.query_params['adurl'] || uri.query_params['q'])
+      URI(@url.query_params['adurl'] || @url.query_params['q'])
     end
 
     #
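Note: `direct_url` now assumes `@url` is already a URI extended with `query_params` (from `gscraper/extensions/uri`), and unwraps the ad's destination from the `adurl` or `q` parameter. A rough standard-library equivalent, using a hypothetical redirect URL:

    require 'uri'
    require 'cgi'

    # Hypothetical sponsored-ad redirect of the kind direct_url unwraps.
    ad_url = URI('http://www.google.com/aclk?adurl=http%3A%2F%2Fwww.example.com%2F&sa=L')

    params     = CGI.parse(ad_url.query)
    direct_url = URI(params['adurl'].first || params['q'].first)

    direct_url.to_s # => "http://www.example.com/"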
2 changes: 1 addition & 1 deletion lib/gscraper/version.rb
@@ -21,5 +21,5 @@
 #
 
 module GScraper
-  VERSION = '0.2.0'
+  VERSION = '0.2.1'
 end
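Note: with the version bump in place, the constant can be checked directly (assuming `lib` is on the load path, as the spec task below arranges for the specs):

    require 'gscraper/version'

    GScraper::VERSION # => "0.2.1"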
3 changes: 1 addition & 2 deletions spec/extensions/uri/http_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','..','spec_helper').expand_path
+require 'spec_helper'
 
 require 'gscraper/extensions/uri'
 
3 changes: 1 addition & 2 deletions spec/extensions/uri/query_params_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','..','spec_helper').expand_path
+require 'spec_helper'
 
 require 'gscraper/extensions/uri'
 
3 changes: 1 addition & 2 deletions spec/gscraper_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
 
 require 'gscraper/gscraper'
 
3 changes: 1 addition & 2 deletions spec/has_pages_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
 
 shared_examples_for "has Pages" do
 
19 changes: 3 additions & 16 deletions spec/has_sponsored_links_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
 
 shared_examples_for "has Sponsored Links" do
 
@@ -25,15 +24,9 @@
     end
   end
 
-  it "should have non-empty URLs" do
-    @links.each_url do |url|
-      url.length.should_not == 0
-    end
-  end
-
   it "should have valid URLs" do
     @links.each_url do |url|
-      url_should_be_valid(url)
+      uri_should_be_valid(url)
     end
   end
 
@@ -43,15 +36,9 @@
     end
   end
 
-  it "should have non-empty direct URLs" do
-    @links.each_direct_url do |url|
-      url.length.should_not == 0
-    end
-  end
-
   it "should have valid direct URLs" do
     @links.each_direct_url do |url|
-      url_should_be_valid(url)
+      uri_should_be_valid(url)
     end
   end
 
3 changes: 1 addition & 2 deletions spec/helpers/uri.rb
@@ -1,7 +1,6 @@
 require 'uri'
 
-def url_should_be_valid(url)
-  uri = URI(url)
+def uri_should_be_valid(uri)
   uri.scheme.should_not be_nil
   uri.host.should_not be_nil
   uri.path.should_not be_nil
3 changes: 1 addition & 2 deletions spec/page_has_results_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('spec_helper').expand_path
+require 'spec_helper'
 
 shared_examples_for "Page has Results" do
 
9 changes: 4 additions & 5 deletions spec/search/ajax_query_spec.rb
@@ -1,8 +1,7 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
-require Pathname(__FILE__).dirname.join('..','has_pages_examples').expand_path
-require Pathname(__FILE__).dirname.join('..','page_has_results_examples').expand_path
-require Pathname(__FILE__).dirname.join('page_has_results_examples').expand_path
+require 'spec_helper'
+require 'has_pages_examples'
+require 'page_has_results_examples'
+require 'search/page_has_results_examples'
 
 require 'gscraper/search/ajax_query'
 
17 changes: 3 additions & 14 deletions spec/search/page_has_results_examples.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
+require 'spec_helper'
 
 shared_examples_for "Page has Search Results" do
 
@@ -29,24 +28,14 @@
     end
   end
 
-  it "should have non-empty URLs" do
-    @page.each_url do |url|
-      url.length.should_not == 0
-    end
-  end
-
   it "should have valid URLs" do
     @page.each_url do |url|
-      url_should_be_valid(url)
+      uri_should_be_valid(url)
     end
   end
 
   it "should have atleast one cached URL" do
-    @page.cached_urls.should_not == 0
+    @page.cached_urls.length.should_not == 0
   end
 
-  it "should have atleast one similar query URL" do
-    @page.similar_urls.should_not == 0
-  end
-
 end
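Note: the cached-URL expectation was vacuous before this change: `@page.cached_urls` returns an Array, and an Array never equals `0`, so `should_not == 0` always passed. Comparing the Array's length makes the spec meaningful (the similar-URL expectation moves to `web_query_spec.rb` below with the same fix). A quick plain-Ruby illustration:

    cached_urls = []        # a page that yielded no cached URLs

    cached_urls == 0        # => false -- the old assertion passed even for an empty page
    cached_urls.length == 0 # => true  -- the new assertion catches the empty case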
3 changes: 1 addition & 2 deletions spec/search/query_spec.rb
@@ -1,5 +1,4 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
+require 'spec_helper'
 
 require 'gscraper/search/query'
 
15 changes: 9 additions & 6 deletions spec/search/web_query_spec.rb
@@ -1,9 +1,8 @@
-require 'pathname'
-require Pathname(__FILE__).dirname.join('..','spec_helper').expand_path
-require Pathname(__FILE__).dirname.join('..','has_pages_examples').expand_path
-require Pathname(__FILE__).dirname.join('..','page_has_results_examples').expand_path
-require Pathname(__FILE__).dirname.join('..','has_sponsored_links_examples').expand_path
-require Pathname(__FILE__).dirname.join('page_has_results_examples').expand_path
+require 'spec_helper'
+require 'has_pages_examples'
+require 'page_has_results_examples'
+require 'has_sponsored_links_examples'
+require 'search/page_has_results_examples'
 
 require 'gscraper/search/web_query'
 
@@ -72,4 +71,8 @@
 
   end
 
+  it "should have atleast one similar query URL" do
+    @page.similar_urls.length.should_not == 0
+  end
+
 end
5 changes: 2 additions & 3 deletions spec/spec_helper.rb
@@ -1,7 +1,6 @@
 require 'rubygems'
 gem 'rspec', '>=1.1.3'
 require 'spec'
-require 'pathname'
 
-require Pathname(__FILE__).dirname.join('helpers','query').expand_path
-require Pathname(__FILE__).dirname.join('helpers','uri').expand_path
+require 'helpers/query'
+require 'helpers/uri'
2 changes: 1 addition & 1 deletion tasks/spec.rb
@@ -2,6 +2,6 @@
 
 desc "Run all specifications"
 Spec::Rake::SpecTask.new(:spec) do |t|
-  t.libs += [File.expand_path('lib'), File.expand_path('spec')]
+  t.libs += ['lib', 'spec']
   t.spec_opts = ['--colour', '--format', 'specdoc']
 end
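Note: the simplified `require` lines throughout the specs depend on this change: with `lib` and `spec` on the spec task's load path, `require 'spec_helper'` and `require 'helpers/uri'` resolve against those directories instead of needing `Pathname(__FILE__)` arithmetic. The same effect outside Rake, assuming the snippet is run from the project root:

    # What adding 'lib' and 'spec' to t.libs amounts to:
    $LOAD_PATH.unshift(File.expand_path('lib'), File.expand_path('spec'))

    require 'spec_helper'                # resolves to spec/spec_helper.rb
    require 'gscraper/search/web_query'  # resolves to lib/gscraper/search/web_query.rb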
