Permalink
Browse files

initial scraper

  • Loading branch information...
tmtmtmtm committed Aug 31, 2015
0 parents commit f1ac6ff0e1baec705803d0d31a13fe017132f018
Showing with 151 additions and 0 deletions.
  1. +6 −0 .gitignore
  2. +16 −0 Gemfile
  3. +55 −0 Gemfile.lock
  4. +1 −0 README.md
  5. +73 −0 scraper.rb
@@ -0,0 +1,6 @@
.cache/*
*.swp
*.sqlite
16 Gemfile
@@ -0,0 +1,16 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby
source "https://rubygems.org"
# Ruby version this bundle declares.
ruby "2.0.0"
# ScraperWiki is taken from the openaustralia fork's morph_defaults branch
# instead of a rubygems release.
gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "execjs"
gem "pry"
gem "colorize"
gem "nokogiri"
gem "open-uri-cached"
gem "fuzzy_match"
# Published as wikidata-client but loaded via `require 'wikidata'`.
gem 'wikidata-client', '~> 0.0.7', require: 'wikidata'
@@ -0,0 +1,55 @@
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic
GEM
remote: https://rubygems.org/
specs:
coderay (1.1.0)
colorize (0.7.7)
excon (0.45.4)
execjs (2.5.2)
faraday (0.9.1)
multipart-post (>= 1.2, < 3)
faraday_middleware (0.10.0)
faraday (>= 0.7.4, < 0.10)
fuzzy_match (2.1.0)
hashie (3.4.2)
httpclient (2.6.0.1)
method_source (0.8.2)
mini_portile (0.6.2)
multipart-post (2.0.0)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
open-uri-cached (0.0.5)
pry (0.10.1)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
wikidata-client (0.0.7)
excon (~> 0.40)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
hashie (~> 3.3)
PLATFORMS
ruby
DEPENDENCIES
colorize
execjs
fuzzy_match
nokogiri
open-uri-cached
pry
scraperwiki!
wikidata-client (~> 0.0.7)
@@ -0,0 +1 @@
This is a scraper that runs on [Morph](https://morph.io). To get started, [see the documentation](https://morph.io/documentation).
@@ -0,0 +1,73 @@
#!/bin/env ruby
# encoding: utf-8
require 'scraperwiki'
require 'nokogiri'
require 'open-uri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
class String
  # Returns a copy with every run of whitespace (Unicode-aware, via the
  # [[:space:]] class) squeezed down to one plain space, and with leading
  # and trailing whitespace removed.
  def tidy
    squeezed = gsub(/[[:space:]]+/, ' ')
    squeezed.strip
  end
end
# Fetch +url+ and parse the response body as HTML.
#
# url - a String or URI that open-uri can open.
#
# Returns the parsed Nokogiri HTML document.
#
# The block form of +open+ is used so the underlying handle is closed as
# soon as the body has been read, rather than being left open.
# NOTE: this relies on open-uri extending Kernel#open to accept URLs, which
# Ruby 3.0 removed (URI.open is needed there).
def noko_for(url)
  Nokogiri::HTML(open(url, &:read))
  # Nokogiri::HTML(open(url, &:read), nil, 'utf-8')
end
# Split text of the form "Party Name (ABBR)" into its two parts.
#
# text - the String to parse.
#
# Returns a two-element Array: [text before the parentheses, text inside them].
# Raises RuntimeError when +text+ has no whitespace-then-parenthesised section.
def party_from(text)
  found = text.match(/(.*?)\s+\((.*?)\)/)
  raise "No party in #{text}" unless found

  found.captures
end
# Walk the listing page at +url+ and scrape every candidate page it links to.
#
# Only the second table inside div#TabbedPanels1 is considered; each distinct
# href containing "candidates/" is resolved against +url+ and handed to
# scrape_person.
def scrape_list(url)
  page = noko_for(url)
  tables = page.css('div#TabbedPanels1 table')
  candidate_hrefs = tables[1].css('a[href*="candidates/"]/@href').map(&:text).uniq
  candidate_hrefs.each { |href| scrape_person(URI.join(url, href)) }
end
# Scrape one candidate page and save a row describing the member.
#
# url - the candidate page address (String or URI).
#
# The row holds id (final path segment of the URL with its extension
# removed), name, image, area, area_id, party, party_id, the hard-coded
# term 8 and the source URL. It is saved with [:id, :term] as the unique keys.
#
# Raises RuntimeError (from party_from) when the party cell cannot be parsed.
def scrape_person(url)
  noko = noko_for(url)
  puts url

  # Read each labelled cell once; area feeds both :area and :area_id.
  area = noko.xpath('//td[span[contains(.,"Constituency")]]/following-sibling::td').text
  party, party_id = party_from(noko.xpath('//td[span[contains(.,"Party")]]/following-sibling::td').text)
  headline = noko.css('.news_headline')

  # The photo is the last <img> before the headline. A page without one
  # yields nil here instead of raising NoMethodError on nil.
  image_attr = headline.xpath('preceding::img/@src').last
  image = image_attr && image_attr.text

  data = {
    id: url.to_s.split('/').last.sub(/\..*/,''),
    name: headline.text.tidy,
    image: image,
    area: area,
    area_id: "ocd-division/country:vc/constituency:%s" % area.downcase.tr(' ','-'),
    party: party,
    party_id: party_id,
    term: 8,
    source: url.to_s,
  }
  # Image paths are resolved against the page URL; a missing image stays as is.
  data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty?
  ScraperWiki.save_sqlite([:id, :term], data)
end
# Record the term the scraped members belong to (saved under 'terms', keyed
# on :id), then crawl the listing page.
ScraperWiki.save_sqlite(
  [:id],
  {
    id: 8,
    name: "8th Vincentian Assembly",
    start_date: 2010,
    source: "https://en.wikipedia.org/wiki/House_of_Assembly_of_Saint_Vincent_and_the_Grenadines",
  },
  'terms'
)
scrape_list('http://www.caribbeanelections.com/vc/default.asp')

0 comments on commit f1ac6ff

Please sign in to comment.