From e03d29ad95c097edc30940ab2e3d3182a3cb089b Mon Sep 17 00:00:00 2001
From: Tony Bowden
Date: Wed, 9 Sep 2015 11:07:50 +0100
Subject: [PATCH] initial scraper

---
 .gitignore   |  6 ++++++
 Gemfile      | 16 +++++++++++++++
 Gemfile.lock | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |  1 +
 scraper.rb   | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 133 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Gemfile
 create mode 100644 Gemfile.lock
 create mode 100644 README.md
 create mode 100644 scraper.rb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c5add4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+
+.cache/*
+
+*.swp
+
+*.sqlite
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..82392a4
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,16 @@
+# It's easy to add more libraries or choose different versions. Any libraries
+# specified here will be installed and made available to your morph.io scraper.
+# Find out more: https://morph.io/documentation/ruby
+
+source "https://rubygems.org"
+
+ruby "2.0.0"
+
+gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+gem "execjs"
+gem "pry"
+gem "colorize"
+gem "nokogiri"
+gem "open-uri-cached"
+gem "fuzzy_match"
+gem 'wikidata-client', '~> 0.0.7', require: 'wikidata'
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..2a80994
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,55 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    coderay (1.1.0)
+    colorize (0.7.7)
+    excon (0.45.4)
+    execjs (2.5.2)
+    faraday (0.9.1)
+      multipart-post (>= 1.2, < 3)
+    faraday_middleware (0.10.0)
+      faraday (>= 0.7.4, < 0.10)
+    fuzzy_match (2.1.0)
+    hashie (3.4.2)
+    httpclient (2.6.0.1)
+    method_source (0.8.2)
+    mini_portile (0.6.2)
+    multipart-post (2.0.0)
+    nokogiri (1.6.6.2)
+      mini_portile (~> 0.6.0)
+    open-uri-cached (0.0.5)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
+    slop (3.6.0)
+    sqlite3 (1.3.10)
+    sqlite_magic (0.0.3)
+      sqlite3
+    wikidata-client (0.0.7)
+      excon (~> 0.40)
+      faraday (~> 0.9)
+      faraday_middleware (~> 0.9)
+      hashie (~> 3.3)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  colorize
+  execjs
+  fuzzy_match
+  nokogiri
+  open-uri-cached
+  pry
+  scraperwiki!
+  wikidata-client (~> 0.0.7)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e541894
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..3c5c158
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,55 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'scraperwiki'
+require 'nokogiri'
+require 'colorize'
+require 'pry'
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
+class String
+  # Collapse every run of whitespace into one space and strip both ends.
+  def tidy
+    gsub(/[[:space:]]+/, ' ').strip
+  end
+end
+
+# Fetch +url+ (via open-uri/cached, cache dir set above) and parse it as HTML.
+def noko_for(url)
+  Nokogiri::HTML(open(url).read)
+end
+
+# Save one record per member row found after the table's header row at +url+,
+# keyed on [:id, :term]. Raises if that header row is missing.
+def scrape_list(url)
+  noko = noko_for(url)
+  header = noko.xpath('//tr[contains(.,"CONSTITUENCY REPRESENTED")]').last
+  raise "No members table header found at #{url}" unless header
+
+  header.xpath('following-sibling::tr').each do |tr|
+    tds = tr.css('td')
+    next if tds.count < 4
+
+    # Rows without a profile link are skipped: the link supplies the id.
+    source = tds[0].css('a/@href').text
+    next if source.empty?
+    source = URI.join(url, source).to_s
+
+    data = {
+      id: source.split('/').last.split('-').first,
+      name: tds[0].text.sub('Hon. ','').tidy,
+      constituency: tds[1].text.tidy,
+      party: tds[2].text.tidy,
+      image: tds[3].css('img/@src').text,
+      term: 2012,
+      source: source,
+    }
+    # Resolve the image path against the page URL when one is present.
+    data[:image] = URI.join(url, data[:image]).to_s unless data[:image].empty?
+
+    ScraperWiki.save_sqlite([:id, :term], data)
+  end
+end
+
+scrape_list('http://nationalassembly.gov.bz/index.php/hor-lowerhouse/present-members-house')