From 38f75ffe41f2fa29cf70f5745d1091dbb2754318 Mon Sep 17 00:00:00 2001
From: Tony Bowden
Date: Mon, 7 Sep 2015 09:24:23 +0100
Subject: [PATCH] Initial scraper

---
 .gitignore   |  6 ++++++
 Gemfile      | 16 +++++++++++++++
 Gemfile.lock | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |  1 +
 scraper.rb   | 38 ++++++++++++++++++++++++++++++++++++
 5 files changed, 116 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Gemfile
 create mode 100644 Gemfile.lock
 create mode 100644 README.md
 create mode 100644 scraper.rb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c5add4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+
+.cache/*
+
+*.swp
+
+*.sqlite
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..82392a4
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,16 @@
+# It's easy to add more libraries or choose different versions. Any libraries
+# specified here will be installed and made available to your morph.io scraper.
+# Find out more: https://morph.io/documentation/ruby
+
+source "https://rubygems.org"
+
+ruby "2.0.0"
+
+gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+gem "execjs"
+gem "pry"
+gem "colorize"
+gem "nokogiri"
+gem "open-uri-cached"
+gem "fuzzy_match"
+gem 'wikidata-client', '~> 0.0.7', require: 'wikidata'
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..2a80994
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,55 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    coderay (1.1.0)
+    colorize (0.7.7)
+    excon (0.45.4)
+    execjs (2.5.2)
+    faraday (0.9.1)
+      multipart-post (>= 1.2, < 3)
+    faraday_middleware (0.10.0)
+      faraday (>= 0.7.4, < 0.10)
+    fuzzy_match (2.1.0)
+    hashie (3.4.2)
+    httpclient (2.6.0.1)
+    method_source (0.8.2)
+    mini_portile (0.6.2)
+    multipart-post (2.0.0)
+    nokogiri (1.6.6.2)
+      mini_portile (~> 0.6.0)
+    open-uri-cached (0.0.5)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
+    slop (3.6.0)
+    sqlite3 (1.3.10)
+    sqlite_magic (0.0.3)
+      sqlite3
+    wikidata-client (0.0.7)
+      excon (~> 0.40)
+      faraday (~> 0.9)
+      faraday_middleware (~> 0.9)
+      hashie (~> 3.3)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  colorize
+  execjs
+  fuzzy_match
+  nokogiri
+  open-uri-cached
+  pry
+  scraperwiki!
+  wikidata-client (~> 0.0.7)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e541894
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..c7f0cdf
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,38 @@
+#!/bin/env ruby
+# encoding: utf-8
+
+require 'scraperwiki'
+require 'nokogiri'
+require 'colorize'
+require 'pry'
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
+class String
+  def tidy
+    self.gsub(/[[:space:]]+/, ' ').strip
+  end
+end
+
+def noko_for(url)
+  Nokogiri::HTML(open(url).read)
+end
+
+def scrape_list(url)
+  warn url
+  noko = noko_for(url)
+  noko.css('div.ngg-gallery-thumbnail a').each do |a|
+    data = {
+      id: a.attr('data-image-id'),
+      name: a.attr('data-title'),
+      image: a.attr('data-src'),
+    }
+    ScraperWiki.save_sqlite([:id, :name], data)
+  end
+
+  unless (next_page = noko.css('div.ngg-navigation a.next/@href')).empty?
+    scrape_list(next_page.text) rescue binding.pry
+  end
+end
+
+scrape_list('http://www.parliament.gov.ws/new/members-of-parliament/member/')
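
A minimal sketch for reviewers, separate from the patch itself: it shows what the two selectors in scrape_list pick up, run against an invented NextGEN-gallery-style fragment rather than the live parliament.gov.ws page. Only the nokogiri gem from the Gemfile above is assumed; the image id, title, and URLs below are made up for illustration.

require 'nokogiri'

# Hypothetical markup in the shape the scraper expects: one gallery thumbnail
# plus a "next page" link inside the ngg-navigation block.
html = <<-HTML
  <div class="ngg-gallery-thumbnail">
    <a data-image-id="42" data-title="Example Member" data-src="http://example.org/42.jpg">
      <img src="http://example.org/42-thumb.jpg">
    </a>
  </div>
  <div class="ngg-navigation">
    <a class="next" href="http://example.org/page/2/">Next</a>
  </div>
HTML

noko = Nokogiri::HTML(html)

# Same member extraction as scrape_list: each thumbnail link carries the
# data-* attributes that become the id, name, and image columns.
noko.css('div.ngg-gallery-thumbnail a').each do |a|
  p id: a.attr('data-image-id'), name: a.attr('data-title'), image: a.attr('data-src')
end

# Same pagination lookup as scrape_list: the trailing /@href uses Nokogiri's
# non-standard attribute-selector extension (as relied on by the patch, with
# nokogiri 1.6.6.2 pinned in Gemfile.lock). An empty NodeSet on the last page
# is what ends the recursion.
next_page = noko.css('div.ngg-navigation a.next/@href')
puts next_page.text unless next_page.empty?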