From ab078cc47b4cfa4100143653dd1f6731c4698972 Mon Sep 17 00:00:00 2001
From: Tony Bowden <tony@tmtm.com>
Date: Fri, 17 Jul 2015 16:22:52 +0100
Subject: [PATCH] Initial scraper

---
 .gitignore   |  6 ++++++
 Gemfile      | 15 +++++++++++++++
 Gemfile.lock | 42 ++++++++++++++++++++++++++++++++++++++++++
 README.md    |  1 +
 scraper.rb   | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 112 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Gemfile
 create mode 100644 Gemfile.lock
 create mode 100644 README.md
 create mode 100644 scraper.rb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c5add4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+
+.cache/*
+
+*.swp
+
+*.sqlite
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..cdc1a7f
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,15 @@
+# It's easy to add more libraries or choose different versions. Any libraries
+# specified here will be installed and made available to your morph.io scraper.
+# Find out more: https://morph.io/documentation/ruby
+
+source "https://rubygems.org"
+
+ruby "2.0.0"
+
+gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+gem "execjs"
+gem "pry"
+gem "colorize"
+gem "nokogiri"
+gem "open-uri-cached"
+gem "fuzzy_match"
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..9d17a96
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,42 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    coderay (1.1.0)
+    colorize (0.7.7)
+    execjs (2.5.2)
+    fuzzy_match (2.1.0)
+    httpclient (2.6.0.1)
+    method_source (0.8.2)
+    mini_portile (0.6.2)
+    nokogiri (1.6.6.2)
+      mini_portile (~> 0.6.0)
+    open-uri-cached (0.0.5)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
+    slop (3.6.0)
+    sqlite3 (1.3.10)
+    sqlite_magic (0.0.3)
+      sqlite3
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  colorize
+  execjs
+  fuzzy_match
+  nokogiri
+  open-uri-cached
+  pry
+  scraperwiki!
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e541894
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a scraper that runs on [Morph](https://morph.io). To get started, [see the documentation](https://morph.io/documentation).
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..50cded5
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,48 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'scraperwiki'
+require 'nokogiri'
+require 'open-uri'
+require 'colorize'
+
+require 'pry'
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache' # cache directory for open-uri/cached; this patch's .gitignore excludes .cache/*
+
+def noko_for(url) # Fetch url with open-uri's open and return the body parsed by Nokogiri::HTML.
+  Nokogiri::HTML(open(url).read) # open-uri/cached is required above; presumably repeat fetches come from .cache
+end
+
+def party_info(text) # Map heading text to [party name, party id]; warns and yields nil when no pattern matches.
+  if text =~ /Fiji First/i # the only pattern here that ignores case
+    return [ "Fiji First", "FF" ]
+  elsif text =~ /SODELPA/ # case-sensitive
+    return [ "Social Democratic Liberal Party" , "SODELPA" ]
+  elsif text =~ /NATIONAL FEDERATION PARTY/ # case-sensitive
+    return [ "National Federation Party" , "NFP" ]
+  else
+    warn "Unknown party: #{text}" # warn returns nil, so the caller's party and party_id both end up nil
+  end
+end
+
+def scrape_list(url) # Parse the page at url and save one record per <td> that has an <img> child.
+  noko = noko_for(url)
+
+  noko.xpath('.//td[img]').each do |td| # every table cell with a direct <img> child
+    party, party_id = party_info ( td.xpath('preceding::strong[1]').text ) # nearest <strong> earlier in document order
+    data = { 
+      name: td.text.gsub(/[[:space:]]+/, ' ').strip, # collapse whitespace runs to one space, then trim
+      image: td.css('img/@src').text, # raw src value; made absolute after the hash is built
+      party: party,
+      party_id: party_id,
+      term: '2014',
+      source: url,
+    }
+    data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty? # resolve against the page URL
+    # puts data
+    ScraperWiki.save_sqlite([:name, :term], data) # first argument: presumably the unique-key columns — confirm against library docs
+  end
+end
+
+scrape_list('http://www.parliament.gov.fj/Members/Parliamentery-Parties.aspx') # NOTE(review): "Parliamentery" presumably matches the live URL — confirm before changing