From ab078cc47b4cfa4100143653dd1f6731c4698972 Mon Sep 17 00:00:00 2001
From: Tony Bowden
Date: Fri, 17 Jul 2015 16:22:52 +0100
Subject: [PATCH] Initial scraper

---
 .gitignore   |  6 ++++++
 Gemfile      | 15 +++++++++++++++
 Gemfile.lock | 42 ++++++++++++++++++++++++++++++++++++++++++
 README.md    |  1 +
 scraper.rb   | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 113 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Gemfile
 create mode 100644 Gemfile.lock
 create mode 100644 README.md
 create mode 100644 scraper.rb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c5add4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+
+.cache/*
+
+*.swp
+
+*.sqlite
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..cdc1a7f
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,15 @@
+# It's easy to add more libraries or choose different versions. Any libraries
+# specified here will be installed and made available to your morph.io scraper.
+# Find out more: https://morph.io/documentation/ruby
+
+source "https://rubygems.org"
+
+ruby "2.0.0"
+
+gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+gem "execjs"
+gem "pry"
+gem "colorize"
+gem "nokogiri"
+gem "open-uri-cached"
+gem "fuzzy_match"
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..9d17a96
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,42 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    coderay (1.1.0)
+    colorize (0.7.7)
+    execjs (2.5.2)
+    fuzzy_match (2.1.0)
+    httpclient (2.6.0.1)
+    method_source (0.8.2)
+    mini_portile (0.6.2)
+    nokogiri (1.6.6.2)
+      mini_portile (~> 0.6.0)
+    open-uri-cached (0.0.5)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
+    slop (3.6.0)
+    sqlite3 (1.3.10)
+    sqlite_magic (0.0.3)
+      sqlite3
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  colorize
+  execjs
+  fuzzy_match
+  nokogiri
+  open-uri-cached
+  pry
+  scraperwiki!
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e541894
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..50cded5
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,49 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'scraperwiki'
+require 'nokogiri'
+require 'open-uri'
+require 'colorize'
+
+require 'pry'
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
+# Fetch +url+ with open-uri and parse the response body as an HTML document.
+def noko_for(url)
+  Nokogiri::HTML(open(url).read)
+end
+
+# Map a party heading to [name, id]; warns and returns [nil, nil] if unknown.
+def party_info(text)
+  case text
+  when /Fiji First/i then ['Fiji First', 'FF']
+  when /SODELPA/ then ['Social Democratic Liberal Party', 'SODELPA']
+  when /NATIONAL FEDERATION PARTY/ then ['National Federation Party', 'NFP']
+  else
+    warn "Unknown party: #{text}"
+    [nil, nil]
+  end
+end
+
+# Save one row per member: every td cell with an img child is a member cell,
+# and the nearest preceding strong element is read as that member's party.
+def scrape_list(url)
+  noko_for(url).xpath('.//td[img]').each do |td|
+    party, party_id = party_info(td.xpath('preceding::strong[1]').text)
+    image = td.css('img/@src').text
+    data = {
+      name: td.text.gsub(/[[:space:]]+/, ' ').strip,
+      # Resolve relative image paths against the page URL; keep '' if absent.
+      image: image.empty? ? image : URI.join(url, image).to_s,
+      party: party,
+      party_id: party_id,
+      term: '2014',
+      source: url,
+    }
+    ScraperWiki.save_sqlite([:name, :term], data)
+  end
+end
+
+scrape_list('http://www.parliament.gov.fj/Members/Parliamentery-Parties.aspx')