diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c5add4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ + +.cache/* + +*.swp + +*.sqlite diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..82392a4 --- /dev/null +++ b/Gemfile @@ -0,0 +1,16 @@ +# It's easy to add more libraries or choose different versions. Any libraries +# specified here will be installed and made available to your morph.io scraper. +# Find out more: https://morph.io/documentation/ruby + +source "https://rubygems.org" + +ruby "2.0.0" + +gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults" +gem "execjs" +gem "pry" +gem "colorize" +gem "nokogiri" +gem "open-uri-cached" +gem "fuzzy_match" +gem 'wikidata-client', '~> 0.0.7', require: 'wikidata' diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..2a80994 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,55 @@ +GIT + remote: https://github.com/openaustralia/scraperwiki-ruby.git + revision: fc50176812505e463077d5c673d504a6a234aa78 + branch: morph_defaults + specs: + scraperwiki (3.0.1) + httpclient + sqlite_magic + +GEM + remote: https://rubygems.org/ + specs: + coderay (1.1.0) + colorize (0.7.7) + excon (0.45.4) + execjs (2.5.2) + faraday (0.9.1) + multipart-post (>= 1.2, < 3) + faraday_middleware (0.10.0) + faraday (>= 0.7.4, < 0.10) + fuzzy_match (2.1.0) + hashie (3.4.2) + httpclient (2.6.0.1) + method_source (0.8.2) + mini_portile (0.6.2) + multipart-post (2.0.0) + nokogiri (1.6.6.2) + mini_portile (~> 0.6.0) + open-uri-cached (0.0.5) + pry (0.10.1) + coderay (~> 1.1.0) + method_source (~> 0.8.1) + slop (~> 3.4) + slop (3.6.0) + sqlite3 (1.3.10) + sqlite_magic (0.0.3) + sqlite3 + wikidata-client (0.0.7) + excon (~> 0.40) + faraday (~> 0.9) + faraday_middleware (~> 0.9) + hashie (~> 3.3) + +PLATFORMS + ruby + +DEPENDENCIES + colorize + execjs + fuzzy_match + nokogiri + open-uri-cached + pry + scraperwiki! 
+  wikidata-client (~> 0.0.7)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e541894
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..6a84363
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'scraperwiki'
+require 'nokogiri'
+require 'colorize'
+require 'pry'
+require 'open-uri/cached'
+
+# Cache directory handed to open-uri/cached.
+OpenURI::Cache.cache_path = '.cache'
+
+class String
+  # Collapse every run of whitespace into a single space and trim both ends.
+  def tidy
+    gsub(/[[:space:]]+/, ' ').strip
+  end
+end
+
+# Fetch +url+ and parse the response body as an HTML document.
+def noko_for(url)
+  Nokogiri::HTML(open(url).read)
+end
+
+# Save one record (name, image, email) for every <figure> on the page at +url+.
+def scrape_list(url)
+  noko = noko_for(url)
+  noko.css('figure').each do |figure|
+    data = {
+      name: figure.text.tidy,
+      image: figure.css('a/@href').text,
+      # Join the text nodes after the first <strong> that follows the figure,
+      # then keep the first whitespace-separated token containing an "@".
+      email: figure.xpath('following::strong[1]/following::text()').text.tidy.split(/\s/).find { |t| t.include?('@') },
+    }
+    # [:name, :image] is passed to save_sqlite as the unique-key list.
+    ScraperWiki.save_sqlite([:name, :image], data)
+  end
+end
+
+scrape_list('http://www.sainthelena.gov.sh/your-council/')