From 38f75ffe41f2fa29cf70f5745d1091dbb2754318 Mon Sep 17 00:00:00 2001
From: Tony Bowden
Date: Mon, 7 Sep 2015 09:24:23 +0100
Subject: [PATCH] Initial scraper

---
 .gitignore   |  6 ++++++
 Gemfile      | 16 +++++++++++++++
 Gemfile.lock | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |  1 +
 scraper.rb   | 38 ++++++++++++++++++++++++++++++++++++
 5 files changed, 116 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Gemfile
 create mode 100644 Gemfile.lock
 create mode 100644 README.md
 create mode 100644 scraper.rb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c5add4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+
+.cache/*
+
+*.swp
+
+*.sqlite
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..82392a4
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,16 @@
+# It's easy to add more libraries or choose different versions. Any libraries
+# specified here will be installed and made available to your morph.io scraper.
+# Find out more: https://morph.io/documentation/ruby
+
+source "https://rubygems.org"
+
+ruby "2.0.0"
+
+gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+gem "execjs"
+gem "pry"
+gem "colorize"
+gem "nokogiri"
+gem "open-uri-cached"
+gem "fuzzy_match"
+gem 'wikidata-client', '~> 0.0.7', require: 'wikidata'
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..2a80994
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,55 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    coderay (1.1.0)
+    colorize (0.7.7)
+    excon (0.45.4)
+    execjs (2.5.2)
+    faraday (0.9.1)
+      multipart-post (>= 1.2, < 3)
+    faraday_middleware (0.10.0)
+      faraday (>= 0.7.4, < 0.10)
+    fuzzy_match (2.1.0)
+    hashie (3.4.2)
+    httpclient (2.6.0.1)
+    method_source (0.8.2)
+    mini_portile (0.6.2)
+    multipart-post (2.0.0)
+    nokogiri (1.6.6.2)
+      mini_portile (~> 0.6.0)
+    open-uri-cached (0.0.5)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
+    slop (3.6.0)
+    sqlite3 (1.3.10)
+    sqlite_magic (0.0.3)
+      sqlite3
+    wikidata-client (0.0.7)
+      excon (~> 0.40)
+      faraday (~> 0.9)
+      faraday_middleware (~> 0.9)
+      hashie (~> 3.3)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  colorize
+  execjs
+  fuzzy_match
+  nokogiri
+  open-uri-cached
+  pry
+  scraperwiki!
+  wikidata-client (~> 0.0.7)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e541894
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..c7f0cdf
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,38 @@
+#!/bin/env ruby
+# encoding: utf-8
+
+require 'scraperwiki'
+require 'nokogiri'
+require 'colorize'
+require 'pry'
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
+class String
+  def tidy
+    self.gsub(/[[:space:]]+/, ' ').strip
+  end
+end
+
+def noko_for(url)
+  Nokogiri::HTML(open(url).read)
+end
+
+def scrape_list(url)
+  warn url
+  noko = noko_for(url)
+  noko.css('div.ngg-gallery-thumbnail a').each do |a|
+    data = {
+      id: a.attr('data-image-id'),
+      name: a.attr('data-title'),
+      image: a.attr('data-src'),
+    }
+    ScraperWiki.save_sqlite([:id, :name], data)
+  end
+
+  unless (next_page = noko.css('div.ngg-navigation a.next/@href')).empty?
+    scrape_list(next_page.text) rescue binding.pry
+  end
+end
+
+scrape_list('http://www.parliament.gov.ws/new/members-of-parliament/member/')
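
A minimal sketch for reviewers, separate from the patch itself: it shows what the two selectors in scrape_list pick up, run against an invented NextGEN-gallery-style fragment rather than the live parliament.gov.ws page. Only the nokogiri gem from the Gemfile above is assumed; the image id, title, and URLs below are made up for illustration.

require 'nokogiri'

# Hypothetical markup in the shape the scraper expects: one gallery thumbnail
# plus a "next page" link inside the ngg-navigation block.
html = <<-HTML
  <div class="ngg-gallery-thumbnail">
    <a data-image-id="42" data-title="Example Member" data-src="http://example.org/42.jpg">
      <img src="http://example.org/42-thumb.jpg">
    </a>
  </div>
  <div class="ngg-navigation">
    <a class="next" href="http://example.org/page/2/">Next</a>
  </div>
HTML

noko = Nokogiri::HTML(html)

# Same member extraction as scrape_list: each thumbnail link carries the
# data-* attributes that become the id, name, and image columns.
noko.css('div.ngg-gallery-thumbnail a').each do |a|
  p id: a.attr('data-image-id'), name: a.attr('data-title'), image: a.attr('data-src')
end

# Same pagination lookup as scrape_list: the trailing /@href uses Nokogiri's
# non-standard attribute-selector extension (as relied on by the patch, with
# nokogiri 1.6.6.2 pinned in Gemfile.lock). An empty NodeSet on the last page
# is what ends the recursion.
next_page = noko.css('div.ngg-navigation a.next/@href')
puts next_page.text unless next_page.empty?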