From ce3988d4c8b628b1d3fa8254eed07ff28bf2aa82 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 10:47:42 +0000 Subject: [PATCH 1/8] Modernise Gemfile --- Gemfile | 32 +++++++++------ Gemfile.lock | 107 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 121 insertions(+), 18 deletions(-) diff --git a/Gemfile b/Gemfile index 8aef5ee80..113a9225d 100644 --- a/Gemfile +++ b/Gemfile @@ -1,14 +1,24 @@ -# It's easy to add more libraries or choose different versions. Any libraries -# specified here will be installed and made available to your morph.io scraper. -# Find out more: https://morph.io/documentation/ruby +# frozen_string_literal: true -source "https://rubygems.org" +source 'https://rubygems.org' -ruby "2.0.0" +ruby '2.3.3' -gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults" -gem "execjs" -gem "pry" -gem "colorize" -gem "nokogiri" -gem "open-uri-cached" +git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" } + +gem 'minitest' +gem 'minitest-around' +gem 'minitest-vcr' +gem 'nokogiri' +gem 'open-uri-cached' +gem 'pry' +gem 'rake' +gem 'rest-client' +gem 'rubocop' +gem 'scraped', github: 'everypolitician/scraped' +gem 'scraped_page_archive', github: 'everypolitician/scraped_page_archive' +gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby', + branch: 'morph_defaults' +gem 'table_unspanner', github: 'everypolitician/table_unspanner' +gem 'vcr' +gem 'webmock' diff --git a/Gemfile.lock b/Gemfile.lock index 26934250c..d0b84cfb7 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,3 +1,27 @@ +GIT + remote: https://github.com/everypolitician/scraped.git + revision: c3fc06e1c44beb9027a4dbe4c8bb471d803b27ce + specs: + scraped (0.2.0) + field_serializer (>= 0.3.0) + nokogiri + require_all + +GIT + remote: https://github.com/everypolitician/scraped_page_archive.git + revision: 28f93d74b1c11ef01463ad0e7f874050d2e7fc73 + specs: + scraped_page_archive (0.5.0) + git (~> 1.3.0) + vcr-archive (~> 0.3.0) + +GIT + remote: https://github.com/everypolitician/table_unspanner.git + revision: a70a98a104a75b470f4ea339fdd728366a40b4d8 + specs: + table_unspanner (0.1.0) + nokogiri + GIT remote: https://github.com/openaustralia/scraperwiki-ruby.git revision: fc50176812505e463077d5c673d504a6a234aa78 @@ -10,31 +34,100 @@ GIT GEM remote: https://rubygems.org/ specs: + addressable (2.5.0) + public_suffix (~> 2.0, >= 2.0.2) + ast (2.3.0) coderay (1.1.0) - colorize (0.7.7) - execjs (2.5.2) + crack (0.4.3) + safe_yaml (~> 1.0.0) + domain_name (0.5.20161129) + unf (>= 0.0.5, < 1.0.0) + field_serializer (0.3.0) + git (1.3.0) + hashdiff (0.3.1) + http-cookie (1.0.3) + domain_name (~> 0.5) httpclient (2.6.0.1) method_source (0.8.2) - mini_portile (0.6.2) - nokogiri (1.6.6.2) - mini_portile (~> 0.6.0) + mime-types (3.1) + mime-types-data (~> 3.2015) + mime-types-data (3.2016.0521) + mini_portile2 (2.1.0) + minispec-metadata (2.0.0) + minitest + minitest (5.10.1) + minitest-around (0.4.0) + minitest (~> 5.0) + minitest-vcr (1.4.0) + minispec-metadata (~> 2.0) + minitest (>= 4.7.5) + vcr (>= 2.9) + netrc (0.11.0) + nokogiri (1.7.0.1) + mini_portile2 (~> 2.1.0) open-uri-cached (0.0.5) + parser (2.3.3.1) + ast (~> 2.2) + powerpack (0.1.1) pry (0.10.1) coderay (~> 1.1.0) method_source (~> 0.8.1) slop (~> 3.4) + public_suffix (2.0.4) + rainbow (2.1.0) + rake (12.0.0) + require_all (1.3.3) + rest-client (2.0.0) + http-cookie (>= 1.0.2, < 2.0) + mime-types (>= 1.16, < 4.0) + netrc (~> 0.8) + rubocop (0.46.0) + parser (>= 2.3.1.1, < 3.0) + powerpack (~> 0.1) + rainbow (>= 1.99.1, < 3.0) + ruby-progressbar (~> 1.7) + unicode-display_width (~> 1.0, >= 1.0.1) + ruby-progressbar (1.8.1) + safe_yaml (1.0.4) slop (3.6.0) sqlite3 (1.3.10) sqlite_magic (0.0.3) sqlite3 + unf (0.1.4) + unf_ext + unf_ext (0.0.7.2) + unicode-display_width (1.1.2) + vcr (3.0.3) + vcr-archive (0.3.0) + vcr (~> 3.0.2) + webmock (~> 2.0.3) + webmock (2.0.3) + addressable (>= 2.3.6) + crack (>= 0.3.2) + hashdiff PLATFORMS ruby DEPENDENCIES - colorize - execjs + minitest + minitest-around + minitest-vcr nokogiri open-uri-cached pry + rake + rest-client + rubocop + scraped! + scraped_page_archive! scraperwiki! + table_unspanner! + vcr + webmock + +RUBY VERSION + ruby 2.3.3p222 + +BUNDLED WITH + 1.13.6 From fe2b7ef2fea4840c90e715951e3cfb5a4ba524c0 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 10:47:43 +0000 Subject: [PATCH 2/8] Initial rubocop configuration --- .rubocop.yml | 9 +++++++++ .rubocop_todo.yml | 0 2 files changed, 9 insertions(+) create mode 100644 .rubocop.yml create mode 100644 .rubocop_todo.yml diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 000000000..e520a3d46 --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,9 @@ +AllCops: + Exclude: + - 'Vagrantfile' + - 'vendor/**/*' + TargetRubyVersion: 2.3 + +inherit_from: + - https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml + - .rubocop_todo.yml diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml new file mode 100644 index 000000000..e69de29bb From 2844d36f4b1cc18c3932ecd1d652767bf2b674c5 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 10:47:46 +0000 Subject: [PATCH 3/8] rubocop -a --- scraper.rb | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/scraper.rb b/scraper.rb index 523437445..6d7c497fa 100644 --- a/scraper.rb +++ b/scraper.rb @@ -1,5 +1,6 @@ #!/bin/env ruby # encoding: utf-8 +# frozen_string_literal: true require 'scraperwiki' require 'nokogiri' @@ -10,23 +11,23 @@ OpenURI::Cache.cache_path = '.cache' def noko_for(url) - Nokogiri::HTML(open(url).read) + Nokogiri::HTML(open(url).read) end class String def tidy - self.gsub(/[[:space:]]+/, ' ').strip + gsub(/[[:space:]]+/, ' ').strip end end def scrape_list(url) noko = noko_for(url) links = noko.css('#block-views-ledamoter-pages-party-block a[href*="ledamoter"]/@href').map(&:text) - raise "No links found" unless links.any? + raise 'No links found' unless links.any? links.each do |link| scrape_mp(URI.join(url, link)) end - end +end def scrape_mp(url) noko = noko_for(url) @@ -36,19 +37,19 @@ def scrape_mp(url) contact = noko.css('#block-views-ledamot-detaljer-block-1') named = ->(t) { box.xpath("(.//strong[contains(.,'#{t}')] | .//b[contains(.,'#{t}')])/following-sibling::text()") } - data = { - id: url.to_s.split("/").last, - name: noko.css('h1').text.tidy, - image: noko.css('img[typeof="foaf:Image"]/@src').text.sub(/\?.*/,''), - email: contact.css('a[href*="mailto:"]').text, - phone: contact.css('a[href*="tel:"]').text, - party: om.xpath('.//span[strong[.="Lagtingsgrupp:"]]/following-sibling::span//text()').first.text.tidy, + data = { + id: url.to_s.split('/').last, + name: noko.css('h1').text.tidy, + image: noko.css('img[typeof="foaf:Image"]/@src').text.sub(/\?.*/, ''), + email: contact.css('a[href*="mailto:"]').text, + phone: contact.css('a[href*="tel:"]').text, + party: om.xpath('.//span[strong[.="Lagtingsgrupp:"]]/following-sibling::span//text()').first.text.tidy, birth_date: om.xpath('.//span[strong[.="Född:"]]/following-sibling::span//text()').first.text.tidy, - term: 2015, - source: url.to_s, + term: 2015, + source: url.to_s, } data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty? - ScraperWiki.save_sqlite([:id, :term], data) + ScraperWiki.save_sqlite(%i(id term), data) end scrape_list('http://www.lagtinget.ax/ledamoter') From 620b5801f99bf8ea9339e7f19b4508ecc5928b08 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 10:48:33 +0000 Subject: [PATCH 4/8] Set up a rubocop TODO file These are based on existing problems, and should be removed over time --- .rubocop_todo.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml index e69de29bb..9735260d8 100644 --- a/.rubocop_todo.yml +++ b/.rubocop_todo.yml @@ -0,0 +1,21 @@ +# This configuration was generated by +# `rubocop --auto-gen-config` +# on 2017-01-08 10:48:32 +0000 using RuboCop version 0.46.0. +# The point is for the user to remove these configuration records +# one by one as the offenses are removed from the code base. +# Note that changes in the inspected code, or installation of new +# versions of RuboCop, may require this file to be generated again. + +# Offense count: 1 +Lint/UselessAssignment: + Exclude: + - 'scraper.rb' + +# Offense count: 1 +Metrics/AbcSize: + Max: 36 + +# Offense count: 1 +# Configuration parameters: CountComments. +Metrics/MethodLength: + Max: 18 From 8f3e3b6b797232bf0e3ffc270589d4abd7005650 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 10:48:33 +0000 Subject: [PATCH 5/8] Add a 'rake rubocop' and configure travis to use it --- .travis.yml | 5 +++++ Rakefile | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 .travis.yml create mode 100644 Rakefile diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..bd98a6c89 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: ruby +rvm: + - 2.3.3 +sudo: false +cache: bundler diff --git a/Rakefile b/Rakefile new file mode 100644 index 000000000..f02c220d5 --- /dev/null +++ b/Rakefile @@ -0,0 +1,6 @@ +# frozen_string_literal: true +require 'rubocop/rake_task' + +RuboCop::RakeTask.new + +task default: %w(rubocop) From 20942d45f47e94e92bb9b7c30b98a52b5c4bd128 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 11:21:24 +0000 Subject: [PATCH 6/8] Clean out database on each run --- scraper.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/scraper.rb b/scraper.rb index 6d7c497fa..ef0344d3e 100644 --- a/scraper.rb +++ b/scraper.rb @@ -52,4 +52,5 @@ def scrape_mp(url) ScraperWiki.save_sqlite(%i(id term), data) end +ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil scrape_list('http://www.lagtinget.ax/ledamoter') From a64db9a6b3fa3000e3d13e5109fdad58c05b3a36 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 11:21:40 +0000 Subject: [PATCH 7/8] require Scraped We don't actually _use_ it yet, but it provides us with String.tidy and Nokogiri --- scraper.rb | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/scraper.rb b/scraper.rb index ef0344d3e..acf5cbc35 100644 --- a/scraper.rb +++ b/scraper.rb @@ -2,11 +2,10 @@ # encoding: utf-8 # frozen_string_literal: true +require 'pry' +require 'scraped' require 'scraperwiki' -require 'nokogiri' -require 'open-uri' -require 'pry' require 'open-uri/cached' OpenURI::Cache.cache_path = '.cache' @@ -14,12 +13,6 @@ def noko_for(url) Nokogiri::HTML(open(url).read) end -class String - def tidy - gsub(/[[:space:]]+/, ' ').strip - end -end - def scrape_list(url) noko = noko_for(url) links = noko.css('#block-views-ledamoter-pages-party-block a[href*="ledamoter"]/@href').map(&:text) From 3a7d7dd3f97f651a23f466a01af7ce87f6b9fb82 Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 8 Jan 2017 11:21:54 +0000 Subject: [PATCH 8/8] Turn on archiving --- scraper.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scraper.rb b/scraper.rb index acf5cbc35..469700cca 100644 --- a/scraper.rb +++ b/scraper.rb @@ -6,8 +6,9 @@ require 'scraped' require 'scraperwiki' -require 'open-uri/cached' -OpenURI::Cache.cache_path = '.cache' +# require 'open-uri/cached' +# OpenURI::Cache.cache_path = '.cache' +require 'scraped_page_archive/open-uri' def noko_for(url) Nokogiri::HTML(open(url).read)