Skip to content

Commit

Permalink
Merge pull request #1 from everypolitician-scrapers/modernise
Browse files Browse the repository at this point in the history
Modernise the scraper
  • Loading branch information
tmtmtmtm committed Jan 8, 2017
2 parents bfd7ad3 + 3a7d7dd commit c21c5f9
Show file tree
Hide file tree
Showing 7 changed files with 182 additions and 42 deletions.
9 changes: 9 additions & 0 deletions .rubocop.yml
@@ -0,0 +1,9 @@
AllCops:
Exclude:
- 'Vagrantfile'
- 'vendor/**/*'
TargetRubyVersion: 2.3

inherit_from:
- https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
- .rubocop_todo.yml
21 changes: 21 additions & 0 deletions .rubocop_todo.yml
@@ -0,0 +1,21 @@
# This configuration was generated by
# `rubocop --auto-gen-config`
# on 2017-01-08 10:48:32 +0000 using RuboCop version 0.46.0.
# The point is for the user to remove these configuration records
# one by one as the offenses are removed from the code base.
# Note that changes in the inspected code, or installation of new
# versions of RuboCop, may require this file to be generated again.

# Offense count: 1
Lint/UselessAssignment:
Exclude:
- 'scraper.rb'

# Offense count: 1
Metrics/AbcSize:
Max: 36

# Offense count: 1
# Configuration parameters: CountComments.
Metrics/MethodLength:
Max: 18
5 changes: 5 additions & 0 deletions .travis.yml
@@ -0,0 +1,5 @@
language: ruby
rvm:
- 2.3.3
sudo: false
cache: bundler
32 changes: 21 additions & 11 deletions Gemfile
@@ -1,14 +1,24 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby
# frozen_string_literal: true

source "https://rubygems.org"
source 'https://rubygems.org'

ruby "2.0.0"
ruby '2.3.3'

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "execjs"
gem "pry"
gem "colorize"
gem "nokogiri"
gem "open-uri-cached"
git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }

gem 'minitest'
gem 'minitest-around'
gem 'minitest-vcr'
gem 'nokogiri'
gem 'open-uri-cached'
gem 'pry'
gem 'rake'
gem 'rest-client'
gem 'rubocop'
gem 'scraped', github: 'everypolitician/scraped'
gem 'scraped_page_archive', github: 'everypolitician/scraped_page_archive'
gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby',
branch: 'morph_defaults'
gem 'table_unspanner', github: 'everypolitician/table_unspanner'
gem 'vcr'
gem 'webmock'
107 changes: 100 additions & 7 deletions Gemfile.lock
@@ -1,3 +1,27 @@
GIT
remote: https://github.com/everypolitician/scraped.git
revision: c3fc06e1c44beb9027a4dbe4c8bb471d803b27ce
specs:
scraped (0.2.0)
field_serializer (>= 0.3.0)
nokogiri
require_all

GIT
remote: https://github.com/everypolitician/scraped_page_archive.git
revision: 28f93d74b1c11ef01463ad0e7f874050d2e7fc73
specs:
scraped_page_archive (0.5.0)
git (~> 1.3.0)
vcr-archive (~> 0.3.0)

GIT
remote: https://github.com/everypolitician/table_unspanner.git
revision: a70a98a104a75b470f4ea339fdd728366a40b4d8
specs:
table_unspanner (0.1.0)
nokogiri

GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
Expand All @@ -10,31 +34,100 @@ GIT
GEM
remote: https://rubygems.org/
specs:
addressable (2.5.0)
public_suffix (~> 2.0, >= 2.0.2)
ast (2.3.0)
coderay (1.1.0)
colorize (0.7.7)
execjs (2.5.2)
crack (0.4.3)
safe_yaml (~> 1.0.0)
domain_name (0.5.20161129)
unf (>= 0.0.5, < 1.0.0)
field_serializer (0.3.0)
git (1.3.0)
hashdiff (0.3.1)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.6.0.1)
method_source (0.8.2)
mini_portile (0.6.2)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.1.0)
minispec-metadata (2.0.0)
minitest
minitest (5.10.1)
minitest-around (0.4.0)
minitest (~> 5.0)
minitest-vcr (1.4.0)
minispec-metadata (~> 2.0)
minitest (>= 4.7.5)
vcr (>= 2.9)
netrc (0.11.0)
nokogiri (1.7.0.1)
mini_portile2 (~> 2.1.0)
open-uri-cached (0.0.5)
parser (2.3.3.1)
ast (~> 2.2)
powerpack (0.1.1)
pry (0.10.1)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
public_suffix (2.0.4)
rainbow (2.1.0)
rake (12.0.0)
require_all (1.3.3)
rest-client (2.0.0)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
rubocop (0.46.0)
parser (>= 2.3.1.1, < 3.0)
powerpack (~> 0.1)
rainbow (>= 1.99.1, < 3.0)
ruby-progressbar (~> 1.7)
unicode-display_width (~> 1.0, >= 1.0.1)
ruby-progressbar (1.8.1)
safe_yaml (1.0.4)
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.2)
unicode-display_width (1.1.2)
vcr (3.0.3)
vcr-archive (0.3.0)
vcr (~> 3.0.2)
webmock (~> 2.0.3)
webmock (2.0.3)
addressable (>= 2.3.6)
crack (>= 0.3.2)
hashdiff

PLATFORMS
ruby

DEPENDENCIES
colorize
execjs
minitest
minitest-around
minitest-vcr
nokogiri
open-uri-cached
pry
rake
rest-client
rubocop
scraped!
scraped_page_archive!
scraperwiki!
table_unspanner!
vcr
webmock

RUBY VERSION
ruby 2.3.3p222

BUNDLED WITH
1.13.6
6 changes: 6 additions & 0 deletions Rakefile
@@ -0,0 +1,6 @@
# frozen_string_literal: true
require 'rubocop/rake_task'

# Register the rake task provided by RuboCop's bundled RakeTask class.
RuboCop::RakeTask.new

# Make the lint task the sole prerequisite of the default task, so a bare
# `rake` invocation runs it.
task default: ['rubocop']
44 changes: 20 additions & 24 deletions scraper.rb
@@ -1,32 +1,27 @@
#!/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true

require 'pry'
require 'scraped'
require 'scraperwiki'
require 'nokogiri'
require 'open-uri'

require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

def noko_for(url)
Nokogiri::HTML(open(url).read)
end

class String
def tidy
self.gsub(/[[:space:]]+/, ' ').strip
end
Nokogiri::HTML(open(url).read)
end

def scrape_list(url)
noko = noko_for(url)
links = noko.css('#block-views-ledamoter-pages-party-block a[href*="ledamoter"]/@href').map(&:text)
raise "No links found" unless links.any?
raise 'No links found' unless links.any?
links.each do |link|
scrape_mp(URI.join(url, link))
end
end
end

def scrape_mp(url)
noko = noko_for(url)
Expand All @@ -36,19 +31,20 @@ def scrape_mp(url)
contact = noko.css('#block-views-ledamot-detaljer-block-1')

named = ->(t) { box.xpath("(.//strong[contains(.,'#{t}')] | .//b[contains(.,'#{t}')])/following-sibling::text()") }
data = {
id: url.to_s.split("/").last,
name: noko.css('h1').text.tidy,
image: noko.css('img[typeof="foaf:Image"]/@src').text.sub(/\?.*/,''),
email: contact.css('a[href*="mailto:"]').text,
phone: contact.css('a[href*="tel:"]').text,
party: om.xpath('.//span[strong[.="Lagtingsgrupp:"]]/following-sibling::span//text()').first.text.tidy,
data = {
id: url.to_s.split('/').last,
name: noko.css('h1').text.tidy,
image: noko.css('img[typeof="foaf:Image"]/@src').text.sub(/\?.*/, ''),
email: contact.css('a[href*="mailto:"]').text,
phone: contact.css('a[href*="tel:"]').text,
party: om.xpath('.//span[strong[.="Lagtingsgrupp:"]]/following-sibling::span//text()').first.text.tidy,
birth_date: om.xpath('.//span[strong[.="Född:"]]/following-sibling::span//text()').first.text.tidy,
term: 2015,
source: url.to_s,
term: 2015,
source: url.to_s,
}
data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty?
ScraperWiki.save_sqlite([:id, :term], data)
ScraperWiki.save_sqlite(%i(id term), data)
end

# Clear any previously saved rows before scraping again. This is best-effort:
# any StandardError raised by the DELETE is swallowed on purpose
# (presumably the `data` table does not exist yet on a first run — confirm).
begin
  ScraperWiki.sqliteexecute('DELETE FROM data')
rescue StandardError
  nil
end

# Entry point: scrape the listing page at this URL.
scrape_list('http://www.lagtinget.ax/ledamoter')

0 comments on commit c21c5f9

Please sign in to comment.