Permalink
Browse files

Merge pull request #4 from everypolitician-scrapers/port-to-scraped

Port to Scraped
  • Loading branch information...
tmtmtmtm committed Feb 7, 2017
2 parents e7af078 + 50423c3 commit a42522648d8655765d9aa387f08e8c4c0a00f445
Showing with 128 additions and 88 deletions.
  1. +5 −0 .travis.yml
  2. +14 −5 Gemfile
  3. +57 −14 Gemfile.lock
  4. +6 −0 Rakefile
  5. +46 −69 scraper.rb
View
@@ -0,0 +1,5 @@
language: ruby
rvm:
- 2.3.3
sudo: false
cache: bundler
View
19 Gemfile
@@ -1,15 +1,24 @@
# frozen_string_literal: true
source 'https://rubygems.org'
git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }
ruby '2.3.1'
ruby '2.3.3'
git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }
gem 'colorize'
gem 'field_serializer', github: 'everypolitician/field_serializer'
gem 'minitest'
gem 'minitest-around'
gem 'minitest-vcr'
gem 'nokogiri'
gem 'open-uri-cached'
gem 'pry'
gem 'rake'
gem 'rest-client'
gem 'rubocop'
gem 'scraped', github: 'everypolitician/scraped'
gem 'scraped_page_archive', github: 'everypolitician/scraped_page_archive'
gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby', branch: 'morph_defaults'
gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby',
branch: 'morph_defaults'
gem 'table_unspanner', github: 'everypolitician/table_unspanner'
gem 'vcr'
gem 'webmock'
View
@@ -1,8 +1,11 @@
GIT
remote: https://github.com/everypolitician/field_serializer.git
revision: 16d5d90a3095fa2552cbeeb3fb371b3f409ffd63
remote: https://github.com/everypolitician/scraped.git
revision: c2e9db21e2922d8d8053bf7fa763228a0373db79
specs:
field_serializer (0.1.0)
scraped (0.3.0)
field_serializer (>= 0.3.0)
nokogiri
require_all
GIT
remote: https://github.com/everypolitician/scraped_page_archive.git
@@ -12,6 +15,13 @@ GIT
git (~> 1.3.0)
vcr-archive (~> 0.3.0)
GIT
remote: https://github.com/everypolitician/table_unspanner.git
revision: a70a98a104a75b470f4ea339fdd728366a40b4d8
specs:
table_unspanner (0.1.0)
nokogiri
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
@@ -28,18 +38,35 @@ GEM
public_suffix (~> 2.0, >= 2.0.2)
ast (2.3.0)
coderay (1.1.0)
colorize (0.7.7)
crack (0.4.3)
safe_yaml (~> 1.0.0)
domain_name (0.5.20161129)
unf (>= 0.0.5, < 1.0.0)
field_serializer (0.3.0)
git (1.3.0)
hashdiff (0.3.1)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.6.0.1)
method_source (0.8.2)
mini_portile (0.6.2)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.1.0)
minispec-metadata (2.0.0)
minitest
minitest (5.10.1)
minitest-around (0.4.0)
minitest (~> 5.0)
minitest-vcr (1.4.0)
minispec-metadata (~> 2.0)
minitest (>= 4.7.5)
vcr (>= 2.9)
netrc (0.11.0)
nokogiri (1.7.0.1)
mini_portile2 (~> 2.1.0)
open-uri-cached (0.0.5)
parser (2.3.1.4)
parser (2.3.3.1)
ast (~> 2.2)
powerpack (0.1.1)
pry (0.10.1)
@@ -48,7 +75,13 @@ GEM
slop (~> 3.4)
public_suffix (2.0.4)
rainbow (2.1.0)
rubocop (0.43.0)
rake (12.0.0)
require_all (1.4.0)
rest-client (2.0.0)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
rubocop (0.46.0)
parser (>= 2.3.1.1, < 3.0)
powerpack (~> 0.1)
rainbow (>= 1.99.1, < 3.0)
@@ -60,7 +93,10 @@ GEM
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
unicode-display_width (1.1.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.2)
unicode-display_width (1.1.2)
vcr (3.0.3)
vcr-archive (0.3.0)
vcr (~> 3.0.2)
@@ -74,17 +110,24 @@ PLATFORMS
ruby
DEPENDENCIES
colorize
field_serializer!
minitest
minitest-around
minitest-vcr
nokogiri
open-uri-cached
pry
rake
rest-client
rubocop
scraped!
scraped_page_archive!
scraperwiki!
table_unspanner!
vcr
webmock
RUBY VERSION
ruby 2.3.1p112
ruby 2.3.3p222
BUNDLED WITH
1.13.4
1.13.6
View
@@ -0,0 +1,6 @@
# frozen_string_literal: true
# Defines RuboCop's Rake task and makes it the default task, so running a bare
# `rake` runs the `rubocop` task.
require 'rubocop/rake_task'
RuboCop::RakeTask.new
task default: %w(rubocop)
View
@@ -2,118 +2,96 @@
# encoding: utf-8
# frozen_string_literal: true
require 'field_serializer'
require 'nokogiri'
require 'open-uri'
require 'pry'
require 'scraped'
require 'scraperwiki'
# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'
class Page
include FieldSerializer
def initialize(url)
@url = url
end
def noko
@noko ||= Nokogiri::HTML(open(url).read)
end
private
attr_reader :url
def absolute_url(rel)
return if rel.to_s.empty?
URI.join(url, URI.encode(URI.decode(rel)))
end
# Common base class for the page classes in this scraper (PartiesPage,
# PartyPage and MemberPage inherit from it). It is a Scraped::HTML page whose
# response is passed through the AbsoluteUrls decorator — presumably so that
# relative links and image sources come back as absolute URLs; confirm against
# the scraped gem's documentation.
class FolketingPage < Scraped::HTML
decorator Scraped::Response::Decorator::AbsoluteUrls
end
class PartiesPage < Page
class PartiesPage < FolketingPage
field :parties do
noko.css('.telbogTable tr a[href*="party="]').map do |a|
{
name: a.text,
url: add_pagesize(absolute_url(a.attr('href'))).to_s,
}
fragment a => PartiesPageParty
end
end
end
private
class PartiesPageParty < Scraped::HTML
field :name do
noko.text
end
# We want to add a default '?pagesize=100' to all links
def add_pagesize(uri)
field :url do
# We want to add a default '?pagesize=100'
# TODO: make this a decorator
uri = URI.parse(original_url)
new_args = URI.decode_www_form(uri.query || '') << %w(pagesize 100)
uri.query = URI.encode_www_form(new_args)
uri
uri.to_s
end
private
# The href attribute of the node this fragment wraps, exactly as found in the
# page; the :url field parses this and appends the pagesize parameter.
def original_url
noko.attr('href')
end
end
class PartyPage < Page
class PartyPage < FolketingPage
field :members do
noko.css('.telbogTable').xpath('.//tr[td]').map do |tr|
PartyPageMember.new(tr, url).to_h
fragment tr => PartyPageMember
end
end
end
class PartyPageMember
include FieldSerializer
class PartyPageMember < Scraped::HTML
require 'cgi'
def initialize(row, url)
@row = row
@url = url
end
field :id do
member_url.to_s[%r{/Members/(.*).aspx}, 1]
source.to_s[%r{/Members/(.*).aspx}, 1]
end
field :given_name do
tds[0].text.strip
tds[0].text.tidy
end
field :family_name do
tds[1].text.strip
tds[1].text.tidy
end
field :party do
tds[2].text.strip
tds[2].text.tidy
end
field :party_id do
CGI.parse(URI.parse(url).query)['party'].first.gsub(/[{}]/, '')
end
field :source do
member_url.to_s
noko.at_css('a[href*="/Members/"]/@href').text
end
private
attr_reader :row, :url
def tds
@tds ||= row.css('td')
end
def member_url
URI.join(url, row.at_css('a[href*="/Members/"]/@href').text).to_s
noko.css('td')
end
end
class MemberPage < Page
class MemberPage < FolketingPage
field :name do
box.css('h1').text.strip
box.css('h1').text.tidy
end
field :constituency do
memberships.first[/ in (.*?) from/, 1].sub('greater constituency', '').strip
raw_memberships.first.to_s[/ in (.*?) from/, 1].to_s.sub('greater constituency', '').tidy
end
field :email do
@@ -125,11 +103,11 @@ class MemberPage < Page
end
field :image do
absolute_url(box.css('div.person img/@src').text).to_s
box.css('div.person img/@src').text
end
field :memberships do
memberships.join('+++')
raw_memberships.join('+++')
end
private
@@ -138,25 +116,24 @@ def box
noko.css('#mainform')
end
def memberships
def raw_memberships
box.xpath('.//strong[contains(.,"Member period")]/following-sibling::text()').map(&:text)
end
end
def scrape_party_list(url)
PartiesPage.new(url).to_h[:parties].each do |party|
scrape_party party[:url]
end
# Fetch a URL and hand the response to a page class.
#
# h - a Hash with a single `url => PageClass` pair, e.g.
#     `scrape(start => PartiesPage)`. Only the first pair is looked at.
#
# Returns a new instance of the given class, built with the response that
# Scraped::Request produced for the URL.
def scrape(h)
  target, page_class = h.to_a.first
  fetched = Scraped::Request.new(url: target).response
  page_class.new(response: fetched)
end
def scrape_party(url)
ppm = PartyPage.new(url).to_h
start = 'http://www.thedanishparliament.dk/Members/Members_in_party_groups.aspx'
ppm[:members].each do |memrow|
mem = MemberPage.new(memrow[:source])
data = memrow.merge(mem.to_h).merge(term: '2015')
# Empty the data table before re-scraping. The inline `rescue nil` discards any
# error raised by the DELETE — presumably to tolerate the table not existing on
# a first run (TODO confirm); note that it hides every other StandardError too.
ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil
# Walk the site top-down: the party list page, then each party's page, then
# each member's own page (taken from the member row's source link).
scrape(start => PartiesPage).parties.each do |party|
scrape(party.url => PartyPage).members.each do |memrow|
mem = scrape(memrow.source => MemberPage)
# Start from the row's fields; fields from the member page win on a key clash,
# and every record is tagged with the fixed term '2015'.
data = memrow.to_h.merge(mem.to_h).merge(term: '2015')
# puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
# id + term are passed as the unique keys for the saved record.
ScraperWiki.save_sqlite(%i(id term), data)
end
end
scrape_party_list('http://www.thedanishparliament.dk/Members/Members_in_party_groups.aspx')

0 comments on commit a425226

Please sign in to comment.