Permalink
Browse files

Add Term 48

  • Loading branch information...
tmtmtmtm committed May 31, 2016
1 parent 57b9a1f commit bf8064b7dc0aa481d861cca3d49437737f4cc6ea
Showing with 72 additions and 9 deletions.
  1. +72 −9 scraper.rb
View
@@ -8,6 +8,7 @@
require 'date'
require 'colorize'
require 'date'
require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
@@ -19,22 +20,27 @@ def tidy
end
# Fetches +url+ and parses the response body as an HTML document.
#
# url - String address of the page to load. It is passed to `open`, which
#       this file expects open-uri (required above as 'open-uri/cached')
#       to have extended so that it can read URLs.
#
# Returns whatever Nokogiri::HTML builds from the body.
def noko_for(url)
  # Fetch and parse exactly once: the statement used to be repeated, which
  # doubled the work per page and threw the first document away.
  Nokogiri::HTML(open(url).read)
end
# Converts a free-form date string into ISO 8601 form (YYYY-MM-DD).
#
# str - the text to parse; nil and empty values are accepted.
#
# Returns the date as a String, or nil when +str+ is blank or cannot be
# interpreted as a date.
def date_from(str)
  return if str.to_s.empty?

  # No debugger breakpoint here: a stray `binding.pry` would stop the whole
  # scrape on the first non-blank date.
  Date.parse(str).to_s
rescue ArgumentError, TypeError
  # Date.parse raises ArgumentError (or a subclass of it) for text it cannot
  # read as a date, and TypeError for non-String input. Anything else is a
  # genuine bug and is allowed to propagate instead of being swallowed.
  nil
end
def scrape_list(h)
def table_per_party(h)
noko = noko_for(h[:source])
noko.css('#%s' % h[:after]).xpath('.//preceding::*').remove
noko.css('#%s' % h[:before]).xpath('.//following::*').remove
noko.css('h3').each do |section|
party = section.css('.mw-headline').text.gsub(/\(\d+\)/,'').tidy
section.xpath('.//following-sibling::table[1]//tr[.//td[2]]').each do |tr|
td = tr.css('td')
notes = td[4].text.tidy rescue ''
# TODO pick up the start/end dates
data = {
data = {
name: td[1].text.tidy,
wikiname: td[1].xpath('.//a[not(@class="new")]/@title').text,
party: party,
@@ -49,18 +55,75 @@ def scrape_list(h)
end
end
# Scrapes members from the table rows found between two anchors of a page
# and saves one record per row.
#
# h - Hash of options:
#     :source - URL of the page to load (via noko_for)
#     :term   - value stored in each record's :term field
#     :after  - id of the element before which everything is discarded
#     :before - id of the element after which everything is discarded
#
# Each row is read as: cell 1 = party, cell 2 = member, cell 3 = area.
# Records are saved with [:name, :party, :term, :area] as the unique key.
def single_table(h)
  noko = noko_for(h[:source])
  # Cut the document down to the section between the two anchors.
  noko.css('#%s' % h[:after]).xpath('.//preceding::*').remove
  noko.css('#%s' % h[:before]).xpath('.//following::*').remove

  noko.xpath('.//table//tr[td]').each do |tr|
    td = tr.css('td')
    # Cells 1..3 are all read below; a shorter row cannot describe a member
    # and would otherwise raise NoMethodError on a nil cell.
    next if td.size < 4

    data = {
      # '.vcard' is a class selector. The bare 'vcard' used previously looks
      # for a <vcard> element, matches nothing, and left every name empty.
      name: td[2].css('.vcard').text.tidy,
      wikiname: td[2].xpath('.//a[not(@class="new")]/@title').text,
      sort_name: td[2].css('.sortkey').text.tidy,
      party: td[1].text.tidy,
      party_wikiname: td[1].xpath('.//a[not(@class="new")]/@title').text,
      area: td[3].text.tidy,
      area_wikiname: td[3].xpath('.//a[not(@class="new")]/@title').text,
      term: h[:term],
    }
    # Collapse any "party list" wording in the area cell to the fixed value 'List'.
    data[:area] = 'List' if data[:area].to_s.downcase.include? 'party list'
    ScraperWiki.save_sqlite([:name, :party, :term, :area], data)
  end
end
scrape_list({
# Scrapes the rows of the tables lying between two anchors of a page and
# saves one record (including a start date) for each row that names a member.
#
# h - Hash of options:
#     :source - URL of the page to load (via noko_for)
#     :term   - value stored in each record's :term field
#     :after  - id of the element before which everything is discarded
#     :before - id of the element after which everything is discarded
#
# Only rows with at least two <td> cells are considered. Rows whose name
# contains '(vacant)' are not saved.
def single_table_changes(h)
  page = noko_for(h[:source])
  # Keep only the part of the document between the two anchors.
  page.css('#%s' % h[:after]).xpath('.//preceding::*').remove
  page.css('#%s' % h[:before]).xpath('.//following::*').remove

  # Title of every link in a cell except those carrying class="new".
  linked_title = './/a[not(@class="new")]/@title'

  page.xpath('.//table//tr[td[2]]').each do |row|
    cells = row.css('td')
    party_cell = cells[1]
    member_cell = cells[2]
    date_cell = cells[3]
    area_cell = cells[4]

    # TODO set an end date on the prior MP
    record = {
      name: member_cell.text.tidy,
      wikiname: member_cell.xpath(linked_title).text,
      party: party_cell.text.tidy,
      party_wikiname: party_cell.xpath(linked_title).text,
      # The sort key is stripped of leading zeros and of a '-0000' marker.
      start_date: date_cell.css('.sortkey').text.sub(/^0*/,'').sub('-0000',''),
      area: area_cell.text.tidy,
      area_wikiname: area_cell.xpath(linked_title).text,
      term: h[:term],
    }

    unless record[:name].include?('(vacant)')
      ScraperWiki.save_sqlite([:name, :party, :term, :area], record)
    end
  end
end
table_per_party({
source: 'https://en.wikipedia.org/wiki/50th_New_Zealand_Parliament',
term: 50,
after: 'Members',
before: 'Parliamentary_business',
})
}) if false
scrape_list({
table_per_party({
source: 'https://en.wikipedia.org/wiki/49th_New_Zealand_Parliament',
term: 49,
after: 'Members_of_the_49th_New_Zealand_Parliament',
before: 'By-elections_during_49th_Parliament',
})
}) if false
# Term 48: both scrapes read the same page. The first covers the section
# between the 'Members_of_the_48th_Parliament' and
# 'Changes_during_parliamentary_term' anchors; the second covers the section
# from that latter anchor up to 'See_also'.
term_48_page = 'https://en.wikipedia.org/wiki/48th_New_Zealand_Parliament'

single_table(
  source: term_48_page,
  term: 48,
  after: 'Members_of_the_48th_Parliament',
  before: 'Changes_during_parliamentary_term'
)

single_table_changes(
  source: term_48_page,
  term: 48,
  after: 'Changes_during_parliamentary_term',
  before: 'See_also'
)

0 comments on commit bf8064b

Please sign in to comment.