/
scraper.rb
55 lines (46 loc) · 1.57 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true
require 'nokogiri'
require 'scraperwiki'
# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'
def noko_for(url)
Nokogiri::HTML(open(url).read)
end
def scrape_list(term, url)
noko = noko_for(url)
noko.xpath('.//p[contains(., "Speaker")]/ancestor::table//img').each do |img|
info = img.xpath('../following-sibling::p').map { |n| n && n.text.gsub(/[[:space:]]/, ' ').strip }.compact.reject(&:empty?)
# Work around Vice Speaker Martin having an extra level of # nesting
info = img.xpath('../../following-sibling::p').map { |n| n && n.text.strip }.compact if info.size.zero?
info.first.sub! 'Senator ', ''
photo = URI.join(url, URI.escape(img.attr('src'))).to_s
# Ahem
if photo == 'http://www.fsmcongress.fm/images/18th%20Congress/18c%20fsm%20foto/christianweb.2.jpg'
info.unshift 'Peter M. Christian'
end
data = {
term: term,
image: photo,
party: 'none',
}
if info[1].include? 'State of'
data[:name] = info[0]
data[:role] = ''
data[:area] = info[1].sub('State of ', '')
else
data[:name] = info[1]
data[:role] = info[0]
data[:area] = 'n/a'
end
ScraperWiki.save_sqlite(%i(name term), data)
end
end
terms = {
19 => 'http://www.fsmcongress.fm/19th%20Congress/19members.html',
18 => 'http://www.fsmcongress.fm/18th%20Congress/member.html',
17 => 'http://www.fsmcongress.fm/17th%20Congress/members%20page.html',
}
terms.each { |term, url| scrape_list(term, url) }