/
scraper.rb
44 lines (37 loc) · 1.2 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true
require 'pry'
require 'scraped'
require 'scraperwiki'
# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'
# Fetches +url+ and parses the response body as HTML.
#
# Kernel#open stopped accepting URLs in Ruby 3.0 (deprecated in 2.7), so the
# request goes through URI.open where that exists (Ruby 2.5+, provided by
# open-uri) and only falls back to Kernel.open on older interpreters.
#
# @param url [String] address of the page to download
# @return the document produced by Nokogiri::HTML for the page body
def noko_for(url)
  opener = URI.respond_to?(:open) ? URI : Kernel
  # Block form: the underlying IO/Tempfile is closed as soon as it is read.
  html = opener.open(url, &:read)
  Nokogiri::HTML(html)
end
# Scrapes the members table found at +url+ and stores one record per row.
#
# Every `tr` under `#table-elus` except the first (presumably the header row)
# is read: cells 0-3 supply family name, given name, area and party, while the
# link in cell 4 is used as a CSS selector to look up that member's detail
# panel elsewhere on the same page (for the image and its numeric id).
#
# NOTE(review): `String#tidy` is not core Ruby — presumably supplied by the
# `scraped` gem; confirm.
#
# @param url [String] page to scrape; also recorded as each row's :source
def scrape_list(url)
  page = noko_for(url)
  rows = page.css('#table-elus tr').drop(1)

  rows.each do |row|
    cells = row.css('td')
    surname = cells[0].text.tidy
    forename = cells[1].text.tidy

    # The href in the fifth cell doubles as the selector for the detail panel.
    panel = page.css(cells[4].css('a/@href').text)

    # Placeholder pictures are stored as an empty string, not a URL.
    photo = panel.css('img/@src').text
    photo = '' if %w[no-elu.jpg img-elus.jpg].any? { |placeholder| photo.include?(placeholder) }

    record = {
      id: panel.css('img/@class').text[/wp-image-(\d+)/, 1],
      name: "#{forename} #{surname}",
      sort_name: "#{surname}, #{forename}",
      family_name: surname,
      given_name: forename,
      area: cells[2].text.tidy,
      party: cells[3].text.tidy,
      image: photo,
      term: '4',
      source: url,
    }

    # :id and :term are passed as the unique keys for the saved row.
    ScraperWiki.save_sqlite(%i[id term], record)
  end
end
# Drop any existing `data` table before scraping. This is best-effort: any
# StandardError raised by the DROP (e.g. presumably when the table does not
# exist yet) is deliberately ignored.
begin
  ScraperWiki.sqliteexecute('DROP TABLE data')
rescue StandardError
  nil
end

scrape_list('http://www.congres.nc/assemblee/les-elus/?panel=5')