Skip to content

Commit

Permalink
Merge web and CSV data
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Jul 30, 2015
1 parent da3f943 commit 16d526c
Showing 1 changed file with 34 additions and 5 deletions.
39 changes: 34 additions & 5 deletions scraper.rb
Expand Up @@ -4,21 +4,50 @@
require 'colorize'
require 'csv'
require 'json'
require 'nokogiri'
require 'scraperwiki'

require 'pry'

# Fetch +url+ and parse the response body as an HTML document.
#
# NOTE(review): opening a URL through Kernel#open relies on open-uri being
# loaded somewhere outside this view — confirm.
#
# @param url [String] address of the page to fetch
# @return [Nokogiri::HTML::Document] the parsed page
def noko_for(url)
  # The block form closes the handle as soon as the body has been read.
  Nokogiri::HTML(open(url, &:read))
end

# Load the TD listing CSV from +file+ and normalise each row.
#
# For every row the :person_id column becomes :id and :uri becomes :source;
# :name and :sort_name are built from the first/last name columns, and :term
# is set to '31'. Rows are returned rather than saved, so the caller can merge
# them with other data before writing anything.
#
# @param file [String] path or URL handed to Kernel#open
# @return [Array<Hash>] one symbol-keyed hash per CSV row
def reprocess_csv(file)
  # Same parsing options CSV.table applies; parsing the text inside the block
  # lets the handle be closed as soon as the body has been read.
  csv = open(file) do |io|
    CSV.parse(io.read, headers: true, converters: :numeric, header_converters: :symbol)
  end

  csv.map do |td|
    # CSV::Row#delete returns a [header, value] pair; keep only the value.
    td[:id] = td.delete(:person_id).last
    td[:source] = td.delete(:uri).last
    td[:name] = "#{td[:first_name]} #{td[:last_name]}"
    td[:sort_name] = "#{td[:last_name]}, #{td[:first_name]}"
    td[:term] = '31'
    td.to_hash
  end
end

# Scrape the HTML listing at +url+ into one hash per table row.
#
# Every <tr> in the document that contains <td> cells is read positionally:
# cell 0 holds the image, cell 1 the linked name, cell 2 the party and cell 3
# the constituency. The id is taken from the digits at the end of the image
# filename; the image address is then resolved against +url+ and its first
# '/images/' segment is replaced with '/images/mpsL/'.
#
# @param url [String] address of the listing page
# @return [Array<Hash>] hashes with :image, :name, :party_id, :constituency and :id
def scrape_list(url)
  noko_for(url).xpath('.//tr[td]').map do |tr|
    tds = tr.css('td')
    data = {
      image: tds[0].css('img/@src').text,
      name: tds[1].css('a').text,
      party_id: tds[2].text,
      constituency: tds[3].text,
    }
    # Digits immediately before a literal ".jpg" / ".png" suffix; nil when the
    # row has no such image.
    data[:id] = data[:image][/(\d+)\.(jpg|png)$/, 1]
    data[:image] = URI.join(url, data[:image]).to_s.sub('/images/', '/images/mpsL/') unless data[:image].to_s.empty?
    data
  end
end

# Combine the CSV export with the scraped listing and store one record per TD.
csv_data = reprocess_csv('https://www.kildarestreet.com/tds/?f=csv')
web_data = scrape_list('https://www.kildarestreet.com/tds/')

# Index the scraped rows by stringified id (first occurrence wins) so each CSV
# row is matched with a single hash lookup instead of a scan of the whole list.
web_by_id = web_data.each_with_object({}) do |row, index|
  index[row[:id].to_s] ||= row
end

csv_data.each do |csv_row|
  # A CSV row without a scraped counterpart is a data problem: stop with a
  # message naming the row rather than dropping into an interactive debugger.
  web_row = web_by_id.fetch(csv_row[:id].to_s) do
    raise "No web match for #{csv_row[:id]}: #{csv_row[:sort_name]}"
  end

  # Scraped values take precedence over CSV values on shared keys.
  data = csv_row.merge(web_row)
  ScraperWiki.save_sqlite([:id, :term], data)
end

0 comments on commit 16d526c

Please sign in to comment.