#!/usr/bin/env ruby
# encoding: utf-8
#
# Scrapes the Wikipedia "List of MPs of the National Assembly of Cambodia" page
# and saves one record per table row through ScraperWiki.
#
# Recovered from git patch e547bdbdf19ddb217ab8d11b4527f3b0855eb5e8
# ("initial scraper", Tony Bowden, Sat, 10 Oct 2015 18:07:58 +0100).

require 'scraperwiki'
require 'nokogiri'
require 'colorize'
require 'pry'
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'

class String
  # Collapses every run of whitespace into a single space and strips both ends.
  #
  # @return [String] a new, normalised string
  def tidy
    gsub(/[[:space:]]+/, ' ').strip
  end
end

# Fetches +url+ and parses the response body as an HTML document.
#
# @param url [String] address of the page to download
# @return [Nokogiri::HTML::Document]
def noko_for(url)
  # Kernel#open no longer accepts URLs on Ruby 3.0+, while URI.open is only
  # public from Ruby 2.5, so pick whichever this interpreter supports. The
  # block form closes the handle once the body has been read.
  html = URI.respond_to?(:open) ? URI.open(url, &:read) : open(url, &:read)
  Nokogiri::HTML(html)
end

# Extracts one record per data row from the page at +url+ and stores it.
#
# Rows are taken from every table that has a header cell containing "MP".
# The second cell supplies the name, the third cell the party, and the
# nearest preceding <h3> headline supplies the area.
#
# @param url [String] address of the list page; also stored as :source
# @raise [RuntimeError] if no rows match, a row has fewer than three cells,
#   or a row has no preceding <h3> headline
def scrape_list(url)
  noko = noko_for(url)
  rows = noko.xpath('//table[.//th[contains(.,"MP")]]//tr[td]')
  raise 'No rows' if rows.empty?

  rows.each do |tr|
    cells = tr.css('td')
    raise "Row has fewer than 3 cells: #{tr.text.tidy}" if cells.size < 3

    member = cells[1]
    party = cells[2]

    # Last match in document order == the closest heading above this row.
    heading = member.xpath('preceding::h3/span[@class="mw-headline"]').last
    raise "No area heading found before row: #{tr.text.tidy}" if heading.nil?

    data = {
      name: member.text.tidy,
      # Links with class "new" are skipped -- presumably Wikipedia red links
      # to pages that do not exist yet; confirm against the page markup.
      wikiname: member.xpath('.//a[not(@class="new")]/@title').text,
      party: party.text.tidy,
      party_wikiname: party.xpath('.//a[not(@class="new")]/@title').text,
      area: heading.text,
      # NOTE(review): hard-coded term number; presumably the legislature the
      # list page describes -- confirm.
      term: 5,
      source: url,
    }
    # The listed fields form the unique key used by save_sqlite.
    ScraperWiki.save_sqlite([:name, :area, :party, :term], data)
  end
end

scrape_list("https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia")