From b70f62f16ceb77813be7ce76a6182f6732b4d0e8 Mon Sep 17 00:00:00 2001
From: Tony Bowden
Date: Mon, 4 Apr 2016 14:45:01 +0100
Subject: [PATCH] Extract honorifics

---
 scraper.rb | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/scraper.rb b/scraper.rb
index d720484..6963b35 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -14,10 +14,17 @@ def noko_for(url)
   Nokogiri::HTML(open(url).read)
 end
 
-def gender_from(name)
-  return 'female' if name.start_with? 'Mme'
-  return 'male' if name.start_with? 'M.'
-  raise "Unknown gender for #{name}"
+def gender_from(prefix)
+  return 'female' if prefix == 'Mme'
+  return 'male' if prefix == 'M'
+  raise "Unknown gender for #{prefix}"
+end
+
+def remove_prefixes(name)
+  return ['Mme', name] if name.sub! /^Mme\.?\s/, ''
+  return ['M', name] if name.sub! /^M[\. ]+/, ''
+  return
+  binding.pry
 end
 
 
@@ -25,15 +32,15 @@ def scrape_list(url)
   noko = noko_for(url)
   noko.css('#jsn-mainbody table tbody tr').each do |mp|
     tds = mp.css('td')
-    name = tds[0].text.gsub(/[[:space:]]+/, ' ').strip
-    next if name.empty?
+    prefix, name = remove_prefixes(tds[0].text.gsub(/[[:space:]]+/, ' ').strip)
+    next if name.to_s.empty?
     data = {
       name: name,
+      honorific_prefix: prefix,
       party: tds[1].text.strip,
       area: tds[2].text.strip,
-      gender: gender_from(name),
+      gender: gender_from(prefix),
       term: 2013,
-      source: url,
     }
     # puts data
     ScraperWiki.save_sqlite([:name, :term], data)