Permalink
Browse files

Modernise scraper

  • Loading branch information...
tmtmtmtm committed Mar 2, 2016
1 parent edfebea commit fd438236ad9a7c5da963b7e0182bc3796ea89ba6
Showing with 5 additions and 30 deletions.
  1. +5 −30 scraper.rb
View
@@ -1,36 +1,11 @@
#!/bin/env ruby
# encoding: utf-8
require 'scraperwiki'
require 'wikidata/fetcher'
require 'nokogiri'
require 'open-uri'
require 'pry'
# Download +url+ and parse the response body as an HTML document.
#
# The body is fetched through OpenURI::OpenRead#read rather than
# Kernel#open: the block form used internally closes the underlying
# handle, and it keeps working on Rubies where Kernel#open no longer
# accepts URLs. It also means a string such as "| cmd" can never be
# interpreted as a pipe.
#
# @param url [String] absolute URL of the page to fetch
# @return the document produced by Nokogiri::HTML for the fetched body
# @raise [URI::InvalidURIError] if +url+ cannot be parsed as a URI
def noko_for(url)
  Nokogiri::HTML(URI.parse(url).read)
end
# Page listing the MPs, and the XPath selecting the @title of every link
# (excluding links whose class is exactly "new") in the second cell of
# each data row of any table that has a header containing "MP".
mp_list_url = 'https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia'
mp_title_xpath = '//table[.//th[contains(.,"MP")]]//tr[td]//td[2]//a[not(@class="new")]/@title'

# Result of the XPath lookup is handed on under the :en key.
# NOTE(review): presumably a list of English Wikipedia page titles — confirm
# against the wikidata/fetcher documentation.
names = EveryPolitician::Wikidata.wikipedia_xpath(url: mp_list_url, xpath: mp_title_xpath)
EveryPolitician::Wikidata.scrape_wikidata(names: { en: names })
# Extract link titles from the MP table(s) of the page at +url+.
#
# The page is loaded with noko_for; the XPath picks the @title of every
# link (excluding links whose class is exactly "new") in the second cell
# of each data row of any table that has a header containing "MP".
#
# @param url [String] address of the page to read
# @return [Array<String>] text of each matched title attribute
# @raise [RuntimeError] when the XPath matches nothing
def wikinames_from(url)
  page = noko_for(url)
  titles = page.xpath(
    '//table[.//th[contains(.,"MP")]]//tr[td]//td[2]//a[not(@class="new")]/@title'
  ).map { |title_attr| title_attr.text }

  # An empty result most likely means the page layout changed, so fail loudly.
  raise "No names found in #{url}" if titles.empty?

  titles
end
# Look up each page name, fetch its data and save one row per entry.
#
# +names+ is flattened, stripped of nils and de-duplicated before being
# passed to WikiData.ids_from_pages for 'en'; the result is iterated as
# (name, id) pairs. Each fetched record is tagged with the page name it
# came from under :original_wikiname and saved via ScraperWiki.save_sqlite
# with [:id] as its first argument.
#
# Fetching is best-effort: an entry whose fetch raises, or yields nothing,
# is reported on stderr (with the page name and id) and skipped, so one
# bad entry does not abort the run.
#
# @param names [Array] page names, possibly nested and containing nils
def fetch_info(names)
  WikiData.ids_from_pages('en', names.flatten.compact.uniq).each do |name, id|
    data = begin
      WikiData::Fetcher.new(id: id).data
    rescue StandardError => e
      # Keep going, but say why the fetch failed instead of hiding it.
      warn "Error fetching #{name} (#{id}): #{e.class}: #{e.message}"
      nil
    end

    unless data
      warn "No data for #{name} (#{id})"
      next
    end

    data[:original_wikiname] = name
    ScraperWiki.save_sqlite([:id], data)
  end
end
# Scrape the MP list page and store the data found for every linked name.
mp_page_names = wikinames_from('https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia')
fetch_info(mp_page_names)

require 'rest-client'

# When MORPH_REBUILDER_URL is set, POST an empty payload to it and echo the
# response on stderr.
# NOTE(review): presumably this triggers a downstream rebuild — confirm.
rebuilder_url = ENV['MORPH_REBUILDER_URL']
warn RestClient.post(rebuilder_url, {}) if rebuilder_url

0 comments on commit fd43823

Please sign in to comment.