Skip to content

Commit

Permalink
Modernise scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Mar 2, 2016
1 parent edfebea commit fd43823
Showing 1 changed file with 5 additions and 30 deletions.
35 changes: 5 additions & 30 deletions scraper.rb
Original file line number Diff line number Diff line change
@@ -1,36 +1,11 @@
#!/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'wikidata/fetcher'
require 'nokogiri'
require 'open-uri'
require 'pry'

# Download +url+ and parse the response body as an HTML document.
#
# The URL is opened through open-uri's URI#open rather than Kernel#open:
# Kernel#open on a URL string is deprecated since Ruby 2.7 and removed in 3.0,
# and it would also open local files or pipes if handed a non-URL string.
# The block form makes sure the underlying handle is closed after reading.
#
# @param url [String] absolute http(s) URL of the page to fetch
# @return [Nokogiri::HTML::Document] the parsed page
def noko_for(url)
  Nokogiri::HTML(URI.parse(url).open(&:read))
end
# Collect the `title` attribute of every link without class "new" found in
# the second cell of data rows, in tables whose header mentions "MP", then
# pass those titles on as English (`en`) names to scrape_wikidata.
mp_page_titles = EveryPolitician::Wikidata.wikipedia_xpath(
  url: 'https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia',
  xpath: '//table[.//th[contains(.,"MP")]]//tr[td]//td[2]//a[not(@class="new")]/@title'
)

EveryPolitician::Wikidata.scrape_wikidata(names: { en: mp_page_titles })

# Extract linked page titles from the MP tables of the page at +url+.
#
# The page is obtained through noko_for; the XPath picks the `title`
# attribute of each link without class "new" in the second cell of data rows,
# restricted to tables whose header contains "MP".
#
# @param url [String] address of the page to read
# @return [Array<String>] the title texts, in document order
# @raise [RuntimeError] when the XPath matches nothing on the page
def wikinames_from(url)
  title_attrs = noko_for(url).xpath(
    '//table[.//th[contains(.,"MP")]]//tr[td]//td[2]//a[not(@class="new")]/@title'
  )
  titles = title_attrs.map { |attr| attr.text }
  raise "No names found in #{url}" if titles.empty?

  titles
end

# Look up each page name via WikiData.ids_from_pages and save the fetched
# record for every name that yields data.
#
# Nested arrays, nils and duplicates in +names+ are removed before lookup.
# Each saved record gets the originating page name under :original_wikiname
# and is written with ScraperWiki.save_sqlite, keyed on :id.
#
# Fetching is best-effort: a fetch that raises or returns nothing is reported
# on stderr, naming the page and id, and that entry is skipped. Other entries
# are still processed.
#
# @param names [Array] page names; may be nested and may contain nils
# @return [Object] whatever the `each` over the lookup result returns
def fetch_info(names)
  WikiData.ids_from_pages('en', names.flatten.compact.uniq).each do |name, id|
    data = begin
      WikiData::Fetcher.new(id: id).data
    rescue StandardError => e
      # Keep going on failure, but say what went wrong instead of hiding it.
      warn "Fetching #{id} (#{name}) failed: #{e.class}: #{e.message}"
      nil
    end

    unless data
      # The original interpolated `p` (Kernel#p with no arguments => nil),
      # so the warning never identified the page that produced no data.
      warn "No data for #{name} (#{id})"
      next
    end

    data[:original_wikiname] = name
    ScraperWiki.save_sqlite([:id], data)
  end
end

# Script entry point: gather the linked names from the list page, then fetch
# and store a record for each of them.
mp_wikinames = wikinames_from("https://en.wikipedia.org/wiki/List_of_MPs_of_the_National_Assembly_of_Cambodia")
fetch_info(mp_wikinames)

require 'rest-client'
# When MORPH_REBUILDER_URL is set, POST an empty payload to it and echo the
# response on stderr; otherwise do nothing.
rebuilder_url = ENV['MORPH_REBUILDER_URL']
warn RestClient.post(rebuilder_url, {}) if rebuilder_url

0 comments on commit fd43823

Please sign in to comment.