Skip to content
Permalink
Browse files

Initial scraper

  • Loading branch information...
tmtmtmtm committed Oct 11, 2018
1 parent fcfcaae commit d4d9a0ce381f8ff5ebf2ffa31ee28863149d7957
Showing with 69 additions and 0 deletions.
  1. +69 −0 scraper.rb
@@ -0,0 +1,69 @@
#!/bin/env ruby
# frozen_string_literal: true

require 'csv'
require 'json'
require 'rest-client'
require 'scraped'
require 'scraperwiki'

# Top-level wrapper around the SPARQL JSON response.
class Results < Scraped::JSON
  # One hash per entry under results -> bindings, each built by a Term fragment.
  field :terms do
    rows = json[:results][:bindings]
    rows.map do |row|
      fragment(row => Term).to_h
    end
  end
end

# One result binding from the SPARQL response, exposed as flat fields.
# Every field reads the `:value` entry of the corresponding binding variable.
class Term < Scraped::JSON
  # Last path segment of the `ps` value (nil when the value is absent).
  field :statement do
    statement_uri = json.dig(:ps, :value).to_s
    statement_uri.split('/').last
  end

  # Ordinal coerced to an Integer; a missing or non-numeric value becomes 0.
  field :id do
    ordinal = json.dig(:ordinal, :value)
    ordinal.to_i
  end

  field(:name) { json.dig(:itemLabel, :value) }

  # First ten characters of the start value — presumably the YYYY-MM-DD part
  # of a timestamp; an absent value yields an empty string.
  field :start_date do
    started = json.dig(:start_date, :value).to_s
    started.slice(0, 10)
  end

  # First ten characters of the end value; an absent value yields an empty string.
  field :end_date do
    ended = json.dig(:end_date, :value).to_s
    ended.slice(0, 10)
  end

  # Last path segment of the `item` value (nil when the value is absent).
  field :wikidata do
    item_uri = json.dig(:item, :value).to_s
    item_uri.split('/').last
  end
end

# Format-string template for the Wikidata SPARQL endpoint: `%s` is the slot for
# the URL-escaped query text, and the `format=json` parameter is fixed in the URL.
WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql?format=json&query=%s'

# Run a SPARQL query against the Wikidata endpoint and parse the reply as CSV.
#
# The request is sent to the bare endpoint (everything before the `?` in
# WIKIDATA_SPARQL_URL). That constant is a format-string template ending in
# `?format=json&query=%s`; passing it to RestClient unformatted would send the
# literal `%s` placeholder and a `format=json` parameter next to the real
# query, even though the response is parsed as CSV here.
#
# @param query [String] SPARQL query text, passed to RestClient via `params`
# @return [CSV::Table] parsed rows, with header names converted to symbols
# @raise [RuntimeError] when RestClient reports a failed request
def sparql(query)
  endpoint = WIKIDATA_SPARQL_URL.split('?').first
  result = RestClient.get endpoint, accept: 'text/csv', params: { query: query }
  CSV.parse(result, headers: true, header_converters: :symbol)
rescue RestClient::Exception => e
  raise "Wikidata query #{query} failed: #{e.message}"
end

# SPARQL selecting every item that has a matching P31 statement carrying an
# ordinal qualifier, together with its label and optional start/end dates.
# Rows without a start date are filtered out; output is ordered by the
# ordinal cast to an integer.
sparql_text = <<SPARQL
SELECT ?ps ?ordinal ?item ?itemLabel ?start_date ?end_date WHERE {
?item p:P31 ?ps .
?ps ps:P31 wd:Q15238777 ; pq:P642 wd:Q1337463 ; pq:P1545 ?ordinal
OPTIONAL { ?item wdt:P571 ?start_date }
OPTIONAL { ?item wdt:P580 ?start_date }
OPTIONAL { ?item wdt:P576 ?end_date }
OPTIONAL { ?item wdt:P582 ?end_date }
FILTER (BOUND(?start_date))
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY xsd:integer(?ordinal)
SPARQL

# Fill the URL template with the escaped query, fetch it, and flatten the
# response into one hash per term.
request_url = format(WIKIDATA_SPARQL_URL, CGI.escape(sparql_text))
response = Scraped::Request.new(url: request_url).response
terms = Results.new(response: response).terms

# Debug aid: print each row with nil values dropped and keys sorted.
if ENV['MORPH_DEBUG']
  puts(terms.map { |term| term.compact.sort.to_h })
end

# Best-effort reset of the output table: any StandardError raised by the DROP
# (for example when the table does not exist yet) is deliberately ignored.
begin
  ScraperWiki.sqliteexecute('DROP TABLE data')
rescue StandardError
  nil
end
ScraperWiki.save_sqlite([:statement], terms)

0 comments on commit d4d9a0c

Please sign in to comment.
You can’t perform that action at this time.