Skip to content

Commit

Permalink
Rewrite using SPARQL
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Oct 17, 2018
1 parent 1456874 commit e8fa830
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 147 deletions.
60 changes: 8 additions & 52 deletions .rubocop.yml
@@ -1,52 +1,8 @@
Style/AlignHash:
EnforcedHashRocketStyle: table
EnforcedColonStyle: table

Style/ClassAndModuleCamelCase:
Enabled: false

Style/CollectionMethods:
Enabled: true

Style/Documentation:
Enabled: false

Style/FormatString:
EnforcedStyle: percent

Style/HashSyntax:
EnforcedStyle: ruby19_no_mixed_keys

Style/RescueModifier:
Enabled: false

Style/SignalException:
Enabled: false

Style/SymbolArray:
Enabled: true

Style/TrailingCommaInLiteral:
EnforcedStyleForMultiline: consistent_comma


Metrics/ClassLength:
Max: 250

Metrics/MethodLength:
Max: 20

Metrics/LineLength:
Max: 150

Metrics/AbcSize:
Max: 50

Metrics/CyclomaticComplexity:
Max: 7

Metrics/PerceivedComplexity:
Max: 8

# Get rid of these ones over time
inherit_from: .rubocop_todo.yml
AllCops:
Exclude:
- 'Vagrantfile'
- 'vendor/**/*'
TargetRubyVersion: 2.4

inherit_from:
- https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
20 changes: 14 additions & 6 deletions Gemfile
@@ -1,15 +1,23 @@
# frozen_string_literal: true

# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby

ruby '2.4.4'

source 'https://rubygems.org'
git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }

ruby '2.3.3'

gem 'everypolitician', github: 'everypolitician/everypolitician-ruby'
gem 'pry'
gem 'rubocop'
gem 'rest-client'
gem 'scraped', github: 'everypolitician/scraped', branch: 'scraper-class'
gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby', branch: 'morph_defaults'
gem 'wikisnakker', github: 'everypolitician/wikisnakker'
gem 'sqlite_magic', github: 'openc/sqlite_magic'

group :quality do
gem 'rubocop'
end

group :development do
gem 'pry'
end
98 changes: 58 additions & 40 deletions Gemfile.lock
@@ -1,19 +1,13 @@
GIT
remote: https://github.com/everypolitician/everypolitician-ruby.git
revision: 20582b1512358a0e8789ea289201e37ab5f085c2
remote: https://github.com/everypolitician/scraped.git
revision: ecb23adeca95fba5356509d6445d528e212b3905
branch: scraper-class
specs:
everypolitician (0.20.0)
everypolitician-popolo (>= 0.8.0)
scraped (0.6.2)
field_serializer (>= 0.3.0)
nokogiri
require_all

GIT
remote: https://github.com/everypolitician/wikisnakker.git
revision: 4e091cdc9619b6c12db8903075effef361071132
specs:
wikisnakker (0.9.1)
require_all
yajl-ruby

GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
Expand All @@ -23,50 +17,74 @@ GIT
httpclient
sqlite_magic

GIT
remote: https://github.com/openc/sqlite_magic.git
revision: 4df975eb4e9891de54f870077c83f63762af9bf9
specs:
sqlite_magic (0.0.6)
sqlite3

GEM
remote: https://rubygems.org/
specs:
ast (2.3.0)
coderay (1.1.0)
everypolitician-popolo (0.8.0)
require_all
httpclient (2.6.0.1)
method_source (0.8.2)
parser (2.3.1.2)
ast (~> 2.2)
powerpack (0.1.1)
pry (0.10.1)
ast (2.4.0)
coderay (1.1.2)
domain_name (0.5.20180417)
unf (>= 0.0.5, < 1.0.0)
field_serializer (0.3.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.8.3)
jaro_winkler (1.5.1)
method_source (0.9.0)
mime-types (3.2.2)
mime-types-data (~> 3.2015)
mime-types-data (3.2018.0812)
mini_portile2 (2.3.0)
netrc (0.11.0)
nokogiri (1.8.5)
mini_portile2 (~> 2.3.0)
parallel (1.12.1)
parser (2.5.1.2)
ast (~> 2.4.0)
powerpack (0.1.2)
pry (0.11.3)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
rainbow (2.1.0)
require_all (1.4.0)
rubocop (0.42.0)
parser (>= 2.3.1.1, < 3.0)
method_source (~> 0.9.0)
rainbow (3.0.0)
require_all (2.0.0)
rest-client (2.0.2)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
rubocop (0.59.2)
jaro_winkler (~> 1.5.1)
parallel (~> 1.10)
parser (>= 2.5, != 2.5.1.1)
powerpack (~> 0.1)
rainbow (>= 1.99.1, < 3.0)
rainbow (>= 2.2.2, < 4.0)
ruby-progressbar (~> 1.7)
unicode-display_width (~> 1.0, >= 1.0.1)
ruby-progressbar (1.8.1)
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
unicode-display_width (1.1.1)
yajl-ruby (1.3.0)
ruby-progressbar (1.10.0)
sqlite3 (1.3.13)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.5)
unicode-display_width (1.4.0)

PLATFORMS
ruby

DEPENDENCIES
everypolitician!
pry
rest-client
rubocop
scraped!
scraperwiki!
wikisnakker!
sqlite_magic!

RUBY VERSION
ruby 2.3.3p222
ruby 2.4.4p296

BUNDLED WITH
1.13.6
1.16.5
74 changes: 74 additions & 0 deletions lib/cabinet.rb
@@ -0,0 +1,74 @@
# TODO: extend Scraped::Scraper with the ability to add Strategies
# Reopens Scraped's live-request strategy so that the value held in @url is
# substituted into a Wikidata SPARQL query, and the request URL becomes the
# query-service URL for that query.
#
# NOTE(review): @url is presumably the Wikidata item ID given as the scraper
# key (scraper.rb passes 'Q20058725') -- confirm against Scraped::Scraper.
class Scraped::Request::Strategy::LiveRequest
  require 'cgi'
  require 'csv'
  require 'rest-client'

  # Bare address of the Wikidata SPARQL endpoint, used by #sparql, which
  # passes the query through RestClient's `params` instead of the URL itself.
  WIKIDATA_SPARQL_URL = 'https://query.wikidata.org/sparql'.freeze

  # URL template requesting JSON results; `%s` receives the CGI-escaped query.
  SPARQL_URL = 'https://query.wikidata.org/sparql?format=json&query=%s'.freeze

  # SPARQL template; `%s` (after `wd:`) receives the Wikidata item ID.
  # The selected variables are the ones CabinetScraper::Membership reads
  # (item, itemLabel, ps, minister, ministerLabel, ordinal, start, end).
  QUERY = <<~SPARQL.freeze
    SELECT DISTINCT ?ps ?item ?itemLabel ?minister ?ministerLabel ?ordinal ?start ?end ?cabinet ?cabinetLabel
    WHERE {
    ?item p:P39/ps:P39/wdt:P279* wd:%s .
    ?item p:P39 ?ps .
    ?ps ps:P39 ?minister .
    ?minister wdt:P279* wd:Q83307 .
    OPTIONAL { ?ps pq:P1545 ?ordinal }
    OPTIONAL { ?ps pq:P580 ?start }
    OPTIONAL { ?ps pq:P582 ?end }
    OPTIONAL { ?ps pq:P5054 ?cabinet }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
  SPARQL

  # @return [String] SPARQL_URL with the escaped, filled-in QUERY embedded
  def url
    SPARQL_URL % CGI.escape(QUERY % @url)
  end

  private

  # Runs +query+ against the Wikidata endpoint, asking for CSV output.
  #
  # @param query [String] SPARQL query text
  # @return [CSV::Table] parsed rows, with headers converted to symbols
  # @raise [RuntimeError] when RestClient reports a failed request
  def sparql(query)
    result = RestClient.get WIKIDATA_SPARQL_URL, accept: 'text/csv', params: { query: query }
    CSV.parse(result, headers: true, header_converters: :symbol)
  rescue RestClient::Exception => e
    raise "Wikidata query #{query} failed: #{e.message}"
  end
end

# Turns a SPARQL JSON response into a list of membership hashes, one per
# entry under results -> bindings.
class CabinetScraper < Scraped::JSON
  # A single binding from the result set. Each field reads the `:value` of
  # one SPARQL variable; missing variables yield nil, "" or 0 depending on
  # the conversion applied.
  class Membership < Scraped::JSON
    field(:id)          { entity_id(:item) }
    field(:name)        { binding_value(:itemLabel) }
    field(:position_id) { entity_id(:ps) }
    field(:position)    { entity_id(:minister) }
    field(:label)       { binding_value(:ministerLabel) }
    field(:start_date)  { leading_date(:start) }
    field(:end_date)    { leading_date(:end) }
    field(:ordinal)     { binding_value(:ordinal).to_i }

    private

    # Raw `:value` for the given SPARQL variable, or nil when it is absent.
    def binding_value(variable)
      json.dig(variable, :value)
    end

    # Final '/'-separated segment of the variable's value.
    def entity_id(variable)
      binding_value(variable).to_s.split('/').last
    end

    # First ten characters of the variable's value.
    def leading_date(variable)
      binding_value(variable).to_s[0..9]
    end
  end

  field :memberships do
    bindings = json[:results][:bindings]
    bindings.map do |binding|
      fragment(binding => Membership).to_h
    end
  end
end
38 changes: 0 additions & 38 deletions lib/politician.rb

This file was deleted.

14 changes: 3 additions & 11 deletions scraper.rb
@@ -1,16 +1,8 @@
#!/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true

require 'everypolitician'
require 'pry'
require 'scraped'
require 'scraperwiki'
require_relative 'lib/cabinet'

require_relative 'lib/politician'

ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil
house = EveryPolitician::Index.new.country('Brazil').lower_house
house.popolo.persons.map(&:wikidata).compact.each_slice(100) do |wanted|
data = Wikisnakker::Politician.find(wanted).flat_map(&:positions).compact
ScraperWiki.save_sqlite(%i(id position start_date), data)
end
Scraped::Scraper.new('Q20058725' => CabinetScraper).store(:memberships, index: %i[position_id])

0 comments on commit e8fa830

Please sign in to comment.