From 2c5e35db84365647766ec7aec617e95089ba4e6c Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Sun, 13 Sep 2015 20:17:50 +0100 Subject: [PATCH] initial scraper --- .gitignore | 6 ++++ Gemfile | 17 +++++++++++ Gemfile.lock | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++ scraper.rb | 29 ++++++++++++++++++ 4 files changed, 135 insertions(+) create mode 100644 .gitignore create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 scraper.rb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c5add4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ + +.cache/* + +*.swp + +*.sqlite diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..5a50e91 --- /dev/null +++ b/Gemfile @@ -0,0 +1,17 @@ +# It's easy to add more libraries or choose different versions. Any libraries +# specified here will be installed and made available to your morph.io scraper. +# Find out more: https://morph.io/documentation/ruby + +source "https://rubygems.org" + +ruby "2.0.0" + +gem "colorize" +gem "mediawiki_api" +gem "nokogiri" +gem "open-uri-cached" +gem "pry" +gem "rest-client" +gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults" +gem "wikidata-fetcher", '>=0.4.0', git: "https://github.com/everypolitician/wikidata-fetcher.git" + diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..26109a5 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,83 @@ +GIT + remote: https://github.com/everypolitician/wikidata-fetcher.git + revision: 914fea8b17b047a143ed3667d650f1fd0e221f1f + specs: + wikidata-fetcher (0.4.0) + colorize + diskcached + mediawiki_api + wikidata-client (~> 0.0.7) + +GIT + remote: https://github.com/openaustralia/scraperwiki-ruby.git + revision: fc50176812505e463077d5c673d504a6a234aa78 + branch: morph_defaults + specs: + scraperwiki (3.0.1) + httpclient + sqlite_magic + +GEM + remote: https://rubygems.org/ + specs: + coderay (1.1.0) + colorize (0.7.7) + diskcached 
(1.1.2) + domain_name (0.5.24) + unf (>= 0.0.5, < 1.0.0) + excon (0.45.4) + faraday (0.9.1) + multipart-post (>= 1.2, < 3) + faraday-cookie_jar (0.0.6) + faraday (>= 0.7.4) + http-cookie (~> 1.0.0) + faraday_middleware (0.10.0) + faraday (>= 0.7.4, < 0.10) + hashie (3.4.2) + http-cookie (1.0.2) + domain_name (~> 0.5) + httpclient (2.6.0.1) + mediawiki_api (0.4.1) + faraday (~> 0.9, >= 0.9.0) + faraday-cookie_jar (~> 0.0, >= 0.0.6) + method_source (0.8.2) + mime-types (2.6.1) + mini_portile (0.6.2) + multipart-post (2.0.0) + netrc (0.10.3) + nokogiri (1.6.6.2) + mini_portile (~> 0.6.0) + open-uri-cached (0.0.5) + pry (0.10.1) + coderay (~> 1.1.0) + method_source (~> 0.8.1) + slop (~> 3.4) + rest-client (1.8.0) + http-cookie (>= 1.0.2, < 2.0) + mime-types (>= 1.16, < 3.0) + netrc (~> 0.7) + slop (3.6.0) + sqlite3 (1.3.10) + sqlite_magic (0.0.3) + sqlite3 + unf (0.1.4) + unf_ext + unf_ext (0.0.7.1) + wikidata-client (0.0.8) + excon (~> 0.40) + faraday (~> 0.9) + faraday_middleware (~> 0.9) + hashie (~> 3.3) + +PLATFORMS + ruby + +DEPENDENCIES + colorize + mediawiki_api + nokogiri + open-uri-cached + pry + rest-client + scraperwiki! + wikidata-fetcher (>= 0.4.0)! 
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..767aba5
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,46 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'json'
+require 'pry'
+require 'rest-client'
+require 'scraperwiki'
+require 'wikidata/fetcher'
+require 'mediawiki_api'
+
+# Fetches the distinct `wikiname` values published by the
+# northern_cyprus_parliament_wikipedia scraper through the morph.io API.
+#
+# Returns the parsed JSON response with symbolized keys; the caller below
+# expects a list of rows that each respond to [:wikiname].
+# Raises KeyError when MORPH_API_KEY is not set in the environment.
+def members
+  morph_api_url = 'https://api.morph.io/tmtmtmtm/northern_cyprus_parliament_wikipedia/data.json'
+  # Fail fast with a clear error when the key is missing, rather than
+  # sending a request without one.
+  morph_api_key = ENV.fetch('MORPH_API_KEY')
+  result = RestClient.get morph_api_url, params: {
+    key: morph_api_key,
+    query: "select DISTINCT(wikiname) AS wikiname from data"
+  }
+  JSON.parse(result, symbolize_names: true)
+end
+
+# Look up ids for the English ('en') page names, then fetch each id's data
+# with the 'tr' argument (presumably the Turkish language code) and store it
+# in SQLite, using :id as the unique key.
+WikiData.ids_from_pages('en', members.map { |c| c[:wikiname] }).each do |page|
+  data = begin
+    WikiData::Fetcher.new(id: page.last).data('tr')
+  rescue StandardError => e
+    # Keep the run best-effort, but report why the fetch failed instead of
+    # silently discarding the error.
+    warn "Fetch failed for #{page}: #{e.class}: #{e.message}"
+    nil
+  end
+  unless data
+    warn "No data for #{page}"
+    next
+  end
+  ScraperWiki.save_sqlite([:id], data)
+end