Skip to content

Commit

Permalink
Merge pull request #3 from everypolitician-scrapers/scrape-multiple-t…
Browse files Browse the repository at this point in the history
…erms-tb

Scrape multiple terms
  • Loading branch information
ondenman committed Oct 24, 2016
2 parents b1a794c + 4e27b37 commit 29c1eb6
Show file tree
Hide file tree
Showing 11 changed files with 163 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
data.sqlite
6 changes: 6 additions & 0 deletions .rubocop.yml
@@ -0,0 +1,6 @@
AllCops:
TargetRubyVersion: 2.1

inherit_from:
- https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml
- .rubocop_todo.yml
Empty file added .rubocop_todo.yml
Empty file.
23 changes: 12 additions & 11 deletions Gemfile
Expand Up @@ -2,17 +2,18 @@
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby

source "https://rubygems.org"
source 'https://rubygems.org'
git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }

ruby "2.0.0"
ruby '2.3.1'

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "execjs"
gem "pry"
gem "colorize"
gem "nokogiri"
gem "open-uri-cached"
gem "fuzzy_match"
gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
gem 'execjs'
gem 'pry'
gem 'colorize'
gem 'nokogiri'
gem 'open-uri-cached'
gem 'fuzzy_match'
gem 'wikidata-client', '~> 0.0.7', require: 'wikidata'


gem 'field_serializer', github: 'everypolitician/field_serializer'
gem 'rubocop'
25 changes: 23 additions & 2 deletions Gemfile.lock
@@ -1,3 +1,9 @@
GIT
remote: https://github.com/everypolitician/field_serializer.git
revision: 5de2b5898b8f56c79f62276362047940312f591a
specs:
field_serializer (0.1.0)

GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
Expand All @@ -10,6 +16,7 @@ GIT
GEM
remote: https://rubygems.org/
specs:
ast (2.3.0)
coderay (1.1.1)
colorize (0.8.1)
excon (0.52.0)
Expand All @@ -28,15 +35,27 @@ GEM
mini_portile2 (~> 2.1.0)
pkg-config (~> 1.1.7)
open-uri-cached (0.0.5)
parser (2.3.1.4)
ast (~> 2.2)
pkg-config (1.1.7)
powerpack (0.1.1)
pry (0.10.4)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
rainbow (2.1.0)
rubocop (0.44.1)
parser (>= 2.3.1.1, < 3.0)
powerpack (~> 0.1)
rainbow (>= 1.99.1, < 3.0)
ruby-progressbar (~> 1.7)
unicode-display_width (~> 1.0, >= 1.0.1)
ruby-progressbar (1.8.1)
slop (3.6.0)
sqlite3 (1.3.11)
sqlite_magic (0.0.6)
sqlite3
unicode-display_width (1.1.1)
wikidata-client (0.0.10)
excon (~> 0.40)
faraday (~> 0.9)
Expand All @@ -49,15 +68,17 @@ PLATFORMS
DEPENDENCIES
colorize
execjs
field_serializer!
fuzzy_match
nokogiri
open-uri-cached
pry
rubocop
scraperwiki!
wikidata-client (~> 0.0.7)

RUBY VERSION
ruby 2.3.0p0
ruby 2.3.1p112

BUNDLED WITH
1.12.5
1.13.5
27 changes: 27 additions & 0 deletions lib/khural_member.rb
@@ -0,0 +1,27 @@
# A single member row of the members table.
#
# Wraps a row node (exposed as `noko` by NokogiriDocument) and declares its
# values through the `field` macro. Name, Mongolian name and party are read
# counting cells from the END of the row; constituency is read from the first
# cell.
# NOTE(review): presumably the row has already been through UnspannedTable so
# that every row carries its own constituency cell — confirm against caller.
class KhuralMember < NokogiriDocument
  # Text of the link inside the name cell.
  field(:name) { name_cell.xpath('.//a').text.strip }

  # Text of the third cell from the end.
  field(:name_mn) { cell_text(-3) }

  # Text of the last cell.
  field(:party) { cell_text(-1) }

  # `title` attribute of the name link, ignoring links whose class is "new".
  field(:wikiname) { name_cell.xpath('.//a[not(@class="new")]/@title').text.strip }

  # First cell, with embedded newlines turned into an em-dash separator.
  field(:constituency) { cell_text(0).gsub("\n", ' — ') }

  private

  # All <td> cells of the row, looked up once and memoised.
  def tds
    @tds ||= noko.css('td')
  end

  # The cell holding the member's linked name (fourth from the end).
  def name_cell
    tds[-4]
  end

  # Stripped text content of the cell at +index+.
  def cell_text(index)
    tds[index].text.strip
  end
end
17 changes: 17 additions & 0 deletions lib/member_table.rb
@@ -0,0 +1,17 @@
require_relative 'nokogiri_document'
require_relative 'unspanned_table'
require_relative 'khural_member'

# The members table of a term page.
#
# Runs the wrapped table through UnspannedTable and turns every row that
# contains at least one <td> into a hash via KhuralMember#to_h.
class MemberTable < NokogiriDocument
  # One hash per data row of the un-spanned table.
  field(:members) { member_rows.map { |row| KhuralMember.new(row).to_h } }

  private

  # Rows with at least one <td> (rows made only of <th> are skipped).
  def member_rows
    table.xpath('.//tr[td]')
  end

  # The wrapped table with rowspan/colspan cells expanded.
  def table
    UnspannedTable.new(noko).transformed
  end
end
14 changes: 14 additions & 0 deletions lib/nokogiri_document.rb
@@ -0,0 +1,14 @@
require 'field_serializer'
require 'nokogiri'

# Base class for objects that wrap a parsed node and describe it with the
# `field` macro brought in by FieldSerializer.
class NokogiriDocument
  include FieldSerializer

  # @param noko the parsed node (or node set) this object reads from
  def initialize(noko)
    @noko = noko
  end

  # The wrapped node is an implementation detail: readable by this class and
  # its subclasses only.
  attr_reader :noko
  private :noko
end
14 changes: 14 additions & 0 deletions lib/term_page.rb
@@ -0,0 +1,14 @@
require_relative 'member_table'
require 'nokogiri'

# A parsed page listing the members elected for one term.
class TermPage < NokogiriDocument
  # Members found in the page's constituency table, as produced by
  # MemberTable#members.
  field(:members) { MemberTable.new(table).members }

  private

  # The first <table> following an <h2> whose <span> text contains
  # "Constituency".
  def table
    query = './/h2/span[text()[contains(.,"Constituency")]]/following::table[1]'
    noko.xpath(query)
  end
end
37 changes: 37 additions & 0 deletions lib/unspanned_table.rb
@@ -0,0 +1,37 @@
# Rebuilds an HTML table so that no cell uses rowspan or colspan: every
# spanning cell is repeated in each grid position it originally covered.
#
# Note: the span attributes are REMOVED from the cells of the table passed
# in, so the original node is modified as a side effect.
class UnspannedTable
  # @param noko_table the table node (or node set) to flatten; it must
  #   respond to `css`
  def initialize(noko_table)
    @original = noko_table
  end

  # Builds the flattened table once and memoises it.
  #
  # @return the result of parsing the rebuilt `<table>` markup with
  #   Nokogiri.HTML
  def transformed
    @transformed ||= Nokogiri.HTML("<table>#{rows_html}</table>")
  end

  private

  attr_reader :original

  # Serialises the expanded grid as a run of <tr> elements.
  def rows_html
    reparsed.map { |cells| "<tr>#{cells.map(&:to_html).join}</tr>" }.join
  end

  # Expands the table into a two-dimensional array (rows of cells) in which
  # a spanning cell occupies every slot it covers.
  def reparsed
    grid = []

    original.css('tr').each_with_index do |row, curr_x|
      row.css('td, th').each_with_index do |cell, curr_y|
        rowspan = take_span(cell, 'rowspan')
        colspan = take_span(cell, 'colspan')

        rowspan.times do |x|
          colspan.times do |y|
            # Slide right past slots already claimed by a cell spanning
            # down from an earlier row (or by an earlier cell in this row).
            curr_y += 1 while (grid[curr_x + x] ||= [])[curr_y + y]
            grid[curr_x + x][curr_y + y] = cell
          end
        end
      end
    end

    grid
  end

  # Removes the span attribute +name+ from +cell+ and returns its value as
  # an Integer.
  #
  # A missing attribute means a span of 1. Values that parse to less than 1
  # ("0", "", non-numeric) are also treated as 1: otherwise the cell would
  # never be placed in the grid, leaving a nil hole that breaks
  # serialisation.
  def take_span(cell, name)
    attribute = cell.remove_attribute(name)
    return 1 unless attribute

    [attribute.value.to_i, 1].max
  end
end
98 changes: 12 additions & 86 deletions scraper.rb
Expand Up @@ -2,95 +2,21 @@
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'open-uri/cached'
require_relative 'lib/term_page'

require 'pry'

# Wraps the members table node and turns its data rows into hashes.
class Table
  # @param node the table node; it must respond to `xpath`
  def initialize(node)
    @table = node
  end

  # Builds one hash per row containing a <td>.
  #
  # When a row's first cell carries a rowspan attribute, that cell is taken
  # as the constituency (newlines replaced by an em-dash separator) and
  # removed from the row; rows without one reuse the most recently seen
  # constituency.
  #
  # @return [Array<Hash>] Row#to_h for each row, plus a :constituency key
  def rows
    current = nil
    table.xpath('.//tr[td]').each_with_object([]) do |tr, collected|
      cells = tr.xpath('./td')
      if cells.first[:rowspan]
        current = cells.shift.text.strip.gsub("\n", ' — ')
      end
      collected << Row.new(cells).to_h.merge(constituency: current)
    end
  end

  private

  attr_reader :table
end

class Row
def initialize(tds)
@tds = tds
end

def to_h
{
name: name,
name__mn: name_mn,
party: party,
term: term,
wikiname: wikiname,
}
end

private

attr_reader :tds

def name
tds[1].xpath('.//a').text.strip
end

def name_mn
tds[2].text.strip
end

def party
tds[4].text.strip
end

def term
'2016'
end
base_url = 'https://en.wikipedia.org/wiki/'
terms = {
2016 => 'List_of_MPs_elected_in_the_Mongolian_legislative_election,_2016',
2012 => 'List_of_MPs_elected_in_the_Mongolian_legislative_election,_2012',
2008 => 'List_of_MPs_elected_in_the_Mongolian_legislative_election,_2008',
}

def wikiname
tds[1].xpath('.//a[not(@class="new")]/@title').text.strip
terms.each do |term, url|
noko = Nokogiri::HTML(open(base_url + url).read)
TermPage.new(noko).members.each do |mem|
mem[:term] = term
ScraperWiki.save_sqlite(%i(name term), mem)
end
end

# Fetches a members-list page by URL and exposes the member rows of its
# constituency table.
class Khurai
  # @param url [String] address of the page to fetch
  def initialize(url)
    @url = url
  end

  # @return the rows produced by Table#rows for the page's constituency table
  def members
    Table.new(table).rows
  end

  private

  attr_reader :url

  # Downloads and parses the page. Not memoised: every call fetches again.
  def page
    body = open(url).read
    Nokogiri::HTML(body)
  end

  # The first <table> following an <h2> whose <span> text contains
  # "Constituency".
  def table
    query = './/h2/span[text()[contains(.,"Constituency")]]/following::table[1]'
    page.xpath(query)
  end
end

# Scrape the 2016 members list and store each member, keyed on name + term.
url = 'https://en.wikipedia.org/wiki/' \
      'List_of_MPs_elected_in_the_Mongolian_legislative_election,_2016'

members = Khurai.new(url).members
members.each { |member| ScraperWiki.save_sqlite([:name, :term], member) }


0 comments on commit 29c1eb6

Please sign in to comment.