Skip to content

Commit

Permalink
Initial scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
tmtmtmtm committed Jul 31, 2018
1 parent c9a3c5e commit 20ecd56
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 0 deletions.
11 changes: 11 additions & 0 deletions lib/remove_notes.rb
@@ -0,0 +1,11 @@
# frozen_string_literal: true

require 'scraped'

# Response decorator that strips every <sup> element from the page body
# before it is handed on.
class RemoveNotes < Scraped::Response::Decorator
  # @return [String] the re-serialised HTML with all <sup> nodes removed
  def body
    document = Nokogiri::HTML(super)
    document.css('sup').remove
    document.to_s
  end
end
15 changes: 15 additions & 0 deletions lib/unspan_all_tables.rb
@@ -0,0 +1,15 @@
# frozen_string_literal: true

require 'scraped'
require 'table_unspanner'

# Response decorator that rewrites every `table.wikitable` in the page body,
# replacing its children with those produced by TableUnspanner.
class UnspanAllTables < Scraped::Response::Decorator
  # @return [String] the re-serialised HTML with each wikitable's contents
  #   swapped for the TableUnspanner output
  def body
    document = Nokogiri::HTML(super)
    document.css('table.wikitable').each do |wikitable|
      # Only the children are replaced, so the original <table> element itself stays in place.
      wikitable.children = TableUnspanner::UnspannedTable.new(wikitable).nokogiri_node.children
    end
    document.to_s
  end
end
62 changes: 62 additions & 0 deletions scraper.rb
@@ -0,0 +1,62 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'pry'
require 'scraped'
require 'scraperwiki'
require 'wikidata_ids_decorator'

require_relative 'lib/remove_notes'
require_relative 'lib/unspan_all_tables'

# Point the open-uri cache at a local `.cache` directory
# (presumably so repeated runs can reuse already-fetched pages).
require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'

# The page listing the members. The response is passed through three
# decorators: RemoveNotes, WikidataIdsDecorator::Links and UnspanAllTables.
class MembersPage < Scraped::HTML
  decorator RemoveNotes
  decorator WikidataIdsDecorator::Links
  decorator UnspanAllTables

  # One hash per data row (a <tr> that contains at least one <td>) across
  # all selected tables, each built by MemberRow.
  field :members do
    members_table.flat_map do |table|
      table.xpath('.//tr[td]').map { |tr| fragment(tr => MemberRow).to_h }
    end
  end

  private

  # Every table that has a header cell whose text contains "Diputado",
  # except the first such table in document order.
  # NOTE(review): the reason the first match is skipped is not visible here —
  # confirm against the page layout.
  def members_table
    noko.xpath('//table[.//th[contains(.,"Diputado")]]').drop(1)
  end
end

# A single table row. Cells are read by position: the first holds the area,
# the second the member, the third the faction.
#
# Every field raises NoMethodError when the row has no cell at the position
# it reads, so malformed rows fail loudly instead of opening a debugger.
class MemberRow < Scraped::HTML
  # Tidied text of the first link in the second cell; nil if it has no link.
  field :name do
    first_link_text(tds[1])
  end

  # `wikidata` attribute of the first link in the second cell that has one;
  # nil if none does.
  field :id do
    first_wikidata_id(tds[1])
  end

  # Tidied text of the first link in the third cell; nil if it has no link.
  field :faction do
    first_link_text(tds[2])
  end

  # `wikidata` attribute of the first link in the third cell that has one;
  # nil if none does.
  field :faction_id do
    first_wikidata_id(tds[2])
  end

  # Tidied text of the whole first cell.
  field :area do
    tds[0].text.tidy
  end

  private

  # All <td> cells under this row's node, in document order.
  def tds
    noko.css('td')
  end

  # Tidied text of the first <a> inside +cell+, or nil when there is none.
  def first_link_text(cell)
    cell.css('a').map(&:text).map(&:tidy).first
  end

  # Value of the first `wikidata` attribute found on an <a> inside +cell+,
  # or nil when there is none.
  # NOTE(review): the attribute is presumably added by
  # WikidataIdsDecorator::Links on the page — confirm.
  def first_wikidata_id(cell)
    cell.css('a/@wikidata').map(&:text).first
  end
end

# Scrape the page with MembersPage and store its :members rows, using
# name, area and faction together as the index.
source_url = 'https://es.wikipedia.org/wiki/LV_periodo_legislativo_del_Congreso_Nacional_de_Chile'
scraper = Scraped::Scraper.new(source_url => MembersPage)
scraper.store(:members, index: %i[name area faction])

0 comments on commit 20ecd56

Please sign in to comment.