From 20ecd569d9b003a029102369b07b0fc7d2794447 Mon Sep 17 00:00:00 2001
From: Tony Bowden
Date: Tue, 31 Jul 2018 09:45:12 +0100
Subject: [PATCH] Initial scraper

---
Review notes (text in this section is ignored by `git am`):
- scraper.rb: use the portable /usr/bin/env shebang, drop the leftover
  `rescue binding.pry` debugger hooks and the unused `data` local.
- Doc comments added to all three files; the `index` blob ids below are
  those of the original commit and were not recomputed.

 lib/remove_notes.rb      | 13 ++++++++
 lib/unspan_all_tables.rb | 18 +++++++++++
 scraper.rb               | 70 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+)
 create mode 100644 lib/remove_notes.rb
 create mode 100644 lib/unspan_all_tables.rb
 create mode 100755 scraper.rb

diff --git a/lib/remove_notes.rb b/lib/remove_notes.rb
new file mode 100644
index 0000000..c193863
--- /dev/null
+++ b/lib/remove_notes.rb
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+require 'scraped'
+
+# Response decorator that strips every sup element out of the HTML body.
+class RemoveNotes < Scraped::Response::Decorator
+  # @return [String] the body re-serialised without any sup elements
+  def body
+    Nokogiri::HTML(super).tap do |doc|
+      doc.css('sup').remove
+    end.to_s
+  end
+end
diff --git a/lib/unspan_all_tables.rb b/lib/unspan_all_tables.rb
new file mode 100644
index 0000000..277d93b
--- /dev/null
+++ b/lib/unspan_all_tables.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+require 'scraped'
+require 'table_unspanner'
+
+# Response decorator that swaps the children of every table.wikitable for
+# those of its TableUnspanner::UnspannedTable counterpart.
+class UnspanAllTables < Scraped::Response::Decorator
+  # @return [String] the body re-serialised with each wikitable unspanned
+  def body
+    Nokogiri::HTML(super).tap do |doc|
+      doc.css('table.wikitable').each do |table|
+        unspanned_table = TableUnspanner::UnspannedTable.new(table)
+        table.children = unspanned_table.nokogiri_node.children
+      end
+    end.to_s
+  end
+end
diff --git a/scraper.rb b/scraper.rb
new file mode 100755
index 0000000..fa6a52d
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,70 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'pry'
+require 'scraped'
+require 'scraperwiki'
+require 'wikidata_ids_decorator'
+
+require_relative 'lib/remove_notes'
+require_relative 'lib/unspan_all_tables'
+
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
+# The page of members: exposes one hash per row of the member tables.
+class MembersPage < Scraped::HTML
+  decorator RemoveNotes
+  decorator WikidataIdsDecorator::Links
+  decorator UnspanAllTables
+
+  # A MemberRow hash for each tr that has a td child, across all tables.
+  field :members do
+    members_table.flat_map do |table|
+      table.xpath('.//tr[td]').map { |tr| fragment(tr => MemberRow).to_h }
+    end
+  end
+
+  private
+
+  # Tables with a header cell containing "Diputado", skipping the first
+  # match in the document.
+  def members_table
+    noko.xpath('//table[.//th[contains(.,"Diputado")]]').drop(1)
+  end
+end
+
+# One table row: area text in the first cell, member link in the second,
+# faction link in the third.
+class MemberRow < Scraped::HTML
+  # Raises when the row has no second cell, just as the :id field does.
+  field :name do
+    tds[1].css('a').map(&:text).map(&:tidy).first
+  end
+
+  field :id do
+    tds[1].css('a/@wikidata').map(&:text).first
+  end
+
+  # Raises when the row has no third cell, just as :faction_id does.
+  field :faction do
+    tds[2].css('a').map(&:text).map(&:tidy).first
+  end
+
+  field :faction_id do
+    tds[2].css('a/@wikidata').map(&:text).first
+  end
+
+  field :area do
+    tds[0].text.tidy
+  end
+
+  private
+
+  def tds
+    noko.css('td')
+  end
+end
+
+url = 'https://es.wikipedia.org/wiki/LV_periodo_legislativo_del_Congreso_Nacional_de_Chile'
+Scraped::Scraper.new(url => MembersPage).store(:members, index: %i[name area faction])