diff --git a/lib/remove_notes.rb b/lib/remove_notes.rb
new file mode 100644
index 0000000..c193863
--- /dev/null
+++ b/lib/remove_notes.rb
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+require 'scraped'
+
+# Response decorator that strips every <sup> element from the response body
+# (presumably footnote markers, going by the class name).
+class RemoveNotes < Scraped::Response::Decorator
+  # Returns the parsed body re-serialised to a string with all <sup> nodes removed.
+  def body
+    Nokogiri::HTML(super).tap do |doc|
+      doc.css('sup').remove
+    end.to_s
+  end
+end
diff --git a/lib/unspan_all_tables.rb b/lib/unspan_all_tables.rb
new file mode 100644
index 0000000..277d93b
--- /dev/null
+++ b/lib/unspan_all_tables.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+require 'scraped'
+require 'table_unspanner'
+
+# Response decorator that rewrites every `table.wikitable` in the response
+# body with the output of TableUnspanner.
+class UnspanAllTables < Scraped::Response::Decorator
+  # Returns the body as a string, with each wikitable's children replaced by
+  # the children of its TableUnspanner::UnspannedTable node.
+  def body
+    Nokogiri::HTML(super).tap do |doc|
+      doc.css('table.wikitable').each do |table|
+        unspanned_table = TableUnspanner::UnspannedTable.new(table)
+        table.children = unspanned_table.nokogiri_node.children
+      end
+    end.to_s
+  end
+end
diff --git a/scraper.rb b/scraper.rb
new file mode 100755
index 0000000..7d9a8d3
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,72 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require 'pry'
+require 'scraped'
+require 'scraperwiki'
+require 'wikidata_ids_decorator'
+
+require_relative 'lib/remove_notes'
+require_relative 'lib/unspan_all_tables'
+
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
+# The page scraped by this script (see `url` at the bottom of the file).
+# Exposes one hash per member list item.
+class MembersPage < Scraped::HTML
+  decorator WikidataIdsDecorator::Links
+  decorator RemoveNotes
+
+  # Array with one hash of MemberItem fields per matching <li>.
+  field :members do
+    member_items.map { |li| fragment(li => MemberItem).to_h }
+  end
+
+  private
+
+  # <li> nodes in lists inside the first table that follows, as a sibling, the
+  # table whose header cell contains "Lista e Deputeteve".
+  def member_items
+    noko.xpath('//table[.//th[contains(.,"Lista e Deputeteve")]]//following-sibling::table[1]//ul//li')
+  end
+end
+
+# A single <li> from the members list. Party and area are not inside the <li>
+# itself; they are read from the closest preceding <p> and <b> elements.
+class MemberItem < Scraped::HTML
+  # First non-blank text node inside the item, tidied; nil when there is none.
+  field :name do
+    noko.xpath('.//text()').map { |node| node.text.tidy }.find { |text| !text.empty? }
+  end
+
+  field :id do
+    noko.xpath('.//a/@wikidata').text
+  end
+
+  field :party do
+    last_party.text.tidy
+  end
+
+  field :party_id do
+    last_party.xpath('a/@wikidata').text
+  end
+
+  # Text of the closest preceding <b>, with every "Qarku " removed and tidied.
+  field :area do
+    noko.xpath('preceding::b').last.text.gsub('Qarku ', '').tidy
+  end
+
+  private
+
+  # Closest preceding <p> that contains non-whitespace text. Paragraphs holding
+  # only whitespace are skipped along with truly empty ones, otherwise such a
+  # paragraph would be chosen and yield a blank party. Memoised because both
+  # :party and :party_id read it.
+  def last_party
+    @last_party ||= noko.xpath('preceding::p').reject { |p| p.text.tidy.empty? }.last
+  end
+end
+
+url = 'https://sq.wikipedia.org/wiki/Kuvendi_i_Shqip%C3%ABris%C3%AB'
+Scraped::Scraper.new(url => MembersPage).store(:members, index: %i[name area party])