Skip to content

Commit

Permalink
Initial scraper
Browse files (browse the repository at this point in the history)
  • Loading branch information
tmtmtmtm committed Sep 11, 2018
1 parent c7082b9 commit cfa2f92
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 0 deletions.
11 changes: 11 additions & 0 deletions lib/remove_notes.rb
@@ -0,0 +1,11 @@
# frozen_string_literal: true

require 'scraped'

# Response decorator that strips every <sup> element out of the HTML body
# (presumably the footnote/reference markers, going by the class name).
class RemoveNotes < Scraped::Response::Decorator
  # Parses the upstream body with Nokogiri, removes all <sup> nodes and
  # serialises the document back to a string.
  #
  # @return [String] the re-serialised HTML without any <sup> elements
  def body
    document = Nokogiri::HTML(super)
    document.css('sup').remove
    document.to_s
  end
end
15 changes: 15 additions & 0 deletions lib/unspan_all_tables.rb
@@ -0,0 +1,15 @@
# frozen_string_literal: true

require 'scraped'
require 'table_unspanner'

# Response decorator that rewrites every `table.wikitable` in the HTML body
# using TableUnspanner (presumably expanding rowspan/colspan cells so each
# row stands on its own -- confirm against the table_unspanner gem).
class UnspanAllTables < Scraped::Response::Decorator
  # Parses the upstream body with Nokogiri, swaps the children of each
  # matching table for the children of its unspanned equivalent, and
  # serialises the document back to a string.
  #
  # @return [String] the re-serialised HTML with the rewritten tables
  def body
    document = Nokogiri::HTML(super)
    document.css('table.wikitable').each do |wikitable|
      replacement = TableUnspanner::UnspannedTable.new(wikitable).nokogiri_node
      # Only the contents are replaced; the original <table> node stays put.
      wikitable.children = replacement.children
    end
    document.to_s
  end
end
59 changes: 59 additions & 0 deletions scraper.rb
@@ -0,0 +1,59 @@
#!/bin/env ruby
# frozen_string_literal: true

require 'pry'
require 'scraped'
require 'scraperwiki'
require 'wikidata_ids_decorator'

require_relative 'lib/remove_notes'
require_relative 'lib/unspan_all_tables'

require 'open-uri/cached'
# Point the open-uri/cached cache at the relative '.cache' directory.
OpenURI::Cache.cache_path = '.cache'

# Page object for the members listing. The response is passed through the
# WikidataIdsDecorator::Links and RemoveNotes decorators before parsing.
class MembersPage < Scraped::HTML
  decorator WikidataIdsDecorator::Links
  decorator RemoveNotes

  # One hash per member list item, each built by MemberItem.
  field :members do
    member_items.map do |item|
      fragment(item => MemberItem).to_h
    end
  end

  private

  # The <li> nodes under the first table that follows (as a sibling) the
  # table whose header cell contains "Lista e Deputeteve".
  def member_items
    list_items_xpath = '//table[.//th[contains(.,"Lista e Deputeteve")]]//following-sibling::table[1]//ul//li'
    noko.xpath(list_items_xpath)
  end
end

# Fragment wrapping a single member <li>. Party and area are not inside the
# item itself; they are read from elements that precede it in the document.
class MemberItem < Scraped::HTML
  # Tidied content of the first text node found inside the item
  # (nil when the item has no text nodes at all).
  field :name do
    noko.xpath('.//text()').map { |node| node.text.tidy }.first
  end

  # Concatenated `wikidata` attribute values of the links inside the item
  # (the attribute is presumably added by WikidataIdsDecorator::Links).
  field :id do
    wikidata_attrs = noko.xpath('.//a/@wikidata')
    wikidata_attrs.text
  end

  # Tidied text of the closest preceding non-empty paragraph.
  field :party do
    party_node = last_party
    party_node.text.tidy
  end

  # `wikidata` attribute of the link(s) that are direct children of that
  # same paragraph.
  field :party_id do
    party_links = last_party.xpath('a/@wikidata')
    party_links.text
  end

  # Text of the closest preceding <b>, with every 'Qarku ' removed and the
  # remainder tidied.
  field :area do
    heading = noko.xpath('preceding::b').last
    heading.text.gsub('Qarku ', '').tidy
  end

  private

  # Walks the preceding <p> elements from nearest to farthest and returns the
  # first one whose text is not empty (nil when there is none).
  def last_party
    noko.xpath('preceding::p').reverse_each.find { |para| !para.text.empty? }
  end
end

# Entry point: run MembersPage against the sq.wikipedia.org parliament page
# and store its :members rows, indexed on name, area and party.
url = 'https://sq.wikipedia.org/wiki/Kuvendi_i_Shqip%C3%ABris%C3%AB'
scraper = Scraped::Scraper.new(url => MembersPage)
scraper.store(:members, index: %i[name area party])

0 comments on commit cfa2f92

Please sign in to comment.