#!/usr/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true

require 'nokogiri'
require 'pry'
require 'scraped'
require 'scraperwiki'

require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
# require 'scraped_page_archive/open-uri'

# The Thai Senate URL gives photos and Senate numerical IDs.
# Note: the '{PAGE-NUMBER}' placeholder is replaced on the fly.
thai_senate_url = 'https://www.senate.go.th/w3c/senate/senator.php?id=18&page={PAGE-NUMBER}&orby=&orrg=ASC'

# Party: currently (2015) the assembly is appointed by the military junta,
# the NCPO.
# The Senate website has a term_id, but it doesn't seem to map to anything
# (changing it makes no difference to the results). As this assembly was
# appointed when the coup happened in 2557 BE (Thai calendar), that year is
# used as the term.
# Fetch a URL (via the on-disk cache) and parse it as HTML
def noko_for(url)
  Nokogiri::HTML(open(url).read)
end

# Substitute the page number into the paginated Senate URL
def senate_url(url, page_number)
  url.sub('{PAGE-NUMBER}', page_number.to_s)
end
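
# For illustration, the substitution produces, e.g.:
#   senate_url(thai_senate_url, 3)
#   # => 'https://www.senate.go.th/w3c/senate/senator.php?id=18&page=3&orby=&orrg=ASC'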
# Thai honorifics are found by scraping Wikipedia's entry for the Senate.
# We can't derive them from the official page because there isn't always a
# space between the honorific and the name, etc.
# Note: callers should sort this list by descending honorific length
# (longest first). This matters because some honorifics are compounds of
# shorter ones, and the shorter one would otherwise match first.
# TODO: move this to another scraper
# Wikipedia has kindly separated out the honorifics.
# Escape the Thai characters so open-uri accepts the URL.
WIKIPEDIA_URL = URI::DEFAULT_PARSER.escape('https://th.wikipedia.org/wiki/สภานิติบัญญัติแห่งชาติ_(ประเทศไทย)_พ.ศ._2557')
def wikipedia_honorifics
  noko = noko_for(WIKIPEDIA_URL)

  list_section_header = 'การแต่งตั้งรอบแรก' # "First-round appointments"
  next_section_header = 'ข่าวเพิ่มเติม'       # "Further news"

  # Trim the page to just the appointments list: drop everything before the
  # list section header and everything after the following section header.
  list_header = noko.xpath('.//span[.="%s"]' % list_section_header)
  raise "Can't find #{list_section_header}" if list_header.empty?
  list_header.xpath('.//preceding::*').remove

  next_header = noko.xpath('.//span[.="%s"]' % next_section_header)
  raise "Can't find #{next_section_header}" if next_header.empty?
  next_header.xpath('.//following::*').remove

  # Whilst we're here, also fetch and store all the linked Wikinames
  wikinames = noko.xpath('.//ol//li//a[not(@class="new")]/@title').map(&:text).map { |n| { name: n } }
  ScraperWiki.save_sqlite([:name], wikinames, 'wikinames')

  # The honorific is the text node before each linked name
  noko.xpath('.//ol//li[a]').map { |n| n.children.first }.map(&:text).map(&:tidy).uniq
end
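
# The returned list is a flat array of honorific strings, e.g. (illustrative
# values only; the real list comes from the live Wikipedia page):
#   ['พลเอก', 'ศาสตราจารย์', 'นาย', ...]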
# Strip a leading honorific from a raw name; returns [name, honorific],
# where honorific is nil if none of the candidates matched.
def split_honorific_and_name(raw_name, honorifics)
  honorific = nil
  name = raw_name.tidy
  honorifics.each do |hon|
    if name.sub!(/^#{hon}/, '')
      honorific = hon
      break
    end
  end
  [name.tidy, honorific]
end
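
# Worked example (hypothetical name and list): sorting longest-first means a
# compound title is stripped whole rather than just its prefix:
#   split_honorific_and_name('ศาสตราจารย์พิเศษสมชาย ใจดี',
#                            ['ศาสตราจารย์พิเศษ', 'ศาสตราจารย์'])
#   # => ['สมชาย ใจดี', 'ศาสตราจารย์พิเศษ']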
# Scrape one page of the senator listing and save each row
def scrape_senate_page(url, page_number)
  # Longest first, so compound honorifics match before their shorter prefixes
  honorifics = wikipedia_honorifics.sort_by(&:length).reverse
  url = senate_url(url, page_number)
  noko = noko_for(url)
  noko.xpath('//div[@id="maincontent"]//table[1]/tr[td]').each do |tr|
    tds = tr.css('td')
    senate_id = tds[0].text.tidy
    next unless senate_id =~ /^\d+$/
    raw_name = tds[2].text.tidy
    name, honorific = split_honorific_and_name(raw_name, honorifics)
    image_url = tds[1].xpath('./img/@src').text.tidy
    data = {
      # The photo filename carries each senator's numeric ID
      id: image_url[/([\d_]+)\.JPG/, 1].sub(/^_/, ''),
      name: name,
      image: image_url,
      honorific_prefix: honorific,
      party: 'NCPO',
      term: '2557',
      source: url,
    }
    puts data.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h if ENV['MORPH_DEBUG']
    ScraperWiki.save_sqlite([:id], data)
  end
end
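
# A saved record looks roughly like this (illustrative values, not taken
# from the live site):
#   { id: '904', name: 'สมชาย ใจดี', honorific_prefix: 'นาย',
#     party: 'NCPO', term: '2557', image: '...', source: '...' }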
# The pager's last link gives the total number of pages
# ('หน้า' is Thai for 'page')
def number_of_senate_pages(url)
  url = senate_url(url, 1)
  noko_for(url).xpath("//*[text()[contains(.,'หน้า')]]/a[last()]").text.to_i
end
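
# This assumes pager markup along these lines (an assumption about the live
# page, not verified here):
#   หน้า <a href="...page=1...">1</a> ... <a href="...page=7...">7</a>
# so the text of the last <a> after the 'หน้า' label is the page count.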

# Start with a clean table each run
ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil

(1..number_of_senate_pages(thai_senate_url)).each do |page_number|
  scrape_senate_page(thai_senate_url, page_number)
end