# scraper.rb — Kazakhstan Mazhilis scraper (97 lines / 77 loc, 2.22 KB)
#!/bin/env ruby
# encoding: utf-8
# Kazakhstan Mazhilis scraper
require 'rubygems'
require 'bundler/setup'
require 'pry'
require 'nokogiri'
require 'open-uri/cached'
require 'scraperwiki'
OpenURI::Cache.cache_path = '.cache'
# Monkey-patch String with a whitespace normaliser: collapse every run of
# whitespace (including newlines and Unicode spaces matched by
# [[:space:]]) into a single ASCII space, then trim both ends.
class String
  def tidy
    gsub(/[[:space:]]+/, ' ').strip
  end
end
# Fetch +url+ over HTTP and parse the response into a Nokogiri HTML
# document. Responses are cached on disk by open-uri/cached (see
# OpenURI::Cache.cache_path above).
#
# Uses URI.open instead of bare Kernel#open: the open-uri hook that let
# Kernel#open accept URLs was deprecated in Ruby 2.7 and removed in 3.0,
# so the original `open(url)` breaks on modern Rubies. URI.open is
# available from Ruby 2.5 onward and goes through the same open-uri
# (and therefore the same cache) machinery.
def page_for(url)
  Nokogiri::HTML(URI.open(url).read)
end
# Download the 26 per-letter deputy index pages ("#{base_url}a" through
# "#{base_url}z") and return them as an array of parsed documents.
def index_pages(base_url)
  ('a'..'z').map { |letter| page_for("#{base_url}#{letter}") }
end
# Extract one deputy's details from a single index-page entry node.
# Reads the $base_url global set at the bottom of the file. Returns a
# Hash with the deputy's name, summary line, Russian/Kazakh bio-page
# URLs, portrait image URL, and an :id taken from the last path segment
# of the bio page.
def scrape_index_page_details_for(person)
  info_column = person.css('div.col-md-8')
  # The index links carry a "/ru" prefix; strip it so we can build both
  # language variants from the same path.
  bio_path = info_column.css('a.links')[0]['href'].sub('/ru', '').tidy
  kazakh_url = "#{$base_url}/kk#{bio_path}".tidy
  portrait = person.css('div.col-md-4').css('a.links').css('img').attr('src')
  {
    name: info_column.css('a.links').text.tidy,
    summary: info_column.css('span').text.tidy,
    bio_page_russian: "#{$base_url}/ru#{bio_path}".tidy,
    bio_page_kazakh: kazakh_url,
    image: "#{$base_url}#{portrait}",
    id: kazakh_url.split('/').last
  }
end
# Collect the deputy records found on one parsed index page: one Hash
# (via scrape_index_page_details_for) per 'div.deputy-persons' entry.
def deputats_on(page)
  page.css('div.deputy-persons').map do |person|
    scrape_index_page_details_for(person)
  end
end
# Build a numeric identifier for +str+ by concatenating the Unicode
# codepoint of each character, e.g. "ab" -> "9798". Returns "" for "".
def generate_id_from(str)
  str.each_char.map(&:ord).join
end
# Site root shared (via global) with the scraping helpers above.
$base_url = 'http://www.parlam.kz'

# Fetch all 26 per-letter index pages and flatten them into one list of
# deputy records.
pages = index_pages("http://www.parlam.kz/en/mazhilis/People/DeputyList/")
deputats = pages.flat_map { |page| deputats_on(page) }

# Enrich each record with the Russian- and Kazakh-language biographies.
deputats = deputats.map do |person|
  # Russian bio page.
  page = page_for(person[:bio_page_russian])
  bio = page.css('div.bio')
  person[:name__ru] = bio.css('h2').text
  person[:summary__ru] = bio.css('p').text
  # NOTE(review): these store the whole parsed Nokogiri document, not a
  # URL string — presumably person[:bio_page_russian] was intended.
  # Kept as-is pending confirmation of the expected output schema.
  person[:website] = page
  person[:website__ru] = page

  # Kazakh bio page.
  page = page_for(person[:bio_page_kazakh])
  bio = page.css('div.bio')
  person[:name__kk] = bio.css('h2').text
  person[:summary__kk] = bio.css('p').text
  person[:website__kk] = page

  person[:term] = '6'
  person[:source] = 'http://www.parlam.kz/en/mazhilis/General/Deputats'
  # BUG FIX: the original passed the single-quoted literal '#{name}'
  # (single quotes do not interpolate, and no `name` local exists), so
  # every deputy was assigned the identical id "359512311109971091253".
  # Derive the id from the deputy's actual name instead; this overrides
  # the URL-slug :id set by scrape_index_page_details_for.
  person[:id] = generate_id_from(person[:name])
  person
end
# Persist every deputy into the local SQLite store, upserting on the
# (id, name) unique key.
deputats.each do |record|
  ScraperWiki.save_sqlite(%i[id name], record)
end