Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mhl committed Nov 7, 2017
0 parents commit 7335d1b
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
/scraperwiki.sqlite
1 change: 1 addition & 0 deletions .ruby-version
@@ -0,0 +1 @@
2.4.2
9 changes: 9 additions & 0 deletions Gemfile
@@ -0,0 +1,9 @@
# Fetch gems from the public RubyGems index.
source 'https://rubygems.org'

# Expand the `github: 'owner/repo'` shorthand used below into an HTTPS clone URL.
git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }

gem 'open-uri-cached'
gem 'pry'
gem 'scraped'
# Installed from the `morph_defaults` branch of the GitHub repository rather than from RubyGems.
gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby', branch: 'morph_defaults'
# Installed directly from its GitHub repository.
gem 'wikidata_ids_decorator', github: 'everypolitician/wikidata_ids_decorator'
105 changes: 105 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,105 @@
GIT
remote: https://github.com/everypolitician/wikidata_ids_decorator.git
revision: 259c6204d9435875c3cf3ba9216b05603b8fbc7c
specs:
wikidata_ids_decorator (0.1.0)
pry
scraped
wikidata-fetcher

GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic

GEM
remote: https://rubygems.org/
specs:
coderay (1.1.2)
colorize (0.8.1)
diskcached (1.1.3)
domain_name (0.5.20170404)
unf (>= 0.0.5, < 1.0.0)
excon (0.59.0)
faraday (0.13.1)
multipart-post (>= 1.2, < 3)
faraday-cookie_jar (0.0.6)
faraday (>= 0.7.4)
http-cookie (~> 1.0.0)
faraday_middleware (0.12.2)
faraday (>= 0.7.4, < 1.0)
field_serializer (0.3.0)
hashie (3.5.6)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.8.3)
json (2.1.0)
mediawiki_api (0.7.1)
faraday (~> 0.9, >= 0.9.0)
faraday-cookie_jar (~> 0.0, >= 0.0.6)
faraday_middleware (~> 0.10, >= 0.10.0)
method_source (0.9.0)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.3.0)
multipart-post (2.0.0)
netrc (0.11.0)
nokogiri (1.8.1)
mini_portile2 (~> 2.3.0)
open-uri-cached (0.0.5)
pry (0.11.2)
coderay (~> 1.1.0)
method_source (~> 0.9.0)
require_all (1.4.0)
rest-client (2.0.2)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
scraped (0.5.0)
field_serializer (>= 0.3.0)
nokogiri
require_all
sqlite3 (1.3.13)
sqlite_magic (0.0.6)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.4)
wikidata-client (0.0.11)
excon (~> 0.40)
faraday (~> 0.9)
faraday_middleware (~> 0.9)
hashie (~> 3.3)
wikidata-fetcher (0.19.1)
colorize
diskcached
json
mediawiki_api
nokogiri
require_all
rest-client
scraperwiki
wikidata-client (~> 0.0.7)
wikisnakker
wikisnakker (0.9.1)
require_all
yajl-ruby
yajl-ruby (1.3.0)

PLATFORMS
ruby

DEPENDENCIES
open-uri-cached
pry
scraped
scraperwiki!
wikidata_ids_decorator!

BUNDLED WITH
1.16.0
65 changes: 65 additions & 0 deletions scraper.rb
@@ -0,0 +1,65 @@
#!/usr/bin/env ruby

require 'scraperwiki'
require 'scraped'
require 'wikidata_ids_decorator'

require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
# require 'open-uri'

# Extracts one city's details from a single table row of the list page.
#
# The city link is read from the row's <th> cell and the country link from
# the row's first <td> cell; the population is parsed from the third <td>.
#
# NOTE(review): the `wikidata` attribute read from the links is not set in
# this class; presumably it is added by the WikidataIdsDecorator::Links
# decorator declared on ListPage -- confirm.
class CityRow < Scraped::HTML
  # Text of the city link in the row's header cell.
  field :city_name do
    city_link.text
  end

  # Value of the `wikidata` attribute on the city link.
  field :city_wikidata do
    city_link.attribute('wikidata').value
  end

  # Text of the country link in the row's first data cell.
  # (Previously this returned the city link's text, duplicating city_name.)
  field :country_name do
    country_link.text
  end

  # Value of the `wikidata` attribute on the country link.
  field :country_wikidata do
    country_link.attribute('wikidata').value
  end

  # Population as an Integer. Keeps only the leading run of digits and commas
  # in the cell (dropping anything after it), then strips the commas.
  # Integer() raises ArgumentError if no number is left to parse.
  field :population do
    Integer(population_td.text.sub(/^\s*([0-9,]+).*/m, '\1').tr(',', ''))
  end

  private

  # Links inside the first <td> of the row.
  def country_link
    tds[0].css('a')
  end

  # Links inside the row's <th> cell (memoized).
  def city_link
    @city_link ||= noko.css('th').css('a')
  end

  # All <td> cells of the row (memoized).
  def tds
    @tds ||= noko.css('td')
  end

  # The third <td> of the row, which holds the population figure.
  def population_td
    tds[2]
  end
end

# The page listing the cities: exposes every data row of its wikitable(s)
# as a CityRow fragment.
class ListPage < Scraped::HTML
  decorator WikidataIdsDecorator::Links

  # One CityRow per table row, in document order.
  field :members do
    city_rows.map { |tr| fragment(tr => CityRow) }
  end

  private

  # Rows directly under any table whose class contains "wikitable",
  # minus the first two rows, which are skipped (presumably header rows).
  def city_rows
    noko.xpath('//table[contains(@class, "wikitable")]/tr').drop(2)
  end
end

# Scrape the list page and convert each city row into a plain hash.
url = 'https://en.wikipedia.org/wiki/List_of_largest_cities'
data = ListPage.new(response: Scraped::Request.new(url: url).response).members.map(&:to_h)

# Start from an empty table on every run. `IF EXISTS` makes the drop a no-op
# when the table has not been created yet, without hiding unrelated database
# errors the way an inline `rescue nil` would.
ScraperWiki.sqliteexecute('DROP TABLE IF EXISTS data')
# Save the rows, passing city_wikidata as the unique key.
ScraperWiki.save_sqlite([:city_wikidata], data)

0 comments on commit 7335d1b

Please sign in to comment.