Permalink
Browse files

Update scraper.rb

  • Loading branch information...
chakshurai committed May 30, 2014
1 parent d2d4841 commit b1f530fc4950eb3b068386389dbe71dad05658b4
Showing with 20 additions and 24 deletions.
  1. +20 −24 scraper.rb
View
@@ -1,24 +1,20 @@
# This is a template for a Ruby scraper on Morph (https://morph.io)
# including some code snippets below that you should find helpful
# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")
# You don't have to do things with the Mechanize or ScraperWiki libraries. You can use whatever gems are installed
# on Morph for Ruby (https://github.com/openaustralia/morph-docker-ruby/blob/master/Gemfile) and all that matters
# is that your final data is written to an Sqlite database called data.sqlite in the current working directory which
# has at least a table called data.
#!/usr/bin/env python
import scraperwiki
import lxml.html
# Walk release ids 0..4999 on pib.nic.in, pull the date, ministry, title and
# raw content markup out of each release page, and store one row per release
# through scraperwiki.sql.save (keyed on 'releasenumber').
baselink = "http://pib.nic.in/newsite/erelease.aspx?relid="

for releaseid in range(0, 5000):
    url = baselink + str(releaseid)
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)

    # A page with no date span is skipped rather than saved.
    date = root.cssselect("div#ministry.mddiv span")
    if not date:
        continue

    # Guard the remaining lookups the same way: indexing [0] on an empty
    # match would raise IndexError and abort the whole run on one odd page.
    content = root.cssselect("div.contentdiv")
    title_rows = root.cssselect("div.contentdiv tr")
    if not content or not title_rows:
        continue

    data = {
        'releasenumber': releaseid,
        # Reuse the match from the guard above instead of re-querying.
        'date': date[0].text_content(),
        # .text is only the text before the div's first child element.
        'ministry': root.cssselect("div#ministry.mddiv")[0].text,
        'title': title_rows[0].text_content(),
        'content': lxml.html.tostring(content[0]),
    }

    # Progress marker; the call form prints the same on Python 2 and 3.
    print(releaseid)
    scraperwiki.sql.save(unique_keys=['releasenumber'], data=data)

0 comments on commit b1f530f

Please sign in to comment.