From df096b9a97e95eb891904211ba28d9fedba5859d Mon Sep 17 00:00:00 2001 From: cocharro Date: Fri, 29 Apr 2016 14:12:10 +0100 Subject: [PATCH] Add template for morph.io scraper --- .gitignore | 2 ++ README.md | 1 + requirements.txt | 9 +++++++++ runtime.txt | 1 + scraper.py | 24 ++++++++++++++++++++++++ 5 files changed, 37 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 runtime.txt create mode 100644 scraper.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66d464d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +# Ignore output of scraper +data.sqlite diff --git a/README.md b/README.md new file mode 100644 index 0000000..e541894 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..fce25cc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +# It's easy to add more libraries or choose different versions. Any libraries +# specified here will be installed and made available to your morph.io scraper. 
+# Find out more: https://morph.io/documentation/python + +# Custom version of scraperwiki library +-e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki + +lxml==3.4.4 +cssselect==0.9.1 diff --git a/runtime.txt b/runtime.txt new file mode 100644 index 0000000..c47075b --- /dev/null +++ b/runtime.txt @@ -0,0 +1 @@ +python-2.7.9 diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..69bea68 --- /dev/null +++ b/scraper.py @@ -0,0 +1,24 @@ +# This is a template for a Python scraper on morph.io (https://morph.io) +# including some code snippets below that you should find helpful + +# import scraperwiki +# import lxml.html +# +# # Read in a page +# html = scraperwiki.scrape("http://foo.com") +# +# # Find something on the page using css selectors +# root = lxml.html.fromstring(html) +# root.cssselect("div[align='left']") +# +# # Write out to the sqlite database using scraperwiki library +# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"}) +# +# # An arbitrary query against the database +# scraperwiki.sql.select("* from data where name='peter'") + +# You don't have to do things with the ScraperWiki and lxml libraries. +# You can use whatever libraries you want: https://morph.io/documentation/python +# All that matters is that your final data is written to an SQLite database +# called "data.sqlite" in the current working directory which has at least a table +# called "data".