diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..66d464d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Ignore output of scraper
+data.sqlite
diff --git a/README.textile b/README.textile
new file mode 100644
index 0000000..9003bf7
--- /dev/null
+++ b/README.textile
@@ -0,0 +1 @@
+Start here: check the ScraperWiki interface is working, then learn how to download a web page.
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..a69940c
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,33 @@
+###############################################################################
+# START HERE: Tutorial 1: Getting used to the ScraperWiki editing interface.
+# Follow the actions listed with -- BLOCK CAPITALS below.
+###############################################################################
+
+# -----------------------------------------------------------------------------
+# 1. Start by running a really simple Python script, just to make sure that
+# everything is working OK.
+# -- CLICK THE 'RUN' BUTTON BELOW
+# You should see some numbers print in the 'Console' tab below. If it doesn't work,
+# try reopening this page in a different browser - Chrome or the latest Firefox.
+# -----------------------------------------------------------------------------
+
+for i in range(10):
+    print("Hello %d" % i)  # one pre-formatted argument: same output on Python 2 and 3
+
+# -----------------------------------------------------------------------------
+# 2. Next, try scraping an actual web page and getting some raw HTML.
+# -- UNCOMMENT THE THREE LINES BELOW (i.e. delete the # at the start of the lines)
+# -- CLICK THE 'RUN' BUTTON AGAIN
+# You should see the raw HTML at the bottom of the 'Console' tab.
+# Click on the 'more' link to see it all, and the 'Sources' tab to see our URL -
+# you can click on the URL to see the original page.
+# -----------------------------------------------------------------------------
+
+#import scraperwiki
+#html = scraperwiki.scrape('https://scraperwiki.com/hello_world.html')
+#print(html)
+
+# -----------------------------------------------------------------------------
+# In the next tutorial, you'll learn how to extract the useful parts
+# from the raw HTML page.
+# -----------------------------------------------------------------------------
\ No newline at end of file