From 78bbf12e818540049b85c2242a32fe9518be1a84 Mon Sep 17 00:00:00 2001
From: Mark Longair
Date: Thu, 15 Sep 2016 16:29:21 +0100
Subject: [PATCH] Initial commit

---
 .gitignore       |  2 ++
 requirements.txt | 10 ++++++++++
 scraper.py       | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 requirements.txt
 create mode 100755 scraper.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..77bf0ba
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/data.sqlite
+*~
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..bbc56b8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+# It's easy to add more libraries or choose different versions. Any libraries
+# specified here will be installed and made available to your morph.io scraper.
+# Find out more: https://morph.io/documentation/python
+
+# Custom version of scraperwiki library
+-e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki
+
+lxml==3.4.4
+cssselect==0.9.1
+requests==2.7.0
diff --git a/scraper.py b/scraper.py
new file mode 100755
index 0000000..3cda71b
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+import csv
+import re
+from urlparse import urlsplit
+
+import requests
+import scraperwiki
+
+'''This "scraper" just changes the columns in the YourNextMP elected
+candidates data from the UK 2015 general election'''
+
+url = 'https://candidates.democracyclub.org.uk/media/candidates-elected-2015.csv'
+
+r = requests.get(url, stream=True)
+
+for row in csv.DictReader(r.raw):
+    parlparse_person_id = re.sub(r'^.*/(\d+)$', r'\1', row['parlparse_id'])
+    wikiname = ''
+    if row['wikipedia_url']:
+        split = urlsplit(row['wikipedia_url'])
+        wikiname = split.path[len('/wiki/'):]
+        wikiname = wikiname.replace('_', ' ')
+    scraperwiki.sqlite.save(
+        unique_keys=['id'],
+        data={
+            'id': parlparse_person_id,
+            'name': row['name'],
+            'twitter': row['twitter_username'],
+            'facebook': row['facebook_page_url'],
+            'wikipedia': row['wikipedia_url'],
+            'wikiname': wikiname,
+            'birth_date': row['birth_date'],
+            'linkedin': row['linkedin_url'],
+            'image': row['image_url'],
+        }
+    )
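
Note (not part of the patch above): a minimal sketch of how the wikiname value in scraper.py is derived from a Wikipedia URL, assuming Python 2 as in the scraper (which imports urlsplit from urlparse; on Python 3 the same function lives in urllib.parse). The URL below is a hypothetical example; the real values come from the wikipedia_url column of the CSV.

    from urlparse import urlsplit  # Python 2, matching the scraper

    # Hypothetical article URL standing in for a row's wikipedia_url value
    wikipedia_url = 'https://en.wikipedia.org/wiki/Example_Person'

    split = urlsplit(wikipedia_url)
    wikiname = split.path[len('/wiki/'):]   # strip the leading '/wiki/' -> 'Example_Person'
    wikiname = wikiname.replace('_', ' ')   # underscores to spaces     -> 'Example Person'
    print(wikiname)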