In [1]:
import json
import re

from bs4 import BeautifulSoup
from collections import namedtuple
from urllib import urlopen

In [2]:
BASE_URL = 'http://info.mzalendo.com'
PROFILES_ENDPOINT = '/position/member-national-assembly/'
NUM_PROFILES = 348
MAX_PER_PAGE = 10
PAGES = (NUM_PROFILES // MAX_PER_PAGE) + 1
DATA_FILE = 'political_persons_ke_2013_2017.txt'

In [3]:
def make_soup(url):
    try:
        html = urlopen(url).read()
        return BeautifulSoup(html, "lxml")
    except IOError:
        pass
    
def list_per_page(soup):
    members = soup.find('ul', 'position-listing').findAll('li')
    return [member.a.get('href') for member in members]

profiles = []
for page in range(1, PAGES + 1):
    url = '{}{}?page={}'.format(BASE_URL, PROFILES_ENDPOINT, page)
    print 'processing: {}'.format(url)
    soup = make_soup(url)
    profiles += list_per_page(soup)

assert len(profiles) == NUM_PROFILES, 'not all persons fetched'

processing: http://info.mzalendo.com/position/member-national-assembly/?page=1
processing: http://info.mzalendo.com/position/member-national-assembly/?page=2
processing: http://info.mzalendo.com/position/member-national-assembly/?page=3
processing: http://info.mzalendo.com/position/member-national-assembly/?page=4
processing: http://info.mzalendo.com/position/member-national-assembly/?page=5
processing: http://info.mzalendo.com/position/member-national-assembly/?page=6
processing: http://info.mzalendo.com/position/member-national-assembly/?page=7
processing: http://info.mzalendo.com/position/member-national-assembly/?page=8
processing: http://info.mzalendo.com/position/member-national-assembly/?page=9
processing: http://info.mzalendo.com/position/member-national-assembly/?page=10
processing: http://info.mzalendo.com/position/member-national-assembly/?page=11
processing: http://info.mzalendo.com/position/member-national-assembly/?page=12
processing: http://info.mzalendo.com/position/mem

In [14]:
def profile_attributes(soup):
    Contact = namedtuple('Contact', ['name', 'dob', 'email', 'cell', 'area', 'position', 'attendance'])
    
    # name
    name = soup.select('div .object-titles')
    profile_name = name[0].h1.get_text(strip=True)
    
    # date of birth, email, cellphone
    contacts_contents = soup.select('div .contact-details > p')
    contacts_headers = soup.select('div .contact-details > h3')
    contacts_headers = [header.get_text().lower() for header in contacts_headers]
    contacts_contents = [contact.get_text() for contact in contacts_contents]
    contacts = dict(zip(contacts_headers, contacts_contents))
    
    # title, area represented
    title = soup.select('div .person-detail-experience > ul')
    position = [re.sub(' +', ' ', text.replace('\n', ''))
                for text in title[0].h4.stripped_strings]
    if len(position) == 1:
        # nominated member
        position.append(u'nominated')
    
    # house appearances
    attendance = soup.select('div .person-detail-hansard > a')
    if attendance:
        attendance = re.search('\d+', attendance[0].get_text()).group()
    else:
        attendance = 0
    
    # bills
    # use more accurate data here: http://kenyalaw.org/kl/
    
    contact = Contact(
        profile_name,
        contacts.get('born', ''),
        contacts.get('email', ''),
        contacts.get('telephone', ''),
        position[1],
        position[0],
        attendance)
    
    return json.dumps(contact._asdict())

# url = '{}{}'.format(BASE_URL, profiles[336])
# soup = make_soup(url)
# print profile_attributes(soup)

In [15]:
#profiles = profiles[:3]
# retry making soup

def fetch_all(profiles):
    for profile in profiles:
        url = '{}{}'.format(BASE_URL, profile)
        soup = make_soup(url)
        attrs = profile_attributes(soup)
        
        # write to file
        with open(DATA_FILE, 'a') as fh:
            print attrs
            fh.write(attrs + '\n')
            
            
#print len(profiles)
fetch_all(profiles)

{"name": "Samuel Kiprono Chepkonga", "dob": "4th September 1964", "email": "chepkonga@wananchi.com ", "cell": "0722996469", "area": "Ainabkoi", "position": "Member of the National Assembly for", "attendance": "2789"}
{"name": "Benjamin Kipkirui Langat", "dob": "24th November 1976", "email": "lkbenjami@yahoo.com ", "cell": "0722895939", "area": "Ainamoi", "position": "Member of the National Assembly for", "attendance": "2063"}
{"name": "Cornelly Serem", "dob": "31st December 1970", "email": "cornel66101@yahoo.com ", "cell": "0720826869", "area": "Aldai", "position": "Member of the National Assembly for", "attendance": "135"}
{"name": "George Washinton Mallan Omondi", "dob": "14th September 1953", "email": "lvfkaruoth@yahoo.co.uk ", "cell": "0722778509", "area": "Alego Usonga", "position": "Member of the National Assembly for", "attendance": "197"}
{"name": "Jared Odhiambo Opiyo", "dob": "1st June 1973", "email": "jaredsandy22@yahoo.com ", "cell": "0703336111", "area": "Awendo", "positio