In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os.path
from glob import glob
import bs4
import urllib

In [2]:
base = 'https://chicago.councilmatic.org/'
suffix = '/search/?selected_facets=topics_exact%3AZoning+Reclassification'
page_query = '&page={num}'
resp = urllib.request.urlopen(base + suffix + page_query.format(num=1))
soup = bs4.BeautifulSoup(resp.read(), "html5lib")

In [4]:
def get_ords(soup):
    '''get each ordinance, date, status, & link'''
    ord_info = {}
    for link in soup.find_all('a', class_='small'):
        ord_info['ord'] = link.contents[0].split()[1]
        ord_info['href'] = link.get('href')
        ord_info['status']:str(link.next_element.next_element.next_element.string).strip()

    for date in soup.find_all('i', class_='fa fa-fw fa-calendar-o'):
        ord_info['date'] = date.next_element.split('-')[0].strip()
    return ord_info


def get_locs(inner_soup, ord_info):
    '''get locations mentioned in ordinance'''
    locs = []
    for inner_link in inner_soup.find_all('ul', class_='list-unstyled'):
        if str(inner_link.previous_element.previous_element).strip() == 'Locations mentioned':
            for list_item in inner_link.find_all('li'):
                locs.append(list_item.string.strip())
    ord_info['locations'] = locs
    return ord_info['locations']


def get_sponsors(inner_soup):
    sponsors = []
    wards = []
    inner_link = inner_soup.find('div', class_='table-responsive')
    spo = 1
    war = 1
    for col in inner_link.find_all('td'):
        if col.find('a'):
            sponsors.append(col.a.string.strip())
            spo += 1
        if col.string:
            wards.append(col.string.strip())
            war +=1
    return sponsors,wards

In [None]:
# loop below is currently doing one ord per page: need to adjust to loop through each page....

In [7]:
ord_list = []
page_num = 0
while True:
    page_num = page_num+1
    try:
        resp = urllib.request.urlopen(base + suffix + page_query.format(num=page_num))
        soup = bs4.BeautifulSoup(resp.read(), "html5lib")
        
#         get ord info & link
        ord_info = get_ords(soup)
        ord_info['year']= int(ord_info['date'][-4:])
        if ord_info['year'] < 2015:
            break
        
        # go to ordinance page to get additional info
        inner_resp = urllib.request.urlopen(base + ord_info['href'])
        inner_soup = bs4.BeautifulSoup(inner_resp.read(), "html5lib")
#         get location
        ord_info['locations'] = get_locs(inner_soup, ord_info)
        sponsors, spo_wards = get_sponsors(inner_soup)
        ord_info['sponsors'] = sponsors
        ord_info['sponsor_ward'] = spo_wards
        
        
#         append dict to list
        ord_list.append(ord_info)
    except:
        break


In [8]:
ord_list

[{'ord': 'O2019-365',
  'href': '/legislation/o2019-365/',
  'date': '1/23/2019',
  'year': 2019,
  'locations': ['4737-4739 N Pulaski Rd Chicago'],
  'sponsors': ['Laurino, Margaret'],
  'sponsor_ward': ['Ward 39']},
 {'ord': 'O2019-294',
  'href': '/legislation/o2019-294/',
  'date': '1/23/2019',
  'year': 2019,
  'locations': ['1443 W Augusta Blvd Chicago'],
  'sponsors': ['Committee on Zoning, Landmarks and Building Standards',
   'Chicago City Council'],
  'sponsor_ward': []},
 {'ord': 'O2019-266',
  'href': '/legislation/o2019-266/',
  'date': '1/23/2019',
  'year': 2019,
  'locations': ['2222 W 21st St Chicago'],
  'sponsors': ['Committee on Zoning, Landmarks and Building Standards',
   'Chicago City Council'],
  'sponsor_ward': []},
 {'ord': 'O2019-304',
  'href': '/legislation/o2019-304/',
  'date': '1/23/2019',
  'year': 2019,
  'locations': [],
  'sponsors': ['Committee on Zoning, Landmarks and Building Standards',
   'Chicago City Council'],
  'sponsor_ward': []},
 {'ord': 