In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import csv
import time
import urllib
import re
import ast
import json

In [2]:
# Read the private input data file for cityscape. Its contents are parsed as a
# Python literal; later cells look up inputs['email'] and inputs['password'].
with open('inputs.txt') as inputs_file:
    inputs_text = inputs_file.read()
inputs = ast.literal_eval(inputs_text)

In [3]:
# Chicago Cityscape URLs: the landing page (where the login flow starts) and
# the ordinance search page.
main_page, ord_page = (
    'https://www.chicagocityscape.com',
    'https://www.chicagocityscape.com/ordinances.php',
)

In [4]:
def initialize_driver(main_page, timeout=30):
    """Start a Firefox session, load ``main_page`` and build an explicit wait.

    Args:
        main_page: URL opened as soon as the browser is up.
        timeout: Seconds the returned ``WebDriverWait`` waits before giving
            up. Defaults to 30, the value that used to be hard-coded.

    Returns:
        A ``(driver, wait)`` tuple: the Firefox WebDriver and a
        ``WebDriverWait`` bound to it.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(main_page)
    except Exception:
        # The caller never receives the driver if the first page load fails,
        # so close the browser here instead of leaving it orphaned.
        driver.quit()
        raise
    wait = WebDriverWait(driver, timeout)
    return driver, wait

def login_cityscape(driver, wait, email, password):
    """Sign in to Chicago Cityscape through its Google sign-in option.

    Elements are located with ``driver.find_element(By.<strategy>, value)``;
    the ``find_element_by_*`` shortcuts are not available in current
    Selenium releases.

    Args:
        driver: WebDriver already showing the Cityscape main page.
        wait: ``WebDriverWait`` bound to ``driver``.
        email: Text typed into the Google identifier field.
        password: Text typed into the Google password field.

    Returns:
        The same ``(driver, wait)`` pair, for call chaining.
    """
    # Open the sign-in panel, then choose the Google option.
    driver.find_element(By.PARTIAL_LINK_TEXT, 'Sign in').click()
    time.sleep(2)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "fa-ul")))
    driver.find_element(By.PARTIAL_LINK_TEXT, 'Google').click()
    time.sleep(2)

    # Enter the email address and submit.
    wait.until(EC.presence_of_element_located((By.ID, "headingText")))
    driver.find_element(By.ID, "identifierId").send_keys(email)
    driver.find_element(By.ID, "identifierNext").click()
    time.sleep(2)

    # Enter the password and submit.
    wait.until(EC.presence_of_element_located((By.ID, "password")))
    driver.find_element(By.NAME, "password").send_keys(password)
    driver.find_element(By.ID, "passwordNext").click()
    # Fixed pause kept from the original timing before control returns.
    time.sleep(10)

    return driver, wait

In [5]:
def search_ord(driver, wait, ord_page, ord_num):
    """Search Cityscape for one ordinance and open its "Details" link.

    Elements are located with ``driver.find_element(By.<strategy>, value)``;
    the ``find_element_by_*`` shortcuts are not available in current
    Selenium releases.

    Args:
        driver: Logged-in WebDriver.
        wait: ``WebDriverWait`` bound to ``driver``.
        ord_page: URL of the ordinance search page.
        ord_num: Ordinance number typed into the search box.

    Returns:
        The same ``(driver, wait)`` pair, for call chaining.

    Raises:
        Whatever Selenium raises when an expected element is missing or a
        wait times out (e.g. the search produced no "Details" link).
    """
    # Go to the ordinance search page.
    driver.get(ord_page)

    # Type the ordinance number into the search bar and submit.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "form-inline")))
    driver.find_element(By.ID, 'search_term').send_keys(ord_num)
    # find_element returns the first element with class "btn" -- presumably
    # the search button; verify if the page layout changes.
    driver.find_element(By.CLASS_NAME, 'btn').click()
    time.sleep(2)

    # Follow the Details link of the first result.
    wait.until(EC.presence_of_element_located((By.ID, "table_ordinances_wrapper")))
    driver.find_element(By.PARTIAL_LINK_TEXT, 'Details').click()
    # Fixed pause kept from the original timing before control returns.
    time.sleep(10)
    return driver, wait

def find_ord_id(driver, ord_page, ord_dict):
    """Record the Cityscape detail URL and ordinance id found on the current page.

    Scans every anchor of the page currently loaded in ``driver`` for an href
    containing ``/ordinances.php?ordinance=`` and keeps the last match.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.
        ord_page: Base URL of the ordinance page; the query suffix of the
            matched link is appended to it.
        ord_dict: Ordinance record, updated in place with
            ``'ord_code_page'`` (full detail URL) and ``'ord_code'`` (value
            of the ``ordinance`` query parameter).

    Returns:
        ``(driver, ord_dict)``.

    Raises:
        IndexError: If the page has no ordinance link. This is the exception
            type the earlier implementation produced in that situation, now
            raised with an explanatory message.
    """
    soup = BeautifulSoup(driver.page_source, "html5lib")

    link = ''
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href and '/ordinances.php?ordinance=' in href:
            link = href  # the last matching anchor wins
    if not link:
        raise IndexError(
            'no "/ordinances.php?ordinance=" link found on the current page')

    # Everything after the script name, i.e. "?ordinance=<id>...".
    ord_suffix = link.split('/ordinances.php', 1)[1]
    ord_dict['ord_code_page'] = ord_page + ord_suffix

    # Read the id from the query parameter rather than slicing the last four
    # characters, which is only right for ids exactly four characters long.
    id_match = re.search(r'[?&]ordinance=([^&#]*)', ord_suffix)
    ord_dict['ord_code'] = id_match.group(1)
    return driver, ord_dict

In [6]:
def get_ord_html(driver, ord_dict):
    """Load the ordinance detail page and return it parsed.

    Args:
        driver: WebDriver used to fetch the page.
        ord_dict: Ordinance record; its ``'ord_code_page'`` entry is the URL
            that gets opened.

    Returns:
        A BeautifulSoup tree (html5lib parser) of the loaded page source.
    """
    detail_url = ord_dict['ord_code_page']
    driver.get(detail_url)
    return BeautifulSoup(driver.page_source, "html5lib")

def get_ord_summary(soup, ord_dict):
    """Copy the ordinance summary text from the page into ``ord_dict``.

    Every ``<h3>`` whose first child node equals ``'Ordinance summary '`` is
    considered; the text of the node two siblings after it is stored under
    ``'summary'`` (a later match overwrites an earlier one).

    Args:
        soup: Parsed ordinance detail page.
        ord_dict: Ordinance record, updated in place.

    Returns:
        The same ``ord_dict``.
    """
    summary_headings = (
        heading for heading in soup.find_all('h3')
        if heading.next_element == 'Ordinance summary '
    )
    for heading in summary_headings:
        # Two hops: skip the node right after the heading and take the next.
        summary_node = heading.next_sibling.next_sibling
        ord_dict['summary'] = summary_node.text
    return ord_dict

def get_ord_players(soup, ord_dict):
    """Copy the "more details" term/value pairs of the page into ``ord_dict``.

    For each ``<dt>`` inside a ``<dl class="dl-horizontal">`` the term text
    becomes a key and the text of the node immediately following it becomes
    the value. This includes the key players.

    Args:
        soup: Parsed ordinance detail page.
        ord_dict: Ordinance record, updated in place.

    Returns:
        The same ``ord_dict``.
    """
    terms = (
        term
        for detail_list in soup.find_all('dl', class_='dl-horizontal')
        for term in detail_list.find_all('dt')
    )
    for term in terms:
        value = term.next_sibling.text
        ord_dict[term.text] = value
    return ord_dict

def get_ord_places(soup, ord_dict):
    """Copy the boundary (place) information of the page into ``ord_dict``.

    Looks at every ``<td class="sorting_3">`` inside the
    ``boundaries_printer_wrapper`` div. The cell text (up to the first run of
    two whitespace characters) is the place; the text of the node following
    the cell, cut the same way, is its category.

    * ``'Community Area'``, ``'Neighborhood'`` and ``'ZIP Code'`` are stored
      under the category name as the place split on commas (a list).
    * ``'Ward (Chicago)'`` is stored as ``'Ward'`` (text before the first
      comma) and ``'Ward_Alderman'`` (text after it, leading whitespace
      retained). ``'Ward_Alderman'`` is only set when the cell actually
      contains a comma, so a ward listed without a name no longer aborts the
      whole extraction with an IndexError.

    Args:
        soup: Parsed ordinance detail page.
        ord_dict: Ordinance record, updated in place.

    Returns:
        The same ``ord_dict``.
    """
    place_categories = ('Community Area', 'Neighborhood', 'ZIP Code')
    ward_categories = ('Ward (Chicago)',)
    # Raw string: '\s' in a normal string literal is an invalid escape.
    double_whitespace = re.compile(r'\s\s')

    for wrapper in soup.find_all('div', id='boundaries_printer_wrapper'):
        for cell in wrapper.find_all('td', class_='sorting_3'):
            place = double_whitespace.split(cell.text)[0].split(',')
            category = double_whitespace.split(cell.next_sibling.text)[0]
            if category in place_categories:
                ord_dict[category] = place
            if category in ward_categories:
                ord_dict['Ward'] = place[0]
                if len(place) > 1:
                    ord_dict['Ward_Alderman'] = place[1]
    return ord_dict

def get_all_ord_info(driver, ord_dict):
    """Fetch the ordinance detail page and run every extractor on it.

    Loads the page named by ``ord_dict['ord_code_page']`` and then applies, in
    order, the summary, key-player and place extractors.

    Args:
        driver: WebDriver used to fetch the page.
        ord_dict: Ordinance record to enrich.

    Returns:
        The enriched ordinance record.
    """
    soup = get_ord_html(driver, ord_dict)
    for extract in (get_ord_summary, get_ord_players, get_ord_places):
        ord_dict = extract(soup, ord_dict)
    return ord_dict

In [7]:
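# Debugging aid (disabled): uncomment the lines below to run the whole pipeline on the first entry of ord_dicts.txt only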
# with open('ord_dicts.txt') as f:
#     data = ast.literal_eval(f.read())
# entry = data[0]

# driver, wait = initialize_driver(main_page)
# driver, wait = login_cityscape(driver, wait, inputs['email'], inputs['password'])
# ord_num = entry['ord']
# driver, wait = search_ord(driver, wait, ord_page, ord_num)
# driver, entry = find_ord_id(driver, ord_page, entry)
# entry = get_all_ord_info(driver, entry)
# print(entry)

In [8]:
# Load the data from chicouncilmatic (parsed as a Python literal; the scrape
# loop iterates over it and reads each entry's 'ord' number).
with open('ord_dicts.txt') as ord_dicts_file:
    data = ast.literal_eval(ord_dicts_file.read())

# Accumulators for ordinances with complete info: the records themselves and,
# in parallel, their ordinance numbers.
ord_list_full, ord_nums_full = [], []

In [9]:
# Set x to 1 to append to an existing results file (resume a previous run);
# any other value skips loading and starts from the empty lists.
x=1
if x==1:
    # open data file from cityscape w/ complete info
    with open('ord_info_full.txt') as f:
        saved_text = f.read()
    try:
        # The export cell writes this file with json.dumps, so parse it as
        # JSON: ast.literal_eval rejects JSON's null / true / false.
        ord_full = json.loads(saved_text)
    except ValueError:
        # json.JSONDecodeError is a ValueError; fall back for a file that was
        # saved as a Python literal instead.
        ord_full = ast.literal_eval(saved_text)
    # append each item to ord_list_full, skipping ordinance numbers that are
    # already present so that re-running this cell does not duplicate records
    for item in ord_full:
        if item['ord'] in ord_nums_full:
            continue
        ord_list_full.append(item)
        ord_nums_full.append(item['ord'])
    print(len(ord_nums_full))

# Accumulators for ordinances that could not be completed on cityscape.
ord_list_partial = []
ord_nums_partial = []
# # open data file that contains only partial info (no additional info on cityscape)
#     with open('ord_info_partial.txt') as f:
#         ord_partial = ast.literal_eval(f.read())

#     # append each item to ord_list_partial
#     for item in ord_partial:
#         ord_list_partial.append(item)
#         ord_nums_partial.append(item['ord'])     
#     print(len(ord_nums_partial))

7


In [None]:
# Initialize driver & log in to cityscape
driver, wait = initialize_driver(main_page)
try:
    driver, wait = login_cityscape(driver, wait, inputs['email'], inputs['password'])
    # Iterate through ord numbers
    for entry in data:
        ord_num = entry['ord']

        # skip ords that are already in the full text file, or that were
        # already scraped earlier in this run
        if ord_num in ord_nums_full:
            continue

        try:
            driver, wait = search_ord(driver, wait, ord_page, ord_num)
            driver, entry = find_ord_id(driver, ord_page, entry)
            entry = get_all_ord_info(driver, entry)
        except Exception as err:
            # Best-effort scrape: an ordinance that cannot be completed is
            # kept as partial. Catching Exception (not a bare except) lets a
            # keyboard/kernel interrupt still stop the run, and the message
            # shows why the entry was downgraded.
            print('{}: kept as partial ({!r})'.format(ord_num, err))
            ord_list_partial.append(entry)
            continue

        ord_list_full.append(entry)
        # Remember the number so a duplicate later in data is skipped.
        ord_nums_full.append(ord_num)
finally:
    # Always close the browser, even if the run is interrupted.
    driver.quit()

In [None]:
# Write the complete records to disk as JSON (a list of dicts).
# Serialize before opening the file: opening with 'w' truncates it, so doing
# the json.dumps first means a serialization error cannot wipe the results
# saved by earlier runs.
ord_info_full_json = json.dumps(ord_list_full, indent=4, sort_keys=True, default=str)
with open('ord_info_full.txt', 'w') as fout:
    fout.write(ord_info_full_json)

In [None]:
# Write the partial records to disk as JSON (a list of dicts).
# Serialize before opening the file: opening with 'w' truncates it, so doing
# the json.dumps first means a serialization error cannot wipe a previously
# saved file.
ord_info_partial_json = json.dumps(ord_list_partial, indent=4, sort_keys=True, default=str)
with open('ord_info_partial.txt', 'w') as fout:
    fout.write(ord_info_partial_json)