> Notes:
> + [Greg Reda: Web Scraping 101 with Python](http://www.gregreda.com/2013/03/03/web-scraping-101-with-python/)
> + [Greg Reda: More web scraping with Python (and a map)](http://www.gregreda.com/2013/04/29/more-web-scraping-with-python/)

In [3]:
# A few scraping rules
# ====================

# (1) You should check a site's terms and conditions before you scrape it. 
# It's their data and they likely have some rules to govern it.

# (2) Be nice - A computer will send web requests much quicker than a user can. 
# Make sure you space out your requests a bit so that you don't hammer the site's server.

# (3) Scrapers break - Sites change their layout all the time. 
# If that happens, be prepared to rewrite your code.

# (4) Web pages are inconsistent - Some manual cleanup is sometimes still needed 
# even after you've gotten your data.

> Data:
> + Food and Drink section of [Chicago Reader's Best of 2011](http://www.chicagoreader.com/chicago/best-of-chicago-2011/BestOf?oid=4100483) ('Inspect Element' each link)

In [4]:
from bs4 import BeautifulSoup
import requests # using requests instead of urllib2

BASE_URL = "http://www.chicagoreader.com"

# DRY (don't repeat yourself): fetching a URL and parsing it into soup lives in one helper
def make_soup(url, timeout=30):
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the scraper indefinitely.

    Returns:
        A BeautifulSoup object built from the response text with the
        "lxml" parser.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return BeautifulSoup(res.text, "lxml")

# getting the category links
def get_category_links(section_url):
    """Return the absolute URL of every category linked from a section page.

    The links are read from the ``<dd>`` entries of the ``<dl class="boccat">``
    list on the page at *section_url*.

    Args:
        section_url: URL of the section page to scrape.

    Returns:
        A list of absolute category URLs, in page order. ``<dd>`` entries
        without a link are skipped.

    Raises:
        ValueError: If the page has no ``dl.boccat`` list (for example
            because the site layout changed).
    """
    # Local import keeps this definition self-contained within the cell.
    from urllib.parse import urljoin

    soup = make_soup(section_url)
    boccat = soup.find("dl", "boccat")
    if boccat is None:
        raise ValueError(
            "no <dl class='boccat'> found at {}".format(section_url))

    # urljoin copes with relative, slash-less and already-absolute hrefs,
    # where plain string concatenation would build a broken URL.
    return [
        urljoin(BASE_URL, dd.a["href"])
        for dd in boccat.find_all("dd")
        if dd.a is not None and dd.a.has_attr("href")
    ]

# getting the category, winner, and runners-up
def get_category_winner(category_url):
    """Scrape one category page for its title, winner(s) and runners-up.

    Each scraped value is also printed as a progress trace.

    Args:
        category_url: URL of the category page to scrape.

    Returns:
        A dict with the keys ``"category"``, ``"category_url"``,
        ``"winner"`` and ``"runners_up"``. ``"winner"`` and
        ``"runners_up"`` are lists because a page may carry several
        ``h2.boc1`` / ``h2.boc2`` headings. Strings are returned exactly
        as found in the page; no whitespace is stripped.
    """
    soup = make_soup(category_url)

    headline = soup.find("h1", "headline")
    # A missing headline yields None instead of an AttributeError, so one
    # odd page does not abort the whole scraping run.
    category = headline.string if headline is not None else None
    print('category: {}'.format(category))

    winner = [h2.string for h2 in soup.find_all("h2", "boc1")]
    print('winner: {}'.format(winner))

    runners_up = [h2.string for h2 in soup.find_all("h2", "boc2")]
    print('runners_up: {}'.format(runners_up))

    print('category_url: {}'.format(category_url))
    return {"category": category,
            "category_url": category_url,
            "winner": winner,
            "runners_up": runners_up}

In [5]:
from time import sleep
import pprint # pretty printer

# Section page that lists every Food & Drink category.
food_n_drink = "http://www.chicagoreader.com/chicago/best-of-chicago-2011-food-drink/BestOf?oid=4106228"

categories = get_category_links(food_n_drink)

data = []  # one result dict per scraped category
# Only the first five categories are scraped here.
for idx, category in enumerate(categories[:5]):
    print('idx: {}'.format(idx))
    winner = get_category_winner(category)
    data.append(winner)
    # sleep() takes seconds, so the message reports seconds as well.
    print('sleeping for 1s...')
    sleep(1)  # be nice

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(data)


idx: 0
category: Best restaurant that's been around forever and is still worth the trip 
winner: ['Lula Cafe']
runners_up: ['Frontera Grill', 'Chicago Diner  ', 'Sabatino’s', 'Twin Anchors']
category_url: http://www.chicagoreader.com/chicago/BestOf?category=1979894&year=2011
sleeping for 1ms...
idx: 1
category: Best fancy restaurant in Chicago 
winner: ['Alinea ']
runners_up: ['Blackbird', 'Girl & the Goat', 'Green Zebra', 'The Publican']
category_url: http://www.chicagoreader.com/chicago/best-fancy-restaurant-in-chicago/BestOf?oid=4088017
sleeping for 1ms...
idx: 2
category: Best bang for your buck 
winner: ['Big Star', 'Sultan’s Market']
runners_up: ['Frasca Pizzeria & Wine Bar', 'Chutney Joe’s', '"My boyfriend!"']
category_url: http://www.chicagoreader.com/chicago/best-bang-for-your-buck/BestOf?oid=4088018
sleeping for 1ms...
idx: 3
category: Best chef 
winner: ['Rick Bayless (Frontera Grill, Topolobampo, Xoco)']
runners_up: ['Grant Achatz (Alinea, Next, The Aviary)', 'Stephanie Iza

> Data:
> + [Chicago Magazine's Best Sandwiches list](http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/)

In [13]:
# Apart from BeautifulSoup, python scraping libraries worth looking into:
# (1) Scrapy
# (2) PyQuery

# objective: get data and write to a csv file

from bs4 import BeautifulSoup
import requests
import csv

# Index page that links to each sandwich's detail page.
base_url = "http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"

soup = make_soup(base_url)
sammies = soup.find_all("div", "sammy")
# One tuple per sandwich: (detail link, rank, sandwich name, restaurant name).
sammy_urls = [(div.a["href"], div.div.text, div.a.b.text, div.a.text.split('\n')[1].strip()) for div in sammies]

# newline="" lets the csv module control line endings; the explicit encoding
# keeps non-ASCII characters writable regardless of the platform default.
with open("src-best-sandwiches.tsv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ("rank", "sandwich", "restaurant", "description", "price",
                    "address", "phone", "website")
    output = csv.writer(f, delimiter="\t")
    output.writerow(fieldnames)

    for (url, rank, sandwich, restaurant) in sammy_urls:
        # Links are inconsistent (some absolute, some relative): normalise
        # to a site-relative path, then rebuild the absolute URL.
        url = url.replace("http://www.chicagomag.com", "")
        page = make_soup("http://www.chicagomag.com{0}".format(url))

        description = page.find('div','fb-like fb-like-top').find_next().text.strip()

        # The <em> inside <p class="addy"> is split on commas below; its
        # first piece is then split at the first space into price and
        # street address, and the second piece is taken as the phone number.
        addy_em = page.find('p', 'addy').em
        addy = addy_em.text.split(',')

        price, _, address = addy[0].partition(' ')
        price = price.strip()
        # Drop only a trailing period so a price such as "$9.50" keeps its
        # decimal point.
        if price.endswith('.'):
            price = price[:-1]
        address = address.strip()

        # Some entries have no comma-separated phone part.
        phone = addy[1].strip() if len(addy) > 1 else ''

        # The website link is optional.
        website = addy_em.a['href'] if addy_em.a is not None else ''

        output.writerow([rank, sandwich, restaurant, description, price,
                        address, phone, website])

print("Done writing file")

Done writing file


In [None]:
# While the scraper did a good job of getting all of the sandwiches and restaurants, a couple of restaurants 
# had "multiple locations" listed as their address. If we need this data, we'll have to find another 
# way to get it (like checking each restaurant's website and manually adding their locations to our dataset). 
# We'll also need to manually fix some oddities that wound up in our data due to some inconsistent HTML on the 
# other end (addresses and URLs winding up in the phone numbers column).

# best-sandwiches.tsv with manually corrected "multiple locations" restaurants is available at GitHub repo
# https://github.com/gjreda/best-sandwiches.

# Google Maps API (https://developers.google.com/maps/) can be used to geocode addresses to a set of lat/long points.

In [17]:
# geocoding

import requests
import csv
import json
from time import sleep

def geocode(address, timeout=30):
    """Look up *address* with the Google Maps geocoding web service.

    Args:
        address: Free-form address string to geocode.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The decoded JSON response as a dict. Callers in this file read its
        ``"status"`` and ``"results"`` entries.
    """
    # NOTE(review): current Google documentation appears to require HTTPS
    # and an API key for this service - confirm before relying on it.
    url = "http://maps.googleapis.com/maps/api/geocode/json"
    # Let requests build the query string so that every special character
    # in the address (not just spaces) is properly URL-encoded.
    query = {"sensor": "false", "address": address}
    response = requests.get(url, params=query, timeout=timeout)
    return json.loads(response.text)

# Read the manually corrected sandwich list and write a copy that carries
# the geocoder's formatted address and coordinates for every row.
# newline="" is what the csv module expects for both reading and writing;
# the explicit encoding avoids depending on the platform default.
with open("best-sandwiches.tsv", "r", newline="", encoding="utf-8") as f, \
        open("best-sandwiches-geocode.tsv", "w", newline="", encoding="utf-8") as w:
    reader = csv.DictReader(f, delimiter="\t")

    fields = ["rank", "sandwich", "restaurant", "description", "price",
             "address", "city", "phone", "website", "full_address",
             "formatted_address", "lat", "lng"]
    writer = csv.DictWriter(w, fieldnames=fields, delimiter="\t")
    writer.writeheader()

    for line in reader:
        print("Geocoding: {0}".format(line["full_address"]))
        response = geocode(line["full_address"])
        results = response.get("results") or []

        if response.get("status") == "OK" and results:
            # Only the first match is used.
            best = results[0]
            location = best["geometry"]["location"]
            line["formatted_address"] = best["formatted_address"]
            line["lat"] = location["lat"]
            line["lng"] = location["lng"]
        else:
            # Keep the row but leave the geocoded columns empty.
            line["formatted_address"] = ""
            line["lat"] = ""
            line["lng"] = ""

        writer.writerow(line)
        sleep(1)  # space out requests to the geocoding service

print("Done writing file")

Geocoding: 2109 W. Chicago Ave., Chicago
Geocoding: 800 W. Randolph St., Chicago
Geocoding: 445 N. Clark St., Chicago
Geocoding: 914 Noyes St., Evanston
Geocoding: 825 W. Fulton Mkt., Chicago
Geocoding: 100 E. Walton St., Chicago
Geocoding: 1639 S. Wabash Ave., Chicago
Geocoding: 2211 W. North Ave., Chicago
Geocoding: 3619 W. North Ave., Chicago
Geocoding: 3267 S. Halsted St., Chicago
Geocoding: 2537 N. Kedzie Blvd., Chicago
Geocoding: 252 W. 26th Street, Chicago
Geocoding: 271 N. Weber Road, Bolingbrook
Geocoding: 5160 S. Pulaski Road, Chicago
Geocoding: 9135 W. 159th Street, Orland Hills
Geocoding: 3124 N. Broadway, Chicago
Geocoding: 3455 N. Southport Ave., Chicago
Geocoding: 2657 N. Kedzie Ave., Chicago
Geocoding: 1120 W. Grand Ave., Chicago
Geocoding: 1141 S. Jefferson St., Chicago
Geocoding: 333 E. Benton Pl., Chicago
Geocoding: 1411 N. Wells St., Chicago
Geocoding: 1747 N. Damen Ave., Chicago
Geocoding: 3209 W. Irving Park Rd., Chicago
Geocoding: 1625 N Halsted St., Chicago
Geoc