# Web scraping/indexing example Python code
# Adapted from https://first-web-scraper.readthedocs.io/en/latest/

# This notebook includes additional code "recipes" for three alternatives: loading a page via a headless browser (may be necessary if the page runs a lot of JavaScript when rendering its content), loading a page via the older 'urllib' module, and setting the User-Agent string so that a request made with the basic 'requests' module looks more like a browser to the target site.

In [None]:
# Standard setup -- run this cell first; the page-loading and parsing cells rely on it.
# csv writes the output file and BeautifulSoup parses the fetched HTML in the parsing cell.
# NOTE(review): `time` is not used by any cell visible in this notebook -- confirm before removing.
import csv
from BeautifulSoup import BeautifulSoup
import time
# Address of the page to scrape; every page-loading cell below fetches this same URL.
url = 'http://www.showmeboone.com/sheriff/JailResidents/JailResidents.asp'

In [22]:
### Alternate module imports for headless browsing (drives Chrome through Selenium)
#### Note that first you *must* download and install Chromedriver from https://sites.google.com/a/chromium.org/chromedriver/downloads
#### Then do 'pip install selenium'
import os
from selenium import webdriver
# Path to the Chromedriver executable -- adjust this to match where you installed it.
chromedriver = "/usr/bin/chromedriver"
# The same path is also recorded in the process environment before the driver is started.
os.environ["webdriver.chrome.driver"] = chromedriver
# Start the Chrome session; `browser` is reused by the page-loading cell and the final quit cell.
# NOTE(review): no headless option is passed here, so this presumably opens a visible
# Chrome window rather than a truly headless one -- confirm.
browser = webdriver.Chrome(chromedriver)

In [36]:
### Alternative module import for using the older urllib Python module
import urllib

In [26]:
### Load the page via the standard HTTP requests module
### Result: `html` holds the raw bytes of the page body for the parsing cell.
import requests

# Pretend to the site that you're loading the page via a regular browser
# (some sites won't serve a page if they think it's being loaded by a script)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}

# Always pass a timeout: without one, requests will wait indefinitely on a
# server that accepts the connection but never answers.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx answer (raises requests.HTTPError) instead of handing
# an error page to the parser, where the failure would be much harder to read.
response.raise_for_status()
html = response.content

In [23]:
### Load the page via a headless browser
### Requires the `browser` object created in the Selenium setup cell.
# Navigate the browser to the target page.
browser.get(url)
# Take the page source from the browser and store it in `html`,
# the variable the parsing cell reads.
html = browser.page_source
# No parsing happens here: the "Parse the page via BeautifulSoup" cell builds the soup from `html`.

In [41]:
### Load the page via urllib (this is an older module, some people use it instead of 'requests')
### Result: `html` holds the raw page body for the parsing cell.

# urllib.urlopen() cannot send custom headers: its third positional argument is
# `proxies`, so passing the headers dict there silently drops the User-Agent.
# Headers have to be attached to a Request object instead. The import below
# picks the right module on either Python 2 or Python 3.
try:
    from urllib2 import Request, urlopen          # Python 2
except ImportError:
    from urllib.request import Request, urlopen   # Python 3

# No request body: with data=None the request is sent as a GET.
reqData = None
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}

response = urlopen(Request(url, reqData, headers))
html = response.read()

In [42]:
# Parse the page via BeautifulSoup and save the table rows to ./inmates.csv
# Input: `html` from whichever page-loading cell was run.
# Output: `list_of_rows` (one list of cell strings per table row) and the CSV file.

soup = BeautifulSoup(html)
table = soup.find('tbody', attrs={'class': 'stripe'})
if table is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on None a few lines further down.
    raise ValueError(
        "No <tbody class='stripe'> found in the page; the download may have "
        "failed or the site layout may have changed"
    )

list_of_rows = []
# The first <tr> is skipped -- presumably a header row; verify against the page.
for row in table.findAll('tr')[1:]:
    list_of_cells = []
    for cell in row.findAll('td'):
        # Drop non-breaking-space entities used as padding in the cells.
        text = cell.text.replace('&nbsp;', '')
        # Echo every cell (including "Details") so progress is visible in the notebook.
        print(text)
        # Each row carries a "Details" link cell that is not part of the data.
        if text != "Details":
            list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

# The with-block guarantees the file is flushed and closed even if a write
# fails; previously the handle was left open, so the CSV could be incomplete
# on disk. "wb" is the mode the Python 2 csv module expects
# (on Python 3 this would be open(..., "w", newline="")).
with open("./inmates.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Last", "First", "Middle", "Gender", "Race", "Age", "City", "State"])
    writer.writerows(list_of_rows)

Details
ADAMS
STACIE
LEE
F
W
41
COLUMBIA
MO
Details
ALDRIDGE
JAMES
ELBERT III
M
B
28
BOONVILLE
MO
Details
ALEXANDER
BENJAMIN
FRANKLIN
M
B
21
COLUMBIA
MO
Details
ALLEN
LITICIA
MONIQUE
F
B
23
COLUMBIA
MO
Details
ANDERSON
ANDRE
LAMONTE
M
B
29
ST. LOUIS
MO
Details
ANDREWS
JOSEPH
DAMON
M
W
38
COLUMBIA
MO
Details
ARMOUR
AARON
JASON
M
W
30
COLUMBIA
MO
Details
AUBREY
ASHLEY
SUZANNE
F
B
20
PILOT GROVE
MO
Details
BADOLATO
AARON
LEE
M
W
28
CENTRALIA
MO
Details
BANKS
JOHNATHAN
NICOLE
M
B
23
COLUMBIA
MO
Details
BARNEY
MAXRILE
MARKEE
M
B
29
COLUMBIA
MO
Details
BELL
CHRISTOPHER
LEE
M
B
35
COLUMBIA
MO
Details
BENNETT
DEVANTE
JACJUAN
M
B
23
COLUMBIA
MO
Details
BETTS
LAWRENCE
SHERRON
M
B
42
COLUMBIA
MO
Details
BONAPARTE
NATHANIEL
LEROY
M
B
37
COLUMBIA
MO
Details
BONNER
KEVIN
LEVON
M
B
28
COLUMBIA
MO
Details
BOOTH
SHANE
CODY
M
W
28
COLUMBIA
MO
Details
BOWEN
DREW
ALLEN
M
W
20
COLUMBIA
MO
Details
BOWEN
DYLAN
MATTHEW EARL
M
W
25
COLUMBIA
MO
Details
BROUSSARD
DAVID
MICHAEL
M
W
46
COLUMBIA
MO
Details
CAIN
DAV

In [25]:
# If you opened a headless browser, close it.
# Skip this cell if the Selenium setup cell was never run: `browser` is only defined there.
browser.quit()