## Web Scraping Tutorial

In this tutorial, we will learn how to scrape data from an HTML page. Note that websites have different policies regarding scraping, so check a site's terms of use before scraping it.

We will be scraping data that is provided by the Stanford Open Policing Project. They have compiled and released traffic stop data from almost all states in the U.S.

https://openpolicing.stanford.edu/

The data itself can be found here:

https://openpolicing.stanford.edu/data/

In [None]:
from lxml import html, etree
import requests
import wget
import pdb
import argparse
import time
import os 

In [None]:
# URL of the Stanford Open Policing Project data listing page
OPP_URL = "https://openpolicing.stanford.edu/data/"
# local file the listing page's HTML is saved to and later parsed from
OPP_HTML = "opp_root.html"
# directory prefix under which one sub-directory per state is created
OPP_DATA_ROOT = "data/"
# text file that receives one "<state>_<city>" line per city recorded by the parsing step
OPP_DL_CITIES = "opp_downloaded_cities.txt"
# when True, fetch OPP_URL and overwrite OPP_HTML; set to False to reuse the saved copy
INITIAL_URL = True

In [None]:
# Fetch the listing page and cache it on disk so the parsing step can work
# from the local copy instead of requesting the site again.
if INITIAL_URL:
    # Request first and open the file only afterwards: opening with "w"
    # truncates the file, so a failed request would otherwise wipe out a
    # previously saved copy. The timeout keeps the cell from hanging forever.
    data = requests.get(OPP_URL, timeout=30)
    # fail loudly on a 4xx/5xx response instead of saving an error page
    data.raise_for_status()
    with open(OPP_HTML, "w") as f:
        f.write(data.text)

In [None]:
# Parse the saved listing page, and for every city whose dataset contains all
# of the attributes we need, download its CSV into a per-state directory and
# record "<state>_<city>" in OPP_DL_CITIES.


def _first(nodes):
    """Return the first item of an xpath result list, or None if it is empty."""
    return nodes[0] if nodes else None


# create a html parser instance
parser = etree.HTMLParser()

# attributes that must be present in a dataset for us to download it
# (we want driver age, citation issued, warning issued, arrest made)
check_attribs = ["Driver age", "Citation issued", "Warning issued", "Arrest made"]

# open the file to start parsing
with open(OPP_HTML, "r") as f, open(OPP_DL_CITIES, "w") as dl_city_list:
    tree = etree.parse(f, parser)

    # each <tbody> holds one state: its first row is the state title,
    # the remaining rows are the cities of that state
    for state in tree.xpath(".//tbody"):
        state_elem = state.xpath(".//tr")
        if not state_elem:
            continue

        # the state name is the text of the first cell of the first row
        state_title = _first(state_elem[0].xpath(".//td"))
        if state_title is None or state_title.text is None:
            print("skipping a table section without a state name")
            continue
        state_name = state_title.text
        print("state name ", state_name)
        state_dir = os.path.join(OPP_DATA_ROOT, state_name)

        # skip the first row, since it's the state name and not a city
        for city in state_elem[1:]:
            # find the name of the city first
            name_cell = _first(city.xpath(".//td[contains(@data-title, 'State')]"))
            name_span = _first(name_cell.xpath(".//span")) if name_cell is not None else None
            city_name = name_span.text if name_span is not None else None

            # find the download link for this file
            dl_cell = _first(city.xpath(".//td[contains(@data-title, 'Download')]"))
            city_csv_url = None
            if dl_cell is not None:
                city_csv_url = _first(
                    dl_cell.xpath(".//a[contains(@title,'Download data as CSV')]/@href")
                )

            # a row without a name or a CSV link cannot be downloaded or
            # recorded, so skip it instead of failing on an empty xpath result
            if city_name is None or city_csv_url is None:
                print("skipping a row without a city name or CSV link")
                continue
            print(city_name)
            print(city_csv_url)

            # we need to check for the existence of certain attributes;
            # assume we want to download this file unless told otherwise
            to_download = True
            for attrib in check_attribs:
                print(attrib)
                attrib_cell = _first(city.xpath(".//td[contains(@data-title, '%s')]" % (attrib)))
                # a cell with descendant elements indicates the attribute is
                # present in the data; a missing or childless cell means it is not
                if attrib_cell is None or not attrib_cell.xpath(".//descendant::*"):
                    to_download = False

            if not to_download:
                continue

            # make directory for state
            os.makedirs(state_dir, exist_ok=True)

            # try to download the file; only record the city when it worked.
            # Exception (not a bare except) so Ctrl-C still stops the loop.
            try:
                wget.download(city_csv_url, state_dir)
            except Exception as err:
                print("couldn't download the file ", city_csv_url, err)
            else:
                dl_city_list.write(state_name + "_" + city_name + "\n")

            # pause between requests, successful or not, to be polite to the server
            time.sleep(10)