In [1]:
from datetime import datetime as dt
from io import StringIO
from pathlib import Path

import pandas as pd
from selenium import webdriver
# Run date as a YYYY-MM-DD string.
today = f"{dt.today():%Y-%m-%d}"

# Path constants kept for reference only; they rely on `__file__`, which is
# typically not defined inside a notebook kernel, so they stay commented out.
# THIS_FILE = Path(__file__)
# THIS_DIR = THIS_FILE.parent
# CHROMEDRIVER_PATH = THIS_DIR.joinpath("./../../notebooks/chromedriver")
# INTERIM_DATA = THIS_DIR.joinpath("./../../data/interim/")
# PROCESSED_DATA = THIS_DIR.joinpath("./../../data/processed/")

# Root of the site being scraped; department URLs are appended to it.
BASE_URL = "http://force.nj.com"

In [2]:
# Start Chrome through the chromedriver binary located at ./chromedriver.
browser = webdriver.Chrome("./chromedriver")
# Build the list of police departments from the options of the site's dropdown menu
browser.get(BASE_URL)

# The menu is the element with id "mylist"; its <option> children are collected here.
police_depts_list = browser.find_element_by_id("mylist")
police_depts = police_depts_list.find_elements_by_tag_name("option")

In [3]:
# One record per dropdown <option>: its visible text and its `value` attribute,
# which is later appended to BASE_URL to reach the department page.
depts_list = []
for dept in police_depts:
    dept_info = {
        'name': dept.get_attribute("innerHTML").strip(),
        'relative_url': dept.get_attribute("value"),
    }
    depts_list.append(dept_info)

In [4]:
def get_likelihood_numbers(div_name = 'left'):
    """Read the 'likelihood of use of force' figures from the page loaded in `browser`.

    The figures sit in two divs ('left' and 'right') inside the element with
    class 'racial_breakdown'.

    Parameters
    ----------
    div_name : str, optional
        Class name of the div to read, either 'left' or 'right', by default 'left'

    Returns
    -------
    tuple of str
        `(based_on, likelihood, more_or_less)`, for example
        `('population', '294%', 'more likely')`. `based_on` is the text of the
        first 'important_num2' element, `likelihood` the text of the
        'important_num1' element (both with commas removed), and `more_or_less`
        the text of the last 'important_num2' element.
    """
    def _inner_html(element):
        # Visible markup of an element, without surrounding whitespace.
        return element.get_property("innerHTML").strip()

    container = browser.find_element_by_class_name("racial_breakdown").find_element_by_class_name(div_name)

    based_on = _inner_html(container.find_element_by_class_name("important_num2")).replace(",", "")
    likelihood = _inner_html(container.find_element_by_class_name("important_num1")).replace(",", "")

    # The qualifier ("more likely" in the sample output) is the last 'important_num2' in the div.
    qualifiers = container.find_elements_by_class_name("important_num2")
    more_or_less = _inner_html(qualifiers[-1])

    return (based_on, likelihood, more_or_less)

In [5]:
def get_flagging_officer_numbers():
    """Retrieve the 'number of officers that would be flagged in other agencies' figures.

    Reads the 'important_num_red2' elements inside the 'earlywarning' element of
    the page currently loaded in `browser`.

    Returns
    -------
    tuple of str
        A six-part tuple `(first_city, first_city_n, second_city, second_city_n,
        third_city, third_city_n)` alternating city name and number. City names
        have possessive "'s" and any remaining apostrophes removed.

    Raises
    ------
    ValueError
        If the page does not contain exactly six 'important_num_red2' elements.
        Previously this case failed with an UnboundLocalError on the return line.
    """
    early_warning = browser.find_element_by_class_name("earlywarning")
    elements = early_warning.find_elements_by_class_name("important_num_red2")
    if len(elements) != 6:
        raise ValueError(
            f"expected 6 'important_num_red2' elements inside 'earlywarning', found {len(elements)}"
        )

    texts = [element.get_property("innerHTML").strip() for element in elements]
    # Even positions hold the city names, odd positions the matching numbers.
    cities = [text.replace("'s", "").replace("'", "") for text in texts[0::2]]
    numbers = texts[1::2]

    return (cities[0], numbers[0], cities[1], numbers[1], cities[2], numbers[2])

In [6]:
# Load the first department's page so the helper functions can be tried against it.
browser.get(BASE_URL + depts_list[0]['relative_url'])

In [7]:
get_likelihood_numbers()

('population', '294%', 'more likely')

In [8]:
def get_rate_of_force():
    """Retrieve the rate-of-force sentence from the page loaded in `browser`.

    Returns
    -------
    str
        `innerHTML` of the 'third_label' element inside the element with id
        'rank_five_years', with runs of whitespace collapsed to single spaces
        and the 'important_num2' div tags stripped, for example
        'This department uses force at higher rate than 251 police departments'.
    """
    label = browser.find_element_by_id("rank_five_years").find_element_by_class_name('third_label')
    raw_html = label.get_property('innerHTML')

    # Collapse whitespace first so the tag patterns below match regardless of line breaks.
    sentence = " ".join(raw_html.split())
    for markup in ("<div class=\"important_num2\">", "</div>"):
        sentence = sentence.replace(markup, "")
    return sentence

In [9]:
get_rate_of_force()

'This department uses force at higher rate than 251 police departments'

In [10]:
def get_pd_info():
    """Retrieve the headline details of the department page loaded in `browser`.

    Returns
    -------
    tuple of str
        `(h1, second_label, patrol_area)`: the unmodified `innerHTML` of the
        'biggest_hed' element, of the 'second_label' element, and of the
        'town_label' element inside the 'left' div of 'town_description', all
        looked up within the 'pd_info' element. For example
        `('Aberdeen', 'Monmouth County', 'Patrol area: Aberdeen')`.
    """
    def _html_of(element):
        return element.get_property("innerHTML")

    info_section = browser.find_element_by_class_name("pd_info")

    h1 = _html_of(info_section.find_element_by_class_name("biggest_hed"))
    second_label = _html_of(info_section.find_element_by_class_name("second_label"))

    left_column = info_section.find_element_by_class_name("town_description").find_element_by_class_name("left")
    patrol_area = left_column.find_element_by_class_name("town_label").get_property('innerHTML')

    return (h1, second_label, patrol_area)

In [11]:
get_pd_info()

('Aberdeen', 'Monmouth County', 'Patrol area: Aberdeen')

In [30]:
def get_incidents_table():
    """Retrieve the incidents table shown by default on the department page loaded in `browser`.

    Returns
    -------
    table : pd.DataFrame
       The first table parsed from the `outerHTML` of the element with id
       'incidents_table', without the columns whose label contains 'Unnamed'.
    """
    incidents_html = browser.find_element_by_id("incidents_table").get_property("outerHTML")
    # Passing literal HTML to read_html is deprecated since pandas 2.1; a
    # file-like wrapper is accepted by old and new versions alike.
    table = pd.read_html(StringIO(incidents_html))[0]
    # "voi" = variables of interest: drop the unlabeled columns, which pandas names 'Unnamed: N'.
    voi = [col for col in table if 'Unnamed' not in col]
    return table[voi]

In [31]:
get_incidents_table()

Unnamed: 0,Officer name,Date,Time,Type of force,Reason,Subject age,Subject sex,Subject race,Officer race
0,Michael Lasko,12/6/16,22:02,compliance hold,"resisted police officer control, physical thre...",55,male,white,white
1,Michael Plant,10/21/16,16:05,"compliance hold, hands/fists","resisted police officer control, physical thre...",38,female,white,white
2,Michael Rzigalinski,10/6/16,11:01,compliance hold,"resisted police officer control, physical thre...",43,male,white,white
3,Gus Grivas,10/6/16,11:00,compliance hold,"resisted police officer control, physical thre...",43,male,white,white
4,Raymond Campbell,10/3/16,18:37,compliance hold,resisted police officer control,18,male,black,white
5,John Young,10/3/16,18:35,compliance hold,resisted police officer control,18,male,black,white
6,David Mauro,10/3/16,18:38,compliance hold,resisted police officer control,18,male,black,white
7,Gus Grivas,7/20/16,11:23,compliance hold,resisted police officer control,58,female,white,white
8,Henry Chevalier,7/20/16,11:16,compliance hold,resisted police officer control,58,female,white,white
9,Christopher Desarno,6/17/16,12:08,"compliance hold, hands/fists","resisted police officer control, physical thre...",57,female,white,white


In [28]:
# Unfiltered parse of the incidents table, for comparison with get_incidents_table().
# `table` was not defined anywhere in the notebook, so this cell failed on a fresh kernel.
# NOTE(review): presumably it held the table element's outerHTML - confirm.
table = browser.find_element_by_id("incidents_table").get_property("outerHTML")
df = pd.read_html(StringIO(table))[0]

In [29]:
df

Unnamed: 0.1,Unnamed: 0,Officer name,Date,Time,Type of force,Reason,Subject age,Subject sex,Subject race,Officer race
0,,Michael Lasko,12/6/16,22:02,compliance hold,"resisted police officer control, physical thre...",55,male,white,white
1,,Michael Plant,10/21/16,16:05,"compliance hold, hands/fists","resisted police officer control, physical thre...",38,female,white,white
2,,Michael Rzigalinski,10/6/16,11:01,compliance hold,"resisted police officer control, physical thre...",43,male,white,white
3,,Gus Grivas,10/6/16,11:00,compliance hold,"resisted police officer control, physical thre...",43,male,white,white
4,,Raymond Campbell,10/3/16,18:37,compliance hold,resisted police officer control,18,male,black,white
5,,John Young,10/3/16,18:35,compliance hold,resisted police officer control,18,male,black,white
6,,David Mauro,10/3/16,18:38,compliance hold,resisted police officer control,18,male,black,white
7,,Gus Grivas,7/20/16,11:23,compliance hold,resisted police officer control,58,female,white,white
8,,Henry Chevalier,7/20/16,11:16,compliance hold,resisted police officer control,58,female,white,white
9,,Christopher Desarno,6/17/16,12:08,"compliance hold, hands/fists","resisted police officer control, physical thre...",57,female,white,white


In [9]:
# Scratch lookup: the 'town_label' inside the 'right' div of 'town_description'
# (the output below shows the average number of full-time officers).
pd_info = browser.find_element_by_class_name("pd_info")
town_info = pd_info.find_element_by_class_name("town_description")
town_info.find_element_by_class_name("right").find_element_by_class_name("town_label").get_property("innerHTML")

'Average number of full-time officers: 34'

In [6]:
# Scrape the first 15 departments; each record in depts_list is enriched in place.
for dept in depts_list[:15]:
    # go to page
    browser.get(BASE_URL + dept['relative_url'])

    print(f"Getting info for {dept['name']}")

    # get likelihood numbers: the 'left' div is by population, the 'right' div by arrests
    for div_name, fallback_key in (('left', 'population'), ('right', 'arrests')):
        try:
            # get_likelihood_numbers returns three values. Unpacking only two raised a
            # ValueError that the old bare `except` swallowed, marking every department
            # 'Not found'. The qualifier (third value) is not stored.
            (prop, number, _more_or_less) = get_likelihood_numbers(div_name)
            dept[prop] = number
        except Exception:
            # Best effort: keep going when the page lacks this section.
            dept[fallback_key] = 'Not found'

    # get no of officers that would be flagged in other cities
    try:
        (first_city, first_city_n, second_city, second_city_n, third_city, third_city_n) = get_flagging_officer_numbers()
        dept[first_city] = first_city_n
        dept[second_city] = second_city_n
        dept[third_city] = third_city_n
    except Exception:
        dept['Los Angeles'] = 'Not found'
        dept['New York City'] = 'Not found'
        dept['Chicago'] = 'Not found'

browser.close()

Getting info for Aberdeen, Monmouth
Getting info for Absecon, Atlantic
Getting info for Allendale, Bergen
Getting info for Allenhurst, Monmouth
Getting info for Allentown, Monmouth
Getting info for Alpine, Bergen
Getting info for Andover Township, Sussex
Getting info for Asbury Park, Monmouth
Getting info for Atlantic City, Atlantic
Getting info for Atlantic Highlands, Monmouth
Getting info for Audubon, Camden
Getting info for Avalon, Cape May
Getting info for Avon-By-the-Sea, Monmouth
Getting info for Barnegat, Ocean
Getting info for Barrington, Camden


In [7]:
# One row per department record; `full_url` is BASE_URL joined with the relative URL.
data = pd.DataFrame(depts_list).assign(
    full_url=lambda frame: BASE_URL + frame['relative_url']
)

In [8]:
data

Unnamed: 0,name,relative_url,population,arrests,Los Angeles,New York City,Chicago,full_url
0,"Aberdeen, Monmouth",/database/pd-dept/aberdeen-monmouth,294%,42%,Not found,Not found,Not found,http://force.nj.com/database/pd-dept/aberdeen-...
1,"Absecon, Atlantic",/database/pd-dept/absecon-atlantic,583%,68%,0,9,13,http://force.nj.com/database/pd-dept/absecon-a...
2,"Allendale, Bergen",/database/pd-dept/allendale-bergen,,8%,0,2,3,http://force.nj.com/database/pd-dept/allendale...
3,"Allenhurst, Monmouth",/database/pd-dept/allenhurst-monmouth,,,0,4,4,http://force.nj.com/database/pd-dept/allenhurs...
4,"Allentown, Monmouth",/database/pd-dept/allentown-monmouth,,,0,1,1,http://force.nj.com/database/pd-dept/allentown...
...,...,...,...,...,...,...,...,...
542,"Knowlton Township, Warren",/database/pd-dept/knowlton-township-warren,,,,,,http://force.nj.com/database/pd-dept/knowlton-...
543,"Liberty Township, Warren",/database/pd-dept/liberty-township-warren,,,,,,http://force.nj.com/database/pd-dept/liberty-t...
544,"Oxford, Warren",/database/pd-dept/oxford-warren,,,,,,http://force.nj.com/database/pd-dept/oxford-wa...
545,"White, Warren",/database/pd-dept/white-warren,,,,,,http://force.nj.com/database/pd-dept/white-warren
