# Cleaning up the code

Steps:
1. Compile list of possible police-dept options from dropdown menu search box
2. From each one retrieve:
  - Likelihood of force (black vs white) based on population
  - Likelihood of force (black vs white) based on arrests
  - No. of police officers that would be flagged under LAPD standards
  - No. of police officers that would be flagged under NYPD standards
  - No. of police officers that would be flagged under Chicago PD standards
  
Notes: No. of police officers flagged only appears when the page is visited by a browser. If you use `requests` the `<div>` where these numbers would appears shows up empty. 

In [1]:
import pandas as pd
from selenium import webdriver
import requests
from pathlib import Path
from tools import tree
from datetime import datetime as dt

today = dt.today().strftime("%Y-%m-%d")

In [2]:
RAW_DATA = Path("../data/raw/")
INTERIM_DATA = Path("../data/interim/")
PROCESSED_DATA = Path("../data/processed/")
FINAL_DATA = Path("../data/final/")

In [5]:
BASE_URL = 'http://force.nj.com'

In [4]:
browser = webdriver.Chrome("./chromedriver")

In [6]:
browser.get(BASE_URL)

In [13]:
police_depts_list = browser.find_element_by_id("mylist")

In [15]:
police_depts = police_depts_list.find_elements_by_tag_name("option")

In [27]:
test_option = police_depts[0]

In [28]:
test_option.get_attribute('value')

'/database/pd-dept/aberdeen-monmouth'

In [29]:
test_option.get_attribute('innerHTML')

'Aberdeen, Monmouth '

In [25]:
depts_list = []
for dept in police_depts:
    dept_info = {}
    dept_info['name'] = dept.get_attribute("innerHTML").strip()
    dept_info['relative_url'] = dept.get_attribute("value")
    depts_list.append(dept_info)

In [30]:
test_dept = depts_list[0]

In [31]:
browser.get(BASE_URL + test_dept['relative_url'])

In [32]:
rbk = browser.find_element_by_class_name("racial_breakdown")

In [39]:
left_div = rbk.find_element_by_class_name("left")
right_div = rbk.find_element_by_class_name("right")

In [41]:
left_div.find_element_by_class_name("important_num2").get_property("innerHTML")

'population,'

In [42]:
left_div.find_element_by_class_name("important_num1").get_property("innerHTML")

'\n\n\n              294%\n\n             \n\n            '

In [43]:
def get_likelihood_numbers(div_name = 'left'):
    race_breakdown = browser.find_element_by_class_name("racial_breakdown")
    div = rbk.find_element_by_class_name(div_name)
    based_on = div.find_element_by_class_name("important_num2").get_property("innerHTML").strip()
    likelihood = div.find_element_by_class_name("important_num1").get_property("innerHTML").strip()
    return (based_on, likelihood)

In [45]:
get_likelihood_numbers('right')

('arrests,', '42%')

In [52]:
early_warning = browser.find_element_by_class_name("earlywarning")

In [49]:
early_warning.find_elements_by_class_name("important_num_red2")

[<selenium.webdriver.remote.webelement.WebElement (session="64856079453687ccc5975b303d3cdba1", element="ef8aa835-d3ae-490e-a266-3c2f54b05b0d")>,
 <selenium.webdriver.remote.webelement.WebElement (session="64856079453687ccc5975b303d3cdba1", element="734c88c6-f1e1-4d6a-818f-45b6f296d5aa")>,
 <selenium.webdriver.remote.webelement.WebElement (session="64856079453687ccc5975b303d3cdba1", element="157af821-ae8f-4c0d-a194-e41c18c3578f")>,
 <selenium.webdriver.remote.webelement.WebElement (session="64856079453687ccc5975b303d3cdba1", element="5d1c2f2d-7d62-4dc3-a9f9-0919805508ee")>,
 <selenium.webdriver.remote.webelement.WebElement (session="64856079453687ccc5975b303d3cdba1", element="07350254-f2a0-4e83-a94c-01fe7cca4d70")>,
 <selenium.webdriver.remote.webelement.WebElement (session="64856079453687ccc5975b303d3cdba1", element="22c03eca-fefd-4e46-a096-5a8c93660342")>]

In [53]:
def get_flagging_officer_numbers():
    early_warning = browser.find_element_by_class_name("earlywarning")
    elements = early_warning.find_elements_by_class_name("important_num_red2")
    if len(elements) == 6:
        first_city = elements[0].get_property("innerHTML")
        first_city_n = elements[1].get_property("innerHTML")
        second_city = elements[2].get_property("innerHTML")
        second_city_n = elements[3].get_property("innerHTML")
        third_city = elements[4].get_property("innerHTML")
        third_city_n = elements[5].get_property("innerHTML")
    return (first_city, first_city_n, second_city, second_city_n, third_city, third_city_n)

In [54]:
get_flagging_officer_numbers()

("Los Angeles'", '4', "New York City's", '15', "Chicago's", '16')

In [56]:
for dept in depts_list:
    # go to page
    browser.get(BASE_URL + dept['relative_url'])
    
    # get likelihood numbers
    # by population
    try:
        (prop, number) = get_likelihood_numbers('left')
        dept[prop] = number
    except:
        dept['population'] = 'Not found'
    # by arrests
    try:
        (prop, number) = get_likelihood_numbers('right')
        dept[prop] = number
    except:
        dept['arrests'] = 'Not found'
    
    # get no of officers that would be flagged in other cities
    try:
        (first_city, first_city_n, second_city, second_city_n, third_city, third_city_n) = get_flagging_officer_numbers()
        dept[first_city] = first_city_n
        dept[second_city] = second_city_n
        dept[third_city] = third_city_n
    except:
        dept['Los Angeles'] = 'Not found'
        dept['New York City'] = 'Not found'
        dept['Chicago'] = 'Not found'
        

In [58]:
pd.DataFrame(depts_list)

Unnamed: 0,name,relative_url,population,arrests,Los Angeles',New York City's,Chicago's,Los Angeles,New York City,Chicago
0,"Aberdeen, Monmouth",/database/pd-dept/aberdeen-monmouth,Not found,Not found,4,15,16,,,
1,"Absecon, Atlantic",/database/pd-dept/absecon-atlantic,Not found,Not found,0,9,13,,,
2,"Allendale, Bergen",/database/pd-dept/allendale-bergen,Not found,Not found,0,2,3,,,
3,"Allenhurst, Monmouth",/database/pd-dept/allenhurst-monmouth,Not found,Not found,0,4,4,,,
4,"Allentown, Monmouth",/database/pd-dept/allentown-monmouth,Not found,Not found,0,1,1,,,
...,...,...,...,...,...,...,...,...,...,...
542,"Knowlton Township, Warren",/database/pd-dept/knowlton-township-warren,Not found,Not found,,,,Not found,Not found,Not found
543,"Liberty Township, Warren",/database/pd-dept/liberty-township-warren,Not found,Not found,,,,Not found,Not found,Not found
544,"Oxford, Warren",/database/pd-dept/oxford-warren,Not found,Not found,,,,Not found,Not found,Not found
545,"White, Warren",/database/pd-dept/white-warren,Not found,Not found,,,,Not found,Not found,Not found


***

Restarting from `src/tools/scraper.py`

In [1]:
import pandas as pd
from selenium import webdriver
from pathlib import Path
from datetime import datetime as dt 
today = dt.today().strftime("%Y-%m-%d")

# THIS_FILE = Path(__file__)
# THIS_DIR = THIS_FILE.parent
# CHROMEDRIVER_PATH = THIS_DIR.joinpath("./../../notebooks/chromedriver")
# INTERIM_DATA = THIS_DIR.joinpath("./../../data/interim/")
# PROCESSED_DATA = THIS_DIR.joinpath("./../../data/processed/")

BASE_URL = "http://force.nj.com"

In [2]:
browser = webdriver.Chrome("./chromedriver")
# Compile list police dept of options from dropdown menu
browser.get(BASE_URL)

police_depts_list = browser.find_element_by_id("mylist")
police_depts = police_depts_list.find_elements_by_tag_name("option")

In [3]:
depts_list = []
for dept in police_depts:
    dept_info = {}
    dept_info['name'] = dept.get_attribute("innerHTML").strip()
    dept_info['relative_url'] = dept.get_attribute("value")
    depts_list.append(dept_info)

In [4]:
def get_likelihood_numbers(div_name = 'left'):
    """Retrieves 'likelihood of use of force' numbers from force.nj.com website. These numbers are in 2 divs inside the 'race_breakdown' div.

    Parameters
    ----------
    div_name : str, optional
        Either the left or the right div, by default 'left'

    Returns
    -------
    touple : touple
        A touple containing the property name and likelihood number (text) for either the 'left' or 'right' div.
    """
    race_breakdown = browser.find_element_by_class_name("racial_breakdown")
    div = race_breakdown.find_element_by_class_name(div_name)
    based_on = div.find_element_by_class_name("important_num2").get_property("innerHTML").strip().replace(",", "")
    likelihood = div.find_element_by_class_name("important_num1").get_property("innerHTML").strip().replace(",", "")
    return (based_on, likelihood)

In [5]:
def get_flagging_officer_numbers():
    """Retrieve the 'number of officers that would be flagged in other agencies' from force.nj.com website. These numbers are inside the 'earlywarning' div.

    Returns
    -------
    touple : touple
        A six-part touple containing the city name and number for Los Angeles, New York City, and Chicago.
    """
    early_warning = browser.find_element_by_class_name("earlywarning")
    elements = early_warning.find_elements_by_class_name("important_num_red2")
    if len(elements) == 6:
        first_city = elements[0].get_property("innerHTML").strip().replace("'s", "").replace("'", "")
        first_city_n = elements[1].get_property("innerHTML").strip()
        second_city = elements[2].get_property("innerHTML").strip().replace("'s", "").replace("'", "")
        second_city_n = elements[3].get_property("innerHTML").strip()
        third_city = elements[4].get_property("innerHTML").strip().replace("'s", "").replace("'", "")
        third_city_n = elements[5].get_property("innerHTML").strip()
    return (first_city, first_city_n, second_city, second_city_n, third_city, third_city_n)

In [6]:
for dept in depts_list[:15]:
    # go to page
    browser.get(BASE_URL + dept['relative_url'])

    print(f"Getting info for {dept['name']}")
    
    # get likelihood numbers
    # by population
    try:
        (prop, number) = get_likelihood_numbers('left')
        dept[prop] = number
    except:
        dept['population'] = 'Not found'
    # by arrests
    try:
        (prop, number) = get_likelihood_numbers('right')
        dept[prop] = number
    except:
        dept['arrests'] = 'Not found'
    
    # get no of officers that would be flagged in other cities
    try:
        (first_city, first_city_n, second_city, second_city_n, third_city, third_city_n) = get_flagging_officer_numbers()
        dept[first_city] = first_city_n
        dept[second_city] = second_city_n
        dept[third_city] = third_city_n
    except:
        dept['Los Angeles'] = 'Not found'
        dept['New York City'] = 'Not found'
        dept['Chicago'] = 'Not found'
 

Getting info for Aberdeen, Monmouth
Getting info for Absecon, Atlantic
Getting info for Allendale, Bergen
Getting info for Allenhurst, Monmouth
Getting info for Allentown, Monmouth
Getting info for Alpine, Bergen
Getting info for Andover Township, Sussex
Getting info for Asbury Park, Monmouth
Getting info for Atlantic City, Atlantic
Getting info for Atlantic Highlands, Monmouth
Getting info for Audubon, Camden
Getting info for Avalon, Cape May
Getting info for Avon-By-the-Sea, Monmouth
Getting info for Barnegat, Ocean
Getting info for Barrington, Camden


In [7]:
data = pd.DataFrame(depts_list)
data['full_url'] = BASE_URL + data['relative_url']

In [8]:
data

Unnamed: 0,name,relative_url,population,arrests,Los Angeles,New York City,Chicago,full_url
0,"Aberdeen, Monmouth",/database/pd-dept/aberdeen-monmouth,294%,42%,Not found,Not found,Not found,http://force.nj.com/database/pd-dept/aberdeen-...
1,"Absecon, Atlantic",/database/pd-dept/absecon-atlantic,583%,68%,0,9,13,http://force.nj.com/database/pd-dept/absecon-a...
2,"Allendale, Bergen",/database/pd-dept/allendale-bergen,,8%,0,2,3,http://force.nj.com/database/pd-dept/allendale...
3,"Allenhurst, Monmouth",/database/pd-dept/allenhurst-monmouth,,,0,4,4,http://force.nj.com/database/pd-dept/allenhurs...
4,"Allentown, Monmouth",/database/pd-dept/allentown-monmouth,,,0,1,1,http://force.nj.com/database/pd-dept/allentown...
...,...,...,...,...,...,...,...,...
542,"Knowlton Township, Warren",/database/pd-dept/knowlton-township-warren,,,,,,http://force.nj.com/database/pd-dept/knowlton-...
543,"Liberty Township, Warren",/database/pd-dept/liberty-township-warren,,,,,,http://force.nj.com/database/pd-dept/liberty-t...
544,"Oxford, Warren",/database/pd-dept/oxford-warren,,,,,,http://force.nj.com/database/pd-dept/oxford-wa...
545,"White, Warren",/database/pd-dept/white-warren,,,,,,http://force.nj.com/database/pd-dept/white-warren


In [None]:
#data.to_csv(INTERIM_DATA / f'rough-scrape-data-{today}.csv', encoding = 'utf-8', index = False)