# scraping force.nj.com
#### Scraping the force.nj.com website for 2 statistics for each of the 468 police departments.

<details>
    <summary><strong>Goal</strong></summary>
    "The goal of this notebook is to ..."
    <ul>
        <li> measurable goals for this notebook </li>
        <li> Ex 1: Identify <i>variables of interest in dataset <strong>name.dta</strong></i> </li>
        <li> Ex 2: Create a <i>working dataset</i> from raw data.</li>
    </ul>
</details>

<details>
    <summary><strong>Context</strong></summary>
    Context at the moment (i.e. "We've downloaded raw data from <strong><i>website.com</i></strong> and are now in the process of creating a mastefile.")
    <li> What is the final goal of this project?</li>
    <li> What are we trying to recreate? Where are we in the process?</li>
    <li> Any links to documentation / figures.</li>
</details>

In [37]:
import pandas as pd
import requests
from pathlib import Path
from tools import tree
from datetime import datetime as dt
from bs4 import BeautifulSoup
today = dt.today().strftime("%d-%b-%y")

today

'12-Jun-20'

In [38]:
RAW_DATA = Path("../data/raw/")
INTERIM_DATA = Path("../data/interim/")
PROCESSED_DATA = Path("../data/processed/")
FINAL_DATA = Path("../data/final/")

In [39]:
tree(RAW_DATA)

+ ../data/raw


In [51]:
BASE_URL = "http://force.nj.com"

In [52]:
test_url = BASE_URL + "/database/pd-dept/brick-ocean"

In [53]:
r = requests.get(test_url)

In [54]:
soup = BeautifulSoup(r.content)

In [55]:
datalist = soup.findAll("datalist")[0]

In [56]:
tst = datalist.findAll("option")[0]

In [57]:
tst.attrs['value']

'/database/pd-dept/aberdeen-monmouth'

In [58]:
tst.text.strip()

'Aberdeen, Monmouth'

In [59]:
data = []
for option in datalist.findAll("option"):
    depts = {}
    depts['name'] = option.text.strip()
    depts['url'] = option.attrs['value']
    data.append(depts)

In [60]:
df = pd.DataFrame(data)

In [61]:
df['full_url'] = 'http://force.nj.com' + df['url']

In [62]:
df.head()

Unnamed: 0,name,url,full_url
0,"Aberdeen, Monmouth",/database/pd-dept/aberdeen-monmouth,http://force.nj.com/database/pd-dept/aberdeen-...
1,"Absecon, Atlantic",/database/pd-dept/absecon-atlantic,http://force.nj.com/database/pd-dept/absecon-a...
2,"Allendale, Bergen",/database/pd-dept/allendale-bergen,http://force.nj.com/database/pd-dept/allendale...
3,"Allenhurst, Monmouth",/database/pd-dept/allenhurst-monmouth,http://force.nj.com/database/pd-dept/allenhurs...
4,"Allentown, Monmouth",/database/pd-dept/allentown-monmouth,http://force.nj.com/database/pd-dept/allentown...


In [95]:
def get_likelihood_numbers(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    
    try:
        important_nums = soup.find_all("section", "racial_breakdown")[0].find_all("div", "important_num1")
    except:
        important_nums = f"Error found for {url}"
    
    return important_nums

In [96]:
important_numbers = df['full_url'].apply(get_likelihood_numbers)

In [105]:
important_numbers.apply(type).value_counts(dropna=False)

<class 'bs4.element.ResultSet'>    461
<class 'str'>                       86
Name: full_url, dtype: int64

In [106]:
def extract_numbers(row):
    if type(row) == "<class 'str'>":
        pass
    else:
        numbers = []
        for element in row:
            print(row)
            numbers.append(element.strip())
    if numbers:
        return numbers
    else:
        return row

In [107]:
important_numbers.apply(extract_numbers)

[<div class="important_num1">


              294%

             

            </div>, <div class="important_num1">


              42%

             

            </div>]


TypeError: 'NoneType' object is not callable

In [112]:
for number in important_numbers:
    print(number)

[<div class="important_num1">


              294%

             

            </div>, <div class="important_num1">


              42%

             

            </div>]
[<div class="important_num1">


              583%

             

            </div>, <div class="important_num1">


              68%

             

            </div>]
[<div class="important_num1">

            N/A

            </div>, <div class="important_num1">

               8%

            </div>]
[<div class="important_num1">

            N/A

            </div>, <div class="important_num1">

            N/A

            </div>]
[<div class="important_num1">

            N/A

            </div>, <div class="important_num1">

            N/A

            </div>]
[<div class="important_num1">

            N/A

            </div>, <div class="important_num1">

            N/A

            </div>]
[<div class="important_num1">

            N/A

            </div>, <div class="important_num1">


              2

In [113]:
df['important_numbers'] = important_numbers

In [137]:
clean_important_numbers = []
for element in important_numbers:
    if type(element) == str:
        clean_important_numbers.append([element])
    elif len(element) > 0:
        clean_important_numbers.append([el.text.strip() for el in element])
    else:
        clean_important_numbers.append(["No numbers found"])

In [139]:
df['important_numbers'] = clean_important_numbers

In [143]:
mask_2_n = df['important_numbers'].apply(len) == 2

In [145]:
df[mask_2_n]['important_numbers']

['294%', '42%']

In [155]:
df[['likelihood_force_by_pop', 'likelihood_force_by_arrests']] = pd.DataFrame(df[mask_2_n]['important_numbers'].to_list(), columns=['likelihood_force_by_pop','likelihood_force_by_arrests'], index = df[mask_2_n].index)

In [156]:
df

Unnamed: 0,name,url,full_url,important_numbers,likelihood_force_by_pop,likelihood_force_by_arrests
0,"Aberdeen, Monmouth",/database/pd-dept/aberdeen-monmouth,http://force.nj.com/database/pd-dept/aberdeen-...,"[294%, 42%]",294%,42%
1,"Absecon, Atlantic",/database/pd-dept/absecon-atlantic,http://force.nj.com/database/pd-dept/absecon-a...,"[583%, 68%]",583%,68%
2,"Allendale, Bergen",/database/pd-dept/allendale-bergen,http://force.nj.com/database/pd-dept/allendale...,"[N/A, 8%]",,8%
3,"Allenhurst, Monmouth",/database/pd-dept/allenhurst-monmouth,http://force.nj.com/database/pd-dept/allenhurs...,"[N/A, N/A]",,
4,"Allentown, Monmouth",/database/pd-dept/allentown-monmouth,http://force.nj.com/database/pd-dept/allentown...,"[N/A, N/A]",,
...,...,...,...,...,...,...
542,"Knowlton Township, Warren",/database/pd-dept/knowlton-township-warren,http://force.nj.com/database/pd-dept/knowlton-...,[Error found for http://force.nj.com/database/...,,
543,"Liberty Township, Warren",/database/pd-dept/liberty-township-warren,http://force.nj.com/database/pd-dept/liberty-t...,[Error found for http://force.nj.com/database/...,,
544,"Oxford, Warren",/database/pd-dept/oxford-warren,http://force.nj.com/database/pd-dept/oxford-wa...,[Error found for http://force.nj.com/database/...,,
545,"White, Warren",/database/pd-dept/white-warren,http://force.nj.com/database/pd-dept/white-warren,[Error found for http://force.nj.com/database/...,,


In [158]:
df[df['important_numbers'].apply(len) == 1].head(10)

Unnamed: 0,name,url,full_url,important_numbers,likelihood_force_by_pop,likelihood_force_by_arrests
211,"Logan, Gloucester",/database/pd-dept/logan-gloucester,http://force.nj.com/database/pd-dept/logan-glo...,[201%],,
235,"Marlboro, Monmouth",/database/pd-dept/marlboro-monmouth,http://force.nj.com/database/pd-dept/marlboro-...,[451%],,
320,"Perth Amboy, Middlesex",/database/pd-dept/perth-amboy-middlesex,http://force.nj.com/database/pd-dept/perth-amb...,[65%],,
365,"Salem City, Salem",/database/pd-dept/salem-city-salem,http://force.nj.com/database/pd-dept/salem-cit...,[112%],,
403,"Toms River, Ocean",/database/pd-dept/toms-river-ocean,http://force.nj.com/database/pd-dept/toms-rive...,[482%],,
418,"Wall, Monmouth",/database/pd-dept/wall-monmouth,http://force.nj.com/database/pd-dept/wall-monm...,[540%],,
460,"Hi-Nella, Camden",/database/pd-dept/hi-nella-camden,http://force.nj.com/database/pd-dept/hi-nella-...,[Error found for http://force.nj.com/database/...,,
461,"Lawnside, Camden",/database/pd-dept/lawnside-camden,http://force.nj.com/database/pd-dept/lawnside-...,[Error found for http://force.nj.com/database/...,,
462,"Pine Valley, Camden",/database/pd-dept/pine%20valley-camden,http://force.nj.com/database/pd-dept/pine%20va...,[Error found for http://force.nj.com/database/...,,
463,"Franklin, Hunterdon",/database/pd-dept/franklin-hunterdon,http://force.nj.com/database/pd-dept/franklin-...,[Error found for http://force.nj.com/database/...,,


In [174]:
mask_1n = (df['important_numbers'].apply(len) == 1) & (df['important_numbers'].apply(lambda x: x[0]).str.startswith("Error") == False) & (df['important_numbers'].apply(lambda x: x[0]).str.startswith("No") == False)

In [178]:
df.loc[mask_1n, 'likelihood_force_by_pop'] = df[mask_1n]['important_numbers'].apply(lambda x: x[0])

In [180]:
df[mask_1n]

Unnamed: 0,name,url,full_url,important_numbers,likelihood_force_by_pop,likelihood_force_by_arrests
211,"Logan, Gloucester",/database/pd-dept/logan-gloucester,http://force.nj.com/database/pd-dept/logan-glo...,[201%],201%,
235,"Marlboro, Monmouth",/database/pd-dept/marlboro-monmouth,http://force.nj.com/database/pd-dept/marlboro-...,[451%],451%,
320,"Perth Amboy, Middlesex",/database/pd-dept/perth-amboy-middlesex,http://force.nj.com/database/pd-dept/perth-amb...,[65%],65%,
365,"Salem City, Salem",/database/pd-dept/salem-city-salem,http://force.nj.com/database/pd-dept/salem-cit...,[112%],112%,
403,"Toms River, Ocean",/database/pd-dept/toms-river-ocean,http://force.nj.com/database/pd-dept/toms-rive...,[482%],482%,
418,"Wall, Monmouth",/database/pd-dept/wall-monmouth,http://force.nj.com/database/pd-dept/wall-monm...,[540%],540%,


In [181]:
df.to_csv(INTERIM_DATA / 'draft-data.csv', encoding = 'utf-8', index = False)

In [182]:
df.head()

Unnamed: 0,name,url,full_url,important_numbers,likelihood_force_by_pop,likelihood_force_by_arrests
0,"Aberdeen, Monmouth",/database/pd-dept/aberdeen-monmouth,http://force.nj.com/database/pd-dept/aberdeen-...,"[294%, 42%]",294%,42%
1,"Absecon, Atlantic",/database/pd-dept/absecon-atlantic,http://force.nj.com/database/pd-dept/absecon-a...,"[583%, 68%]",583%,68%
2,"Allendale, Bergen",/database/pd-dept/allendale-bergen,http://force.nj.com/database/pd-dept/allendale...,"[N/A, 8%]",,8%
3,"Allenhurst, Monmouth",/database/pd-dept/allenhurst-monmouth,http://force.nj.com/database/pd-dept/allenhurs...,"[N/A, N/A]",,
4,"Allentown, Monmouth",/database/pd-dept/allentown-monmouth,http://force.nj.com/database/pd-dept/allentown...,"[N/A, N/A]",,


In [185]:
mask_likelihood_empty = (df['likelihood_force_by_arrests'].isnull()) & (df['likelihood_force_by_pop'].isnull())

In [194]:
df.loc[mask_likelihood_empty, 'no_data_found'] = 'No data found'

In [195]:
df.tail()

Unnamed: 0,name,url,full_url,important_numbers,likelihood_force_by_pop,likelihood_force_by_arrests,No data,no_data_found
542,"Knowlton Township, Warren",/database/pd-dept/knowlton-township-warren,http://force.nj.com/database/pd-dept/knowlton-...,[Error found for http://force.nj.com/database/...,,,No data found,No data found
543,"Liberty Township, Warren",/database/pd-dept/liberty-township-warren,http://force.nj.com/database/pd-dept/liberty-t...,[Error found for http://force.nj.com/database/...,,,No data found,No data found
544,"Oxford, Warren",/database/pd-dept/oxford-warren,http://force.nj.com/database/pd-dept/oxford-wa...,[Error found for http://force.nj.com/database/...,,,No data found,No data found
545,"White, Warren",/database/pd-dept/white-warren,http://force.nj.com/database/pd-dept/white-warren,[Error found for http://force.nj.com/database/...,,,No data found,No data found
546,New Jersey State Police (NJSP),/database/pd-dept/njsp,http://force.nj.com/database/pd-dept/njsp,[No numbers found],,,No data found,No data found


In [196]:
df.columns

Index(['name', 'url', 'full_url', 'important_numbers',
       'likelihood_force_by_pop', 'likelihood_force_by_arrests', 'No data',
       'no_data_found'],
      dtype='object')

In [197]:
voi = [
    'name', 
    'url', 
    'full_url', 
    'likelihood_force_by_pop', 
    'likelihood_force_by_arrests', 
    'no_data_found'
]

In [205]:
final_df = df[voi]

In [206]:
final_df.columns = ['dept_name', 'relative_url', 'full_url', 'likelihood_force_by_pop', 'likelihood_force_by_arrests', 'no_data_found']

In [208]:
final_df.to_csv(PROCESSED_DATA / f"force-nj-com-{today}.csv", encoding = 'utf-8', index = False)