# Downloading and parsing US Capitol Police arrest reports

Thanks to Mike Stucka for this great tutorial
[this great tutorial on scraping with pyquery](https://github.com/PalmBeachPost/nicar19scraping/blob/master/00-Scraping%20--%20full%20self-tutorial.ipynb) ... yeah, I switched to bs4 but this got me started.

### Things used:
* [requests](https://2.python-requests.org/en/master/)
* [pdfplumber](https://github.com/jsvine/pdfplumber)
* [pandas](https://pandas.pydata.org)
* [beautifulsoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
* [datasette/csvs-to-sqlite](https://datasette.io)
* Error logger from @coworker_whose_github_i_couldnt_find
* 

### Here's where the arrest reports [live](https://www.uscp.gov/media-center/weekly-arrest-summary).

### To do:
* ~~Refine datasette:~~
    * ~~SQLite apparently infers that number an int, should stay string in case of leading zeros. (This apparently might not be possible.) Fixed w/ --shape.~~
* ~~Functionify dir creation~~
* ~~Make a new csv each time the script runs~~
* Implement emails w/ function
* YAML/cron

In [1]:
# Install dependencies
!pip3 install pdfplumber pyquery numpy pandas requests csvs-to-sqlite datasette beautifulsoup4



In [2]:
# External dependencies
import requests
import pdfplumber
import pandas as pd
from bs4 import BeautifulSoup
from Logger import Log

# Built-in dependencies
import csv
import re
import os
import glob
import datetime
import subprocess
import urllib
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

In [3]:
# Set up logging
log = Log().getLogger()

In [4]:
# Set up some file structure to put PDFs and debug logs in

reports_dir = '/reports'
debug_dir = '/debug'

# Little function for making directories
def mkdir(d):
    wd = os.getcwd()
    print(f'The current working directory is {wd}.')

    if os.path.isdir(wd + d):
        print(f'The directory {d} already exists.')
    else:
        try:
            os.mkdir(wd + d)
        except OSError as e:
            log.debug(e)
            print(f"Creation of the directory {d} failed.")
        else:
            print(f"Successfully created the directory {d}.")

mkdir(reports_dir)
mkdir(debug_dir)

# wd = os.getcwd()
# print(f'The current working directory is {wd}.')

# reports_dir = '/reports'

# if os.path.isdir(wd + reports_dir):
#     print(f'The directory {reports_dir} already exists.')
# else:
#     try:
#         os.mkdir(wd + reports_dir)
#     except OSError as e:
#         log.debug(e)
#         print(f"Creation of the directory {reports_dir} failed.")
#     else:
#         print(f"Successfully created the directory {reports_dir}.")

# debug_dir = '/debug'

# if os.path.isdir(wd + debug_dir):
#     print(f'The directory {debug_dir} already exists.')
# else:
#     try:
#         os.mkdir(wd + debug_dir)
#     except OSError as e:
#         log.debug(e)
#         print(f"Creation of the directory {debug_dir} failed.")
#     else:
#         print(f"Successfully created the directory {debug_dir}.")

The current working directory is /Volumes/GoogleDrive/My Drive/data/ipynbs/uscp_arrests.
Successfully created the directory /reports.
The current working directory is /Volumes/GoogleDrive/My Drive/data/ipynbs/uscp_arrests.
The directory /debug already exists.


In [5]:
# Get the HTML for the page w/ some error handling
base_url = 'https://www.uscp.gov'
url = 'https://www.uscp.gov/media-center/weekly-arrest-summary'

try: 
    html = urlopen(url)
except HTTPError as e:
    log.debug(e)
    # Send a message somewhere
except URLError as e:
    log.debug(e)
    # Send a message somewhere
else:
    print(f'Success, {url} fetched.')

# Get the arrest report links and download the PDFs to the created directory
bs = BeautifulSoup(html.read(), 'html.parser')
link_list = bs.find_all('a', text= re.compile('Arrest Summary .+'))

if not link_list:
    print('No links found. Hmmm, maybe the URL changed ...') 
    # Saying this because bad URL slug ending still returned a page, just not the right one
    # Send a message somewhere
else:
    for link in link_list:
        try:
            href = link.attrs['href']
        except AttributeError as e:
            log.debug(e)
            # Send a message somewhere
        else:
            filename = '/' + href.rsplit('/', 1)[1].lower().replace('%20', '_')
            print('Downloading ' + filename)
            urllib.request.urlretrieve(base_url + href, wd + reports_dir + filename)

Success, https://www.uscp.gov/media-center/weekly-arrest-summary fetched.
Downloading /arrest_summary_12-20-18_1-2-19_1.pdf


NameError: name 'wd' is not defined

In [None]:
# Parse the downloaded PDFs
pdfs = glob.glob(wd + reports_dir + '/*')

report_list = []

for pdf in pdfs:
    plumb = pdfplumber.open(pdf)
    pages = plumb.pages # A list of PDF page objects
    
    pages_text = ''

    for page in pages:
        text = page.extract_text()
        pages_text += text
    
    pages_text = re.sub(r'(^\d\s*(\n|$))', '\n', pages_text, flags=re.M) # Get rid of the page numbers
    
    
    # Regex to find each arrest report chunk https://regex101.com/r/kWkaLi/7
    chunk = (
            r'((?:(?:.+\n)(?=(?:(?:\d{1,2}\/\d{1,2}\/\d{2,4})(?:\s+)(?:\d{1,2}:\d{1,2})(?:\s+)(?:\d{5,12}))))'
            r'(?:(?:\d{1,2}\/\d{1,2}\/\d{2,4})(?:\s+)(?:\d{1,2}:\d{1,2})(?:\s+)(?:\d{5,12}))'
            r'(?:(?:[\s\S]+?(?=(?:\Z)|(?:(?:(?:.+\n)(?=(?:(?:\d{1,2}\/\d{1,2}\/\d{2,4})(?:\s+)(?:\d{1,2}:\d{1,2})(?:\s+)(?:\d{5,12})))))))))'
    )
    
    reports = re.findall(chunk, pages_text, flags=re.M)
    
    for report in reports:
        report_list.append(report)
    
title = []
date = []
time = []
number = []
narrative = []

for report in report_list:
    report = report.strip() # Remove leading and trailing whitespace
    
    # Regex to slice up the different data points of each 'chunk'
    regex = r'(^.+\n)(?:(\d{1,2}\/\d{1,2}\/\d{2,4})(?:\s+)(\d{1,2}:\d{1,2})(?:\s+)(\d{5,12}))([\s\S]+)'

    titles = re.search(regex, report).group(1).strip()
    dates = re.search(regex, report).group(2).strip()
    times = re.search(regex, report).group(3).strip()
    numbers = re.search(regex, report).group(4).strip()
    narratives = re.sub('\n', '',(re.search(regex, report).group(5).strip()))

    title.append(titles)
    date.append(dates)
    time.append(times)
    number.append(numbers)
    narrative.append(narratives)

In [None]:
# Put it all in a pandas dataframe 
d = {
    'title': title,
    'date': date,
    'time': time,
    'number': number,
    'narrative': narrative
    }

df = pd.DataFrame(data = d)

df['datetime'] = df['date'].map(str) + ' ' + df['time'] # Merge the date and time columns
df['datetime'] = pd.to_datetime(df['datetime'], infer_datetime_format = True) # Make that new column a datetime type
df['date'] = df['datetime'].dt.date # Split off date
df['time'] = df['datetime'].dt.time # Split off time

df

In [None]:
# Make a CSV with datetime label that goes in the csv dir
dt = str(datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
csv_dir = '/csv'
csv_file = '/uscp_arrests_' + dt + '.csv'

mkdir(csv_dir)

df.to_csv(wd + csv_dir + csv_file, encoding='utf-8', index=False)
print(f'Saved the file {csv_file} to {csv_dir}.')

In [None]:
# Put it all in a datasette and publish 

db_dir = '/db'
csv = wd + csv_dir
db = wd + db_dir + '/uscp_arrests.db'

mkdir(db_dir)

# Running terminal commands from python
subprocess.check_call([
    'csvs-to-sqlite',
    '--replace-tables',
    '--shape',
    'title:title,date:date(TEXT),time:time(TEXT),number:number(TEXT),narrative:narrative,datetime:datetime(TEXT)',
    csv,
    db]) 
# ^^ Trixy ^^ any time you would have a space in the command line
# you need to comma separate and have a news string in the brackets.

print('Starting uscp_arrestes.db datasette at http://127.0.0.1:8001/ ...')
subprocess.check_call(['datasette', db])