## This code is designed to scrape data from the [Crime Reports](https://www.crimereports.com/) webpage. 

At present it is configured to scrape data from the Victoria Police Department's Crime Reports webpage.  But it can be easily configured to scrape from any municipality that uses the service. 

You will likely need to install some libraries (all available with pip) to run this code.  Simply run it and check which imports fail.  You will need to install Google's "Chrome Driver" to make relatively easy use of this library.  You can use other webdrivers, but you will likely need to configure a few (mostly minor) details to get other browsers working for you.  For example, Firefox works, but you need to change the date formatting strings. 

[Google's chromedriver repository](http://chromedriver.storage.googleapis.com/index.html?path=2.9/)

The final block of this code has the most useful function call.  Please follow the instructions in the code to avoid problems.  A full download of six months' worth of data is available in [this directory](dl.14.Sep.2015-11.Mar.2016.json.bz2).  Once you have decompressed it with [bunzip2](http://gnuwin32.sourceforge.net/packages/bzip2.htm) you will have a text file readable by the simplejson library.  See the last block for an explanation of precisely what the text file contains.  bzip2 is also available with apt-get or synaptic. 

##TODO: it looks like I've done some unsafe dict operations.  Check for existence of keys
##before referring to them

In [None]:
import html2text as h2t
import string, os, re

## so let's make a routine to find the first string in our list containing a substring...
def index_of_substring(the_list, substring):
    """Return the position of the first entry of the_list containing substring.

    Entries are tested in order with the ``in`` operator; -1 is returned
    when no entry contains substring (including when the_list is empty).
    """
    hits = (pos for pos, entry in enumerate(the_list) if substring in entry)
    return next(hits, -1)

## returns string with html tags removed.  Might fail on "complex" or poorly
## formatted html, but mostly works well. 
def stripHTML(inp):
    """Return inp with every ``<...>`` tag-like span deleted.

    A span is a '<', one or more characters that are not '<' (matched
    non-greedily), and a closing '>'.  Entities such as ``&amp;`` are left
    untouched.
    """
    tag = re.compile('<[^<]+?>')
    return tag.sub('', inp)

In [None]:
from datetime import datetime, date

def _find_matching_record(records, incident):
    """Return the index of the first entry of records describing incident, or -1.

    A record matches when its 'Crime Type' and 'Address' equal the incident's
    and the calendar date of its 'Date/Time' equals the incident's 'Date'.
    Records lacking any of those keys are skipped instead of raising KeyError.
    """
    incident_date = None  ## parsed lazily, and only once per incident
    for idx, record in enumerate(records):
        ## cheap string comparisons first; dates are parsed only for candidates
        if record.get('Crime Type') != incident['Crime Type']:
            continue
        if record.get('Address') != incident['Address']:
            continue
        if 'Date/Time' not in record:
            continue
        if incident_date is None:
            incident_date = datetime.strptime(incident['Date'], "%d-%b-%Y").date()
        record_date = datetime.strptime(record['Date/Time'], "%d-%b-%Y %H:%M %p").date()
        if record_date == incident_date:
            return idx
    return -1


def processDownloads(textStr1, textStr2):
    """Parse the two HTML dumps of one search into lists of incident dicts.

    textStr1 is the HTML of the on-page incident list (the source of the GPS
    coordinates); textStr2 is the HTML of the print view, whose table header
    cells supply the dictionary keys of the detailed records.

    Returns a pair (retval1, retval2):
      * retval1 -- one dict per print-view record, keyed by the table
        headers, with a 'GPS' entry added when the same incident (crime
        type, date and address) was found in textStr1;
      * retval2 -- incidents from textStr1 with no counterpart in retval1,
        as dicts with keys 'GPS', 'Crime Type', 'Date' and 'Address'.

    When the print-view table cannot be located or its header row looks
    wrong, an error is printed and ([], []) is returned, so callers can
    always unpack two lists.
    """
    ## convert strings to lists of lines
    splitString = textStr1.split(os.linesep)
    splitString2 = textStr2.split(os.linesep)

    ## get rid of junk at start and end of the GPS listing.
    workIdx1 = index_of_substring(splitString, '<!-- ngRepeat: incident in incidents')
    if workIdx1 < 0:
        print("Error (1)")
        ## no incident markers: nothing to merge, but the print view is
        ## still worth parsing, so carry on with an empty GPS listing.
        splitString = []
    else:
        splitString = splitString[workIdx1:]
        ## okay now the items appear, so remove end junk
        splitString = splitString[0:len(splitString)-12]

    workIdx2 = index_of_substring(splitString2, '<table>')
    if workIdx2 < 0:
        print("Error (1)")
        return [], []
    ## throw away everything up to the table, plus the two lines after it
    splitString2 = splitString2[workIdx2+2:]

    ## let's do a sanity check: the next five lines must be the header cells.
    if len(splitString2) < 5:
        print("Error (2)")
        return [], []
    headerPrefixes = ('<td style="width: 150px;">',
                      '<td style="width: 250px;">',
                      '<td style="max-width: 200px; ',
                      '<td style="max-width: 250px;')
    heads = []
    for line in splitString2[:5]:
        wk = line.strip()
        if not wk.startswith(headerPrefixes):
            print("Error (2)")
            return [], []
        heads.append(stripHTML(wk))
    ## skip the five header lines and three more to reach the first record.
    splitString2 = splitString2[8:]
    ## split off the junk at the end.  (This must measure splitString2 itself;
    ## using the length of the other listing silently dropped records.)
    splitString2 = splitString2[0:len(splitString2)-5]
    ## now both splitString and splitString2 only contain records.

    ## each print-view record is five data lines followed by two to skip.
    retval1 = []
    pos = 0
    while len(splitString2) - pos > 6:
        fields = []
        for i in range(5):
            line = splitString2[pos + i]
            if i == 4:
                ## the fifth cell keeps only the text inside <b>...</b>
                j = line.find('<b>')
                k = line.find('</b>')
                wk = line[j+3:k] if (j >= 0 and k > j) else ""
            else:
                ## we need to turn "&amp;" to "&"
                wk = stripHTML(line.strip()).replace("&amp;", "&")
            fields.append((heads[i], wk))  ## to do, format date?
        retval1.append(dict(fields))
        pos += 7

    ## The on-page listing uses six lines per incident: line 0 carries the
    ## numbers stored as 'GPS', lines 1, 2 and 4 the crime type, date and
    ## address; lines 3 and 5 are ignored.  A trailing partial group is dropped.
    retval2 = []
    for start in range(0, len(splitString) - 5, 6):
        chunk = [ln.strip() for ln in splitString[start:start+6]]
        dict0 = {
            "GPS": re.findall(r"[-+]?\d*\.\d+|\d+", chunk[0]),
            "Crime Type": stripHTML(chunk[1]).replace("&amp;", "&"),
            "Date": stripHTML(chunk[2]).replace("&amp;", "&"),
            "Address": stripHTML(chunk[4]).replace("&amp;", "&"),
        }
        ## let's check to see if this record is in retval1
        I = _find_matching_record(retval1, dict0)
        if I == -1:
            retval2.append(dict0)
        else:
            retval1[I]['GPS'] = dict0['GPS']

    return retval1, retval2

In [None]:
import os, time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as selEC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from pyvirtualdisplay import Display
from datetime import date

## startdate and end date are python date formats, from datetime.date
## startdate and enddate must be less than or equal to 30 days apart. 
## you must also pass the driver you are using,
def _retypeDateField(driver, fieldId, when):
    """Replace the contents of the date input with id fieldId by the date when."""
    elemt = driver.find_element(By.ID, fieldId)
    time.sleep(1)
    ## 14 backspaces, presumably enough to erase whatever date is pre-filled
    for _ in range(14):
        elemt.send_keys("\b")
    ## the site expects [day mon, year] format
    elemt.send_keys(when.strftime("%d %b, %Y"))


def grabDateRange(startdate, enddate, driver):
    """Drive the browser through one search and return the parsed incidents.

    startdate and enddate are datetime.date objects; per the site's limits
    they must be at most 30 days apart.  driver is a Selenium WebDriver that
    already has the agency page loaded.

    The function opens the advanced-search panel, types both dates, grabs
    the HTML of the incident list and of the print view (which opens in a
    second window), closes that window again, and returns
    processDownloads(list_html, print_html).

    Elements are located with the generic find_element(By..., ...) form and
    windows are switched with driver.switch_to.window, because the older
    find_element_by_* / switch_to_window helpers were removed in Selenium 4.
    """
    searchlabel1 = "advancedSearchNavButton"
    #pressbuttons = ["showMoreIncidentTypesCheckbox", "incidentType104", "incidentType170", "incidentType8", 
    #                "incidentType16", "incidentType15", "sexOffenderCheckbox", "incidentType179", 
    #                "incidentType178", "incidentType162", "incidentType164", "incidentType165", 
    #                "incidentType167"]
    ## load up big menu
    time.sleep(1)
    driver.find_element(By.ID, searchlabel1).click()
    ## make sure everything is selected
    time.sleep(3)
    driver.find_element(By.XPATH, '//*[@id="alert"]/div[2]/div[1]/a[1]').click()
    #for x in pressbuttons:
    #    driver.find_element(By.ID, x).click()
    ## choose dates
    _retypeDateField(driver, "date-from", startdate)
    _retypeDateField(driver, "date-to", enddate)
    ## load it up!
    driver.find_element(
        By.XPATH,
        '//button[contains(@class,"btn-action") and text()="Show Incidents"]').click()
    time.sleep(14)
    driver.find_element(
        By.XPATH,
        '//div[contains(@class,"gControl") and text()="Show Details"]').click()
    time.sleep(8)
    elemt = driver.find_element(By.ID, "crCrimeListContainer")
    retval1 = elemt.get_attribute('innerHTML')
    ## this is the data with the GPS coordinates.  We should also grab the data with the crime ID number
    ## and the "Description" tag which indicates if criminal charges were filed, and whether or not
    ## proceedings have concluded.
    time.sleep(4)
    driver.find_element(By.CLASS_NAME, "crimeListViewPrintLink").click()
    ## new window is opening, let's wait for it to open.
    time.sleep(8)
    WebDriverWait(driver, 6).until(lambda d: len(d.window_handles) == 2)
    driver.switch_to.window(driver.window_handles[1])
    time.sleep(3)
    elemt = WebDriverWait(driver, 6).until(selEC.presence_of_element_located(
        (By.XPATH, '//div[contains(@class,"content")]')))
    retval2 = elemt.get_attribute('innerHTML')
    ## close the print window and go back to the main one
    oldhandle = driver.window_handles[0]
    driver.close()
    driver.switch_to.window(oldhandle)
    ## click the image inside the list container so the page is ready for the next search
    driver.find_element(By.XPATH, '//*[@id="crCrimeListContainer"]/span/img').click()
    return processDownloads(retval1, retval2)

## Alright, we've fully automated recovering the basic data, but retval1 has 
## GPS coordinates and.. retval2 has the "description" tag which sometimes has
## useful information.  We should strip this as well. 

In [None]:
## Algorithm that takes two dates, breaks them up into 30-day + remainder ranges, passes back that list.
## the return value is a list of pairs. 
from datetime import timedelta
def thirtyDayDivision(startdate, enddate):
    """Split the inclusive range startdate..enddate into chunks of at most 30 days.

    Both arguments are datetime.date objects.  Returns a list of
    (first_day, last_day) pairs: as many full 30-day chunks as fit, followed
    by one shorter chunk for any remaining days.  When enddate is before
    startdate the range is empty and [] is returned.

    The number of full periods and leftover days is printed as a progress
    message.
    """
    ## +1 because both end points are included; clamp so that a reversed
    ## range cannot produce a bogus chunk through negative floor-division.
    days = max((enddate - startdate).days + 1, 0)
    periods, extra = divmod(days, 30)
    print("Periods ", periods, " extra ", extra)
    retval = []
    for _ in range(periods):
        retval.append((startdate, startdate + timedelta(days=29)))
        startdate = startdate + timedelta(days=30)
    if extra:
        retval.append((startdate, startdate + timedelta(days=extra - 1)))
    return retval

## Grabs an arbitrary-timespan from the vicPD webpage.  start and end dates
## must be valid dates (at present you need to log into the webpage to find
## out what those can be... I will automate this eventually).  silent is a bool, 
## True and the window for the browser is suppressed, False and you will
## see your computer typing everything into your browser. 
def grabBigRange(startdate, enddate, silent,
                 chromedriver="/home/rybu/prog/chromedriver",
                 url="https://www.crimereports.com/agency/vicpdcanada"):
    """Download every incident between startdate and enddate (inclusive).

    The range is cut into parcels of at most 30 days with thirtyDayDivision
    and each parcel is fetched with grabDateRange in a single browser
    session.

    startdate, enddate -- datetime.date objects.
    silent -- when true, the browser runs inside a hidden virtual display.
    chromedriver -- location of the chromedriver binary.
    url -- the agency page to scrape.

    Returns (retval1, retval2): the concatenation, over all parcels, of the
    two lists returned by grabDateRange.

    The browser (and the virtual display, if one was started) is shut down
    even when a parcel fails, so a crash no longer leaves them running.
    """
    dateDiv = thirtyDayDivision( startdate, enddate )
    print("Attempting to download date range: ", startdate, "--", enddate, " in ",len(dateDiv), " parcels.\n", flush=True)

    os.environ["webdriver.chrome.driver"] = chromedriver
    display = None
    if silent:
        display = Display(visible=0)
        display.start()

    retval1 = []
    retval2 = []
    try:
        ## choose which kind of driver you want to use.  Firefox seems to come with a webdriver
        ## installed automatically, Google Chrome requires you to install one yourself.  The Chrome
        ## driver seems faster and easier-to-use to me.  If you choose to use Firefox you will likely
        ## need to edit the format you enter your dates.
        driver = webdriver.Chrome(chromedriver)
        #driver = webdriver.Firefox()
        try:
            driver.get(url)
            for i, (parcelStart, parcelEnd) in enumerate(dateDiv):
                print("Downloading parcel ", i+1, " ", parcelStart, "to", parcelEnd, end="", flush=True)
                L1, L2 = grabDateRange( parcelStart, parcelEnd, driver )
                print(".", flush=True)
                retval1.extend(L1)
                retval2.extend(L2)
        finally:
            driver.quit()
    finally:
        if display is not None:
            display.stop()

    return retval1, retval2

In [None]:
## The webpage seems to only have a few months of data available at any given
## time.  When I write this, on March 12th, 2016, the webpage only has data available
## from September 14th 2015 until March 11th, 2016.  So check what is available
## before you use the code.  Perhaps we can automate this...  Perhaps they've 
## throttled what I can access? Hmm... things to check.  Anyhow, at present, if you 
## give it valid start and end dates it will download all the data for those dates
## and save it in an appropriately-labelled json file.  Some records have missing data,
## so the json file has two arrays in it.  The first has entries with complete data, the
## second is missing a little bit -- time, the police ID number for the event, etc. 
import simplejson as json
def grabFrom(startDate, endDate):
    """Download startDate..endDate and save the result as a json file.

    startDate and endDate are datetime.date objects.  The output file is
    written to the current directory and named
    dl.<start>-<end>.json with both dates formatted as %d.%b.%Y.  It holds
    two lines: the json dump of the first list returned by grabBigRange,
    then the json dump of the second.
    """
    rv1, rv2 = grabBigRange(startDate, endDate, False) ## set the bool accordingly
    fname = "dl."+startDate.strftime("%d.%b.%Y")+"-"+endDate.strftime("%d.%b.%Y")+".json"
    ## the with-block closes the file even if serialising or writing fails
    with open(fname, "w") as f:
        f.write(json.dumps(rv1)+"\n")
        f.write(json.dumps(rv2))
    

In [None]:
## Fetch the whole span the site offered when this was written:
## 14 September 2015 through 11 March 2016.
grabFrom(
    date(2015, 9, 14),
    date(2016, 3, 11),
)