In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#sklearn libraries
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from catboost import Pool, CatBoostRegressor # pip install catboost

import json
import math
from tqdm import tqdm

In [2]:
# The haversine formula is the simplest distance between two lat/lon points.
# It treats the Earth as a sphere and assumes no elevation changes.
def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance between two points, in miles.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance in miles, rounded to two decimal places. NaN inputs
        propagate to a NaN result.
    """
    earth_radius_m = 6371000            # radius of Earth in meters
    miles_per_meter = 0.000621371

    phi_1 = math.radians(lat1)
    phi_2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = (math.sin(delta_phi / 2.0) ** 2
         + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0) ** 2)

    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. An explicit comparison
    # is used instead of min()/max() so that a NaN `a` is left untouched.
    if a > 1.0:
        a = 1.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))   # central angle in radians

    meters = earth_radius_m * c
    return round(meters * miles_per_meter, 2)

In [3]:
# Read the 2016 crime records CSV (path is relative to the working directory).
CRIME_CSV_PATH = "Crime_Data_2016.csv"
crime = pd.read_csv(CRIME_CSV_PATH)

In [4]:
# Preview the first five rows of the loaded table.
crime.head(n=5)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Weapon Description,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location
0,161804051,01/02/2016,01/02/2016,1325,18,Southeast,1804,310,BURGLARY,0344,...,,AA,Adult Arrest,310,,,,MCKINLEY,MANCHESTER AV,"(33.9602, -118.261)"
1,161704085,01/02/2016,01/02/2016,1400,17,Devonshire,1764,442,SHOPLIFTING - PETTY THEFT ($950 & UNDER),0325,...,,JA,Juv Arrest,442,,,,9300 TAMPA AV,,"(34.244, -118.5583)"
2,161304073,01/02/2016,01/01/2016,2330,13,Newton,1322,210,ROBBERY,0326 1309 0302 0334 0916 1311 0330 0305 0355 0344,...,SEMI-AUTOMATIC PISTOL,AA,Adult Arrest,210,,,,2400 TRINITY ST,,"(34.0242, -118.2623)"
3,161504099,01/02/2016,01/02/2016,1145,15,N Hollywood,1514,310,BURGLARY,1607 0344 0352 1402 0321 0216 1221 1403 1420,...,,AO,Adult Other,310,998.0,,,12000 VANOWEN ST,,"(34.194, -118.3943)"
4,161504055,01/02/2016,12/31/2015,1935,15,N Hollywood,1522,649,DOCUMENT FORGERY / STOLEN FELONY,0100 1402,...,,AA,Adult Arrest,649,,,,13000 VICTORY BL,,"(34.1936, -118.4166)"


In [5]:
# List the column labels; note that the 'Location ' label ends with a trailing space.
crime.columns

Index(['DR Number', 'Date Reported', 'Date Occurred', 'Time Occurred',
       'Area ID', 'Area Name', 'Reporting District', 'Crime Code',
       'Crime Code Description', 'MO Codes', 'Victim Age', 'Victim Sex',
       'Victim Descent', 'Premise Code', 'Premise Description',
       'Weapon Used Code', 'Weapon Description', 'Status Code',
       'Status Description', 'Crime Code 1', 'Crime Code 2', 'Crime Code 3',
       'Crime Code 4', 'Address', 'Cross Street', 'Location '],
      dtype='object')

In [6]:
# Inspect the raw location text of the first row (index label 0).
crime.loc[0, 'Location ']

'(33.9602, -118.261)'

In [7]:
# Split the "(lat, lon)" text in 'Location ' into separate 'lat' and 'lon' columns.
# Both columns are rebuilt from the raw 'Location ' column and hold strings, so if
# this cell is re-run the float cast that follows must be re-run as well.
# Rows whose location is missing or does not match the pattern get NaN in both
# columns instead of a junk string.
coords = crime['Location '].str.extract(r'\(\s*([^,()]+?)\s*,\s*([^,()]+?)\s*\)')
crime['lat'] = coords[0]
crime['lon'] = coords[1]

In [8]:
# Convert the parsed coordinate columns to 64-bit floats for numeric use.
crime['lat'] = crime['lat'].astype('float64')
crime['lon'] = crime['lon'].astype('float64')

In [9]:
# Show the first few converted latitude and longitude values, one series after the other.
print(crime['lat'].head(), crime['lon'].head(), sep='\n')

0    33.9602
1    34.2440
2    34.0242
3    34.1940
4    34.1936
Name: lat, dtype: float64
0   -118.2610
1   -118.5583
2   -118.2623
3   -118.3943
4   -118.4166
Name: lon, dtype: float64


In [10]:
# Sanity check: distance in miles between the locations in rows 0 and 1.
distance(crime.loc[0, 'lat'], crime.loc[0, 'lon'],
         crime.loc[1, 'lat'], crime.loc[1, 'lon'])

25.96

## Now let's see how long it takes to compute the distance from a single waypoint to all ~200k rows.

In [11]:
# Use the first row's coordinates as the waypoint and broadcast them to every row.
crime['waylat'] = crime.loc[0, 'lat']
crime['waylon'] = crime.loc[0, 'lon']

In [12]:
# Confirm the waypoint latitude was copied onto every row.
crime['waylat'].head()

0    33.9602
1    33.9602
2    33.9602
3    33.9602
4    33.9602
Name: waylat, dtype: float64

In [15]:
# Distance in miles from the waypoint to every row's location.
# Iterating the four columns in lockstep passes distance() the same values as a
# row-wise apply(axis=1) would, without constructing a Series object for each row.
crime['dist'] = [
    distance(way_lat, way_lon, lat, lon)
    for way_lat, way_lon, lat, lon in zip(
        crime['waylat'], crime['waylon'], crime['lat'], crime['lon']
    )
]

In [16]:
# Inspect the first few computed waypoint distances.
crime['dist'].head()

0    67.9204
1    68.2042
2    67.9844
3    68.1542
4    68.1538
dtype: float64

## Timing calculation

In [20]:
# Benchmark the row-wise apply() distance calculation from the waypoint over the whole table.
%timeit crime.apply(lambda row: distance(row['waylat'], row['waylon'], row['lat'], row['lon']), axis=1)

12.7 s ± 435 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# Row and column counts; the row count is needed to turn the total runtime into a per-row cost.
crime.shape

(225350, 31)

In [35]:
# Extrapolate the measured runtime linearly to a smaller table.
timed_seconds = 12.7      # mean %timeit result for the full table
timed_rows = 225350       # number of rows that were timed
target_rows = 16000       # table size we want an estimate for

per_row_seconds = round(timed_seconds / timed_rows, 5)
per_waypoint_seconds = round(target_rows * timed_seconds / timed_rows, 2)

print(per_row_seconds, 'sec / row (linear assertion)')
print('16k row calculation of a single waypoit:', per_waypoint_seconds,
      'seconds wait per waypoint')

6e-05 sec / row (linear assertion)
16k row calculation of a single waypoit: 0.9 seconds wait per waypoint


## Conclusion

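Calculating waypoint distances for an individual is feasible, with a processing time of approximately 1 second per waypoint (for ~16k rows).  I think this is acceptable if the waypoint is entered in the profile, but it may be an annoyance if it is entered as a filter.  There is a natural delay between submit and exploration within which we can hide this cost.  If we put this feature out front, however, it sits on the long (annoying) end of acceptable.  Note that this estimate is based on the row-by-row `apply` timing above; a vectorized implementation would likely be considerably faster.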