In [None]:
import pandas as pd
from dateutil import parser
from geopy import Nominatim
import numpy as np
from numpy.linalg import norm
from time import sleep

import glob
import os

# Shared geocoder client used by locToCoords.
# geopy >= 2.0 raises ConfigurationError when Nominatim is built without an
# identifying user_agent, so one is supplied explicitly.
geolocator = Nominatim(user_agent="cville_parking_tickets_notebook")

# used a geocoder to translate street addresses to coordinates
def locToCoords(location):
    """Geocode a street address to a (latitude, longitude) tuple.

    Parameters
    ----------
    location : str
        Street address; " Charlottesville" is appended before it is sent to
        the module-level ``geolocator``.

    Returns
    -------
    tuple or str
        ``(latitude, longitude)`` when the geocoder returns a match, otherwise
        the string "Did not process". The same string is returned, without
        querying, when ``location`` is not a non-blank string.
    """
    # A missing address reaches us as NaN (a float): concatenating it with a
    # string would raise TypeError and abort the whole run. A blank address
    # would query just " Charlottesville". Neither is worth a request.
    if not isinstance(location, str) or not location.strip():
        return "Did not process"

    # Pause before each request -- presumably to stay under the geocoding
    # service's rate limit (confirm against the provider's usage policy).
    sleep(1.1)
    match = geolocator.geocode(location + " Charlottesville")
    if match is not None and match.longitude is not None:
        return (match.latitude, match.longitude)
    return "Did not process"

# Mall coordinates -  (38.031233, -78.482905) (38.029413, -78.477146)
def distToMall(posTuple):
    """Return the perpendicular distance from a point to the mall line.

    The mall is modelled as the straight line through the two hard-coded
    endpoints above. Latitude and longitude are used directly as planar
    coordinates, so the result is in degrees, and it is the distance to the
    infinite line through the endpoints, not to the segment between them.

    Parameters
    ----------
    posTuple : sequence
        ``(latitude, longitude)``; only the first two items are read.

    Returns
    -------
    numpy.float64
        Non-negative distance from the point to the line.
    """
    mall_start = np.array([38.031233, -78.482905])
    mall_end = np.array([38.029413, -78.477146])
    point = np.array([posTuple[0], posTuple[1]])

    along = mall_end - mall_start
    offset = mall_start - point
    # 2-D cross product written out by hand: calling np.cross on 2-element
    # vectors is deprecated as of NumPy 2.0.
    cross_z = along[0] * offset[1] - along[1] * offset[0]
    return abs(cross_z) / norm(along)

# Lookup helper applied element-wise to the Location column to attach the
# geocoded coordinates stored in the global ``locdict``.
def dict_to_coords(value):
    """Return ``locdict[value]``, or ``np.nan`` when there is no such entry.

    Only lookup failures are turned into NaN: a missing key (KeyError) or an
    unhashable value (TypeError). Anything else -- notably a NameError because
    ``locdict`` has not been built yet -- is allowed to propagate instead of
    silently producing a column full of NaN.
    """
    try:
        return locdict[value]
    except (KeyError, TypeError):
        return np.nan

In [None]:
# Load the raw ticket export and keep only rows whose DateIssued text contains '2017',
# ordered by DateIssued.
ticket_path = os.path.join('..', 'data', 'parking_tickets', 'Parking_Tickets.csv')
data = pd.read_csv(ticket_path, low_memory=False)
# na=False: a missing DateIssued would otherwise put NaN into the mask, and pandas
# refuses to index with a boolean mask that contains NaN.
data = data[data['DateIssued'].str.contains('2017', na=False)].sort_values(by=['DateIssued'])
# One row per distinct Location, to reduce the number of geocoder queries.
no_dup_data = data.drop_duplicates(['Location'])

In [None]:
# Query the geocoder once per distinct location and save the result.
# no_dup_data was derived from a filtered frame; assigning a column to such a
# frame can trigger pandas' SettingWithCopyWarning, so work on an explicit copy.
no_dup_data = no_dup_data.copy()
no_dup_data['Coords'] = no_dup_data["Location"].apply(locToCoords)
no_dup_data.to_csv('with_coords.csv')

In [None]:
# Pair each de-duplicated Location with its Coords value in a plain dict, so the
# coordinates can be looked up again for the rows that were dropped as duplicates.
locdict = dict(zip(no_dup_data['Location'], no_dup_data['Coords']))

In [None]:
# Filter out tickets whose location the geocoder couldn't parse.
# locdict has a key for every location, including the failed ones (their value is
# the "Did not process" string), so testing key membership alone removes nothing.
# Keep only locations whose stored value is a real (latitude, longitude) tuple.
data = data[data['Location'].isin(
    [loc for loc, coords in locdict.items() if isinstance(coords, tuple)]
)]

In [None]:
# Apply the dict to the full list with duplicates. assign() builds a new frame, which
# avoids chained-assignment warnings on the filtered `data`.
data = data.assign(Coords=data['Location'].apply(dict_to_coords))
# Keep only rows that carry a real (latitude, longitude) tuple. The earlier
# `data['Coords'] != np.nan` test kept every row, because NaN compares unequal to
# everything, itself included. The tuple check drops NaN and the geocoder's
# "Did not process" marker alike. astype(bool) keeps the mask boolean even when
# `data` is empty.
data = data[data['Coords'].apply(lambda c: isinstance(c, tuple)).astype(bool)]
# Distance squared, scaled by 10**5 and floored at .025 -- per the original note, .025 is
# "right on the mall", and the floor gets rid of crazy low values.
data = data.assign(
    DistMetric=data['Coords'].apply(distToMall).apply(lambda x: max(x**2 * 10**5, .025))
)

In [83]:
# Persist the final ticket table as CSV (relative path, so it lands in the working directory).
data.to_csv(path_or_buf='ticket_with_loc.csv')