# Calculate closest points against GNAF dataset

The G-NAF data was downloaded from https://data.gov.au/dataset/geocoded-national-address-file-g-naf

Note: This example uses **GNAF 2018 AUGUST release**

## Notes:
- Takes in two datasets, each containing longitude and latitude columns.
- For each record in dataset A, finds the closest point in dataset B and appends that point's data from dataset B to the record.
- Exports the resulting dataframe.
- Uses SciPy's `KDTree` and a worker `Pool` from `multiprocessing` to speed up run time.

## Setup

### Import relevant modules

In [3]:
import pandas
import datetime
import numpy
import math
from scipy import spatial
import random
from multiprocessing import Pool

  return f(*args, **kwds)
  return f(*args, **kwds)


### Import / Create Test Data
We need to import several tables from G-NAF here: the spine, the coordinates and the street names.

In [1]:
# Directory holding the extracted G-NAF August 2018 "Standard" PSV tables.
# Change this one value to point the notebook at a different download location.
gnaf_standard_dir = '/users/danielcorcoran/desktop/G-NAF AUGUST 2018/Standard/'

# Paths to the three Victorian (VIC) tables read by the cells below.
p_address_detail = gnaf_standard_dir + 'VIC_ADDRESS_DETAIL_psv.psv'
p_address_default_geocode = gnaf_standard_dir + 'VIC_ADDRESS_DEFAULT_GEOCODE_psv.psv'
p_street_locality = gnaf_standard_dir + 'VIC_STREET_LOCALITY_psv.psv'

In [4]:
# Read the three pipe-delimited G-NAF tables, in the same order as the path
# variables were declared. low_memory=False is passed to every read, exactly
# as in the per-file calls this replaces.
address_detail, address_default_geocode, street_locality = (
    pandas.read_csv(psv_path, sep="|", low_memory=False)
    for psv_path in (p_address_detail, p_address_default_geocode, p_street_locality)
)

In [5]:
# Show (rows, columns) for each loaded table, one per line.
print(address_detail.shape,
      address_default_geocode.shape,
      street_locality.shape,
      sep="\n")

(3732245, 35)
(3732245, 7)
(188158, 11)


In [6]:
# Attach the default geocode columns to every address record.
# A left join keeps all rows of address_detail even when no geocode row matches.
comparison_data_subset = address_detail.merge(
    address_default_geocode,
    how='left',
    on='ADDRESS_DETAIL_PID',
)

In [7]:
# Add the street columns to each geocoded address.
# No `how` is given, so pandas uses its default inner join: address rows whose
# STREET_LOCALITY_PID has no match in street_locality are dropped.
comparison_data = comparison_data_subset.merge(
    street_locality,
    on='STREET_LOCALITY_PID',
)

In [8]:
# Number of synthetic base points to generate.
row_count = 50000

# Build the random test points with one vectorised draw per column instead of
# row_count separate scalar calls. The bounds are passed as (low, high):
# numpy documents the result as undefined when high < low, so the latitude
# range is written as -40..-32 rather than -32..-40.
base_data = pandas.DataFrame(
    {
        'longitude': numpy.random.uniform(140, 150, size=row_count),
        'latitude': numpy.random.uniform(-40, -32, size=row_count),
    },
    columns=['longitude', 'latitude'],  # pin the column order explicitly
)

## Process 

### Create comparison points list

In [9]:
# Store each row's coordinates as a single (longitude, latitude) tuple.
# G-NAF uses upper-case column names; the synthetic base data uses lower-case.
comparison_data['geopoint'] = list(zip(comparison_data['LONGITUDE'].tolist(),
                                       comparison_data['LATITUDE'].tolist()))
base_data['geopoint'] = list(zip(base_data['longitude'].tolist(),
                                 base_data['latitude'].tolist()))

### Create Spatial KDTree Object

In [10]:
import sys
# Raise the interpreter-wide recursion limit to 10000 before the KDTree is built.
# NOTE(review): presumably required because building the tree over the full
# comparison set recurses deeper than the default limit allows — confirm
# whether the installed SciPy version still needs this.
sys.setrecursionlimit(10000)

In [11]:
# Index every comparison (longitude, latitude) tuple in a KD-tree. Row order is
# preserved, so a position returned by tree.query refers to the same row
# position in comparison_data.
tree = spatial.KDTree(comparison_data['geopoint'].tolist())

### Helper function to return shortest distance and index using KDTree defined above

In [12]:
def return_distance_index_closest_point(point):
    """Query the module-level ``tree`` for the comparison point nearest ``point``.

    Args:
        point: A (longitude, latitude) pair, as stored in the ``geopoint`` column.

    Returns:
        The result of ``tree.query(point)`` unchanged. The notebook later loads
        these results into the columns ``euclidean_distance`` and
        ``comparison_index``, i.e. a (distance, index) pair.
    """
    return tree.query(point)

### Calculate results

In [13]:
%%time

# Fan the nearest-neighbour lookups out over 6 worker processes. The `with`
# block shuts the workers down once the map has finished, so re-running this
# cell no longer leaves the previous pool's processes behind.
with Pool(6) as pool:
    # Pool.map already returns a list, ordered like base_data['geopoint'].
    results = pool.map(return_distance_index_closest_point, base_data['geopoint'])

# The last figure is (base rows x comparison rows): the number of point pairs
# in the two datasets, not a count of operations the tree lookup performed.
print("Checked {:,} base points against {:,} comparison points. {:,} Iterations".format(base_data.shape[0],
                                 comparison_data.shape[0],
                                 base_data.shape[0] * comparison_data.shape[0]))

Checked 50,000 base points against 3,732,245 comparison points. 186,612,250,000 Iterations
CPU times: user 119 ms, sys: 136 ms, total: 255 ms
Wall time: 9.85 s


### Convert results into pandas dataframe and preview results

In [14]:
# One row per base point, in the order the points were queried: the pair
# returned by the helper becomes two columns, and the row position is kept as
# `base_index` so the rows can be joined back onto base_data.
results_data = (
    pandas.DataFrame(results, columns=['euclidean_distance', 'comparison_index'])
    .assign(base_index=lambda frame: frame.index)
)

## Join base_data, results_data and comparison_data

### Create index columns (used to join tables)

In [15]:
# Expose each frame's index as an ordinary column so it can be used as a merge key.
# `base_index` pairs with results_data['base_index'];
# `comparison_index` pairs with the index values returned by tree.query.
# NOTE(review): this relies on both frames still having their default 0..n-1
# index in the same row order used to build `tree` and to run the queries —
# confirm nothing re-sorts or filters them in between.
base_data['base_index'] = base_data.index
comparison_data['comparison_index'] = comparison_data.index

### Join base_data to results_data 

In [16]:
# Pair every base point with its nearest-neighbour result. The tuple column
# `geopoint` is dropped first because it is not wanted in the output.
base_results_data = (
    base_data
    .drop(columns='geopoint')
    .merge(results_data, how="left", on='base_index')
)

### Join resulting table above to comparison_data

In [17]:
# Pull in the full G-NAF record of the matched comparison point for each base
# point. Column names present on both sides get the `_base` / `_comparison`
# suffixes.
final_data = base_results_data.merge(
    comparison_data.drop(columns='geopoint'),
    how="left",
    on="comparison_index",
    suffixes=('_base', '_comparison'),
)

In [18]:
# Preview the first five joined rows.
final_data.head(5)

Unnamed: 0,longitude,latitude,base_index,euclidean_distance,comparison_index,ADDRESS_DETAIL_PID,DATE_CREATED_x,DATE_LAST_MODIFIED,DATE_RETIRED_x,BUILDING_NAME,...,DATE_CREATED,DATE_RETIRED,STREET_CLASS_CODE,STREET_NAME,STREET_TYPE_CODE,STREET_SUFFIX_CODE,LOCALITY_PID_y,GNAF_STREET_PID,GNAF_STREET_CONFIDENCE,GNAF_RELIABILITY_CODE
0,149.038706,-32.807752,0,3.395423,3560189,GAVIC424267887,2008-12-24,2017-10-18,,,...,2017-11-01,,C,MURRAY RIVER,ROAD,,VIC2526,252865452.0,2.0,4
1,148.792731,-37.392214,1,0.067232,3588902,GAVIC425242632,2014-07-17,2014-11-04,,,...,2015-03-23,,C,ADA DIVIDE,TRACK,,VIC1056,255295676.0,0.0,4
2,146.864124,-39.532611,2,0.735435,3610091,GAVIC424275027,2008-12-29,2018-07-31,,NATIONAL PARK,...,2017-02-02,,C,WILSONS PROMONTORY,ROAD,,VIC2517,252880947.0,0.0,4
3,142.926599,-32.013216,3,2.271277,1135643,GAVIC424781011,2012-10-16,2014-11-04,,,...,2017-11-01,,C,CURETON,AVENUE,,VIC1676,253030487.0,2.0,4
4,146.032411,-37.946317,4,0.014011,1891983,GAVIC425023146,2012-07-13,2014-11-04,,,...,2018-05-12,,C,MCKENZIE,ROAD,,VIC1904,253048124.0,2.0,4


## Export final data to csv

In [19]:
# Write the joined result to disk; index=False omits the dataframe index from the file.
final_data.to_csv(path_or_buf='/users/danielcorcoran/desktop/closest_points.csv',
                  index=False)