# Calculate closest points

## Notes:
- Takes in two datasets containing longitude and latitude
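- For each record in dataset A, finds the closest point in dataset B and appends that point's data to the record. "Closest" means the smallest Euclidean distance computed directly on the longitude/latitude values (in degrees), not a geodesic distance.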
- Exports resulting dataframe.
- Uses SciPy's `KDTree` and a worker pool from `multiprocessing` to speed up run time.

## Setup

### Import relevant modules

In [1]:
import pandas
import datetime
import numpy
import math
from scipy import spatial
import random
from multiprocessing import Pool

  return f(*args, **kwds)
  return f(*args, **kwds)


### Import / Create Test Data

In [2]:
# Seed NumPy's global generator so the synthetic data is reproducible.
numpy.random.seed(0)

# Number of rows in the base and comparison datasets respectively.
row_count = 2000
row_count2 = 10000

# Both frames hold uniformly distributed points with longitude between 140
# and 150 and latitude between -40 and -32. Each column is drawn in one
# vectorised call instead of one Python-level call per row.
#
# The latitude magnitude is drawn with low < high and then negated, because
# numpy.random.uniform documents its result as undefined when high < low.
# Draw order (longitude first, then latitude) is kept so the seeded stream is
# consumed in the same sequence as before.
base_data = pandas.DataFrame({
    'longitude': numpy.random.uniform(140, 150, size=row_count),
    'latitude': -numpy.random.uniform(32, 40, size=row_count),
})

comparison_data = pandas.DataFrame({
    'longitude': numpy.random.uniform(140, 150, size=row_count2),
    'latitude': -numpy.random.uniform(32, 40, size=row_count2),
})

In [3]:
# Preview the first rows of the base dataset.
base_data.head()

Unnamed: 0,longitude,latitude
0,145.488135,-38.492148
1,147.151894,-35.808672
2,146.027634,-36.185248
3,145.448832,-34.004165
4,144.236548,-36.840344


In [4]:
# Preview the first rows of the comparison dataset.
comparison_data.head()

Unnamed: 0,longitude,latitude
0,142.92642,-34.56476
1,145.665183,-33.998935
2,141.374144,-32.248582
3,143.497122,-38.831382
4,140.532164,-32.594086


## Process 

### Create comparison points list

In [5]:
# Pair every row's coordinates into a (longitude, latitude) tuple and store
# the tuples in a new 'geopoint' column on each frame.
comparison_data['geopoint'] = list(
    comparison_data[['longitude', 'latitude']].itertuples(index=False, name=None)
)
base_data['geopoint'] = list(
    base_data[['longitude', 'latitude']].itertuples(index=False, name=None)
)

### Create Spatial KDTree Object

In [6]:
import sys
# Raise the interpreter's recursion limit to 10000 before the tree is built.
# NOTE(review): presumably needed because KDTree construction/querying recurses
# deeply on this data — confirm it is still required with the installed SciPy.
sys.setrecursionlimit(10000)

In [7]:
# Index every comparison (longitude, latitude) tuple in a KD-tree so that
# nearest-neighbour lookups can be answered by querying the tree.
tree = spatial.KDTree(comparison_data['geopoint'].tolist())

### Helper function to return shortest distance and index using KDTree defined above

In [8]:
def return_distance_index_closest_point(point):
    """Look up the nearest comparison point to ``point`` in the module-level ``tree``.

    Parameters
    ----------
    point
        The coordinates to search for, in the same (longitude, latitude)
        form as the points the tree was built from.

    Returns
    -------
    The result of ``tree.query(point)``: the distance to the nearest
    comparison point and that point's position in the tree's input data.
    """
    return tree.query(point)

### Calculate results

In [9]:
%%time

# Fan the nearest-neighbour lookups out over 6 worker processes.
# The pool is used as a context manager so the workers are always shut down,
# even if a lookup raises; previously the pool was never closed and its
# processes lingered. pool.map blocks until every result is available and
# already returns a list (in input order), so leaving the block is safe.
with Pool(6) as pool:
    results = pool.map(return_distance_index_closest_point, base_data['geopoint'])

# The "Iterations" figure is the number of base/comparison pairs, i.e. the
# size of the equivalent brute-force comparison.
print("Checked {:,} base points against {:,} comparison points. {:,} Iterations".format(base_data.shape[0],
                                 comparison_data.shape[0],
                                 base_data.shape[0] * comparison_data.shape[0]))

Checked 2,000 base points against 10,000 comparison points. 20,000,000 Iterations
CPU times: user 18.1 ms, sys: 22.2 ms, total: 40.3 ms
Wall time: 115 ms


### Convert results into pandas dataframe and preview results

In [10]:
# Each entry of `results` is a (distance, index) pair; lay them out as two
# named columns, one row per base point.
result_columns = ['euclidean_distance', 'comparison_index']
results_data = pandas.DataFrame(data=results, columns=result_columns)

# Copy the row index into a regular column so it can be used as a join key.
results_data['base_index'] = results_data.index

## Join base_data, results_data and comparison_data

### Create index columns (used to join tables)

In [11]:
# Copy each frame's row index into a regular column; these columns are the
# keys used to join the tables below.
base_data['base_index'] = base_data.index
# A stray backtick inside the name `comparison_data` made this line a
# syntax error; it has been removed.
comparison_data['comparison_index'] = comparison_data.index

### Join base_data to results_data 

In [12]:
# Left-join the nearest-neighbour results onto the base points via
# 'base_index'. The helper 'geopoint' tuple column is dropped first.
base_without_geopoint = base_data.drop(columns='geopoint')
base_results_data = base_without_geopoint.merge(results_data,
                                                how="left",
                                                on='base_index')

### Join resulting table above to comparison_data

In [13]:
# Left-join the matched comparison rows onto the base/results table via
# 'comparison_index'. Column names present in both tables get the '_base' /
# '_comparison' suffixes so the two sets of coordinates can be told apart.
comparison_without_geopoint = comparison_data.drop(columns='geopoint')
final_data = base_results_data.merge(comparison_without_geopoint,
                                     how="left",
                                     on="comparison_index",
                                     suffixes=['_base', '_comparison'])

In [14]:
# Preview the first rows of the joined result.
final_data.head()

Unnamed: 0,longitude_base,latitude_base,base_index,euclidean_distance,comparison_index,longitude_comparison,latitude_comparison
0,145.488135,-38.492148,0,0.105521,8495,145.38374,-38.476771
1,147.151894,-35.808672,1,0.045165,8453,147.16863,-35.850621
2,146.027634,-36.185248,2,0.040061,3950,146.021346,-36.224812
3,145.448832,-34.004165,3,0.027558,6073,145.476111,-34.008075
4,144.236548,-36.840344,4,0.058634,6777,144.29518,-36.84073


## Export final data to csv

In [15]:
# Write the joined table to CSV without the row index.
# NOTE(review): the destination is a hard-coded absolute path on one user's
# machine — change it before running elsewhere.
final_data.to_csv('/users/danielcorcoran/desktop/closest_points.csv', index = False)