### Import Modules

In [1]:
# Shapefile reading and manipulation

import fiona
from shapely.geometry import shape, Point
from shapely import speedups
speedups.enable()

# Data manipulation
import pandas
import numpy

### Set paths
- Shapefiles downloaded from [here](https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1270.0.55.001July%202016?OpenDocument)

In [2]:
# Points file to enrich; it is read as tab-separated text in the next section.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
points_path = "/users/danielcorcoran/desktop/DimAddress.csv"
# SA1 (2016) boundary shapefile covering all of Australia, given relative to the working directory.
shapefile_path = '../1270055001_sa1_2016_aust_shape/SA1_2016_AUST.shp'

### Import data and preview

In [None]:
# If reading data from a csv file
# The file is parsed as tab-separated (sep = "\t") despite its .csv extension.
# NOTE(review): later cells read 'longitude' and 'latitude' columns -- confirm the file provides them.
data = pandas.read_csv(points_path, sep = "\t")
data.head()

In [8]:
# If creating randomly generated data (for testing purposes only)
# Seed the legacy global generator so the sample is reproducible.
numpy.random.seed(0)

row_count = 250

# All longitudes are drawn first, then all latitudes, each as one vectorised call.
# Bounds are always passed as (low, high): numpy documents uniform() with
# high < low as undefined, so the -38..-34 latitude band is sampled as the
# negation of uniform(34, 38), which keeps the seeded sample unchanged.
data = pandas.DataFrame({
    'longitude': numpy.random.uniform(143, 144, size = row_count),
    'latitude': -numpy.random.uniform(34, 38, size = row_count),
})

data.head()

Unnamed: 0,longitude,latitude
0,143.548814,-35.422451
1,143.715189,-37.761728
2,143.602763,-37.061301
3,143.544883,-36.994654
4,143.423655,-37.614879


In [9]:
# Report the (rows, columns) shape of the points table
print('Checking data shape:\n{}'.format(data.shape))

Checking data shape:
(250, 2)


In [11]:
# List the column names; the point-building cell further down reads 'longitude' and 'latitude'
print('Checking data headers:\n{}'.format(list(data.columns)))

Checking data headers:
['longitude', 'latitude']


### Create list of polygons containing Victoria only from SA1 shapefile

In [12]:
# Read in all polygons from the shapefile, it is Australia wide.
# Opening the collection in a `with` block closes the underlying file
# once every feature has been copied into the list.
with fiona.open(shapefile_path) as shapefile:
    all_polygons = list(shapefile)

In [13]:
# Preview the properties of one polygon.
# These keys become the columns added to the points table after the spatial join,
# and one of them is used later to count how many points were matched.
all_polygons[0]['properties']

OrderedDict([('SA1_MAIN16', '10102100701'),
             ('SA1_7DIG16', '1100701'),
             ('SA2_MAIN16', '101021007'),
             ('SA2_5DIG16', '11007'),
             ('SA2_NAME16', 'Braidwood'),
             ('SA3_CODE16', '10102'),
             ('SA3_NAME16', 'Queanbeyan'),
             ('SA4_CODE16', '101'),
             ('SA4_NAME16', 'Capital Region'),
             ('GCC_CODE16', '1RNSW'),
             ('GCC_NAME16', 'Rest of NSW'),
             ('STE_CODE16', '1'),
             ('STE_NAME16', 'New South Wales'),
             ('AREASQKM16', 362.8727)])

In [14]:
# Keep only the features labelled as Victoria that also carry a geometry
vic_polygons = [
    feature
    for feature in all_polygons
    if feature['properties']['STE_NAME16'] == 'Victoria' and feature['geometry'] is not None
]

print("Found {} polygons in shapefile within victoria.".format(len(vic_polygons)))

Found 14069 polygons in shapefile within victoria.


### Build spatial tree

In [15]:
# Build the spatial index
# The index is created with no arguments here; polygon bounding boxes are inserted in the next cell.
from rtree import index
rtree_index = index.Index()

In [16]:
# Simplified copies of the Victorian polygons, kept in the same order as
# vic_polygons so that an rtree id can index either list.
simplified_polygons = []
tolerance = 0.0005

# NOTE(review): re-running this cell without re-creating rtree_index calls
# insert() again for every polygon.
for position, polygon in enumerate(vic_polygons):
    # Build the shapely geometry once; it supplies both the bounding box for
    # the index and the simplified copy.
    shapeobject = shape(polygon['geometry'])
    rtree_index.insert(position, shapeobject.bounds)
    simplified_polygons.append(shapeobject.simplify(tolerance = tolerance))

### Process (with a multiprocessing pool)

In [17]:
# Look up the polygon containing a point and hand back that polygon's properties.
# rtree_index supplies the candidate positions; each candidate is then checked with a
# `within` test against the geometry built from vic_polygons.
# Testing against simplified_polygons[candidate] instead is possible; per the original note this
# trades accuracy for speed, governed by the tolerance variable set above.
def return_properties(point):
    """Return the 'properties' of the first candidate polygon that contains `point`, else None."""
    candidates = rtree_index.intersection(point.coords[0])
    matches = (
        vic_polygons[candidate]['properties']
        for candidate in candidates
        if point.within(shape(vic_polygons[candidate]['geometry']))
    )
    return next(matches, None)

In [19]:
# Pair each row's longitude with its latitude, then wrap every (longitude, latitude) pair in a Point
data['point'] = list(data[['longitude', 'latitude']].itertuples(index = False, name = None))
data['shapely_point'] = data['point'].apply(Point)

In [20]:
# Using multiple processors to compute results
# A pool of 4 worker processes; `p` is used below to map return_properties over the points.
# NOTE(review): the workers must be able to see return_properties, rtree_index and vic_polygons
# as defined in this notebook -- presumably this relies on fork-style process start; confirm on your platform.
from multiprocessing import Pool
p = Pool(4)

In [21]:
%%time

data['results'] = list(p.map(return_properties, data["shapely_point"]))

# Join original data with results series split into a new dataframe
# (one column for each property)
data = pandas.concat([data, data['results'].apply(pandas.Series)], axis = 1)

# Drop irrelevant columns
data.drop(['shapely_point', 'results', 'point'], axis = 1, inplace = True)

CPU times: user 108 ms, sys: 11.1 ms, total: 119 ms
Wall time: 184 ms


In [24]:
# Summarise how many points were matched to a polygon.
# column_header can be any key from the polygon 'properties'; a null value in
# that column marks a point for which no polygon was returned.
column_header = 'SA1_MAIN16'
total_points = len(data[column_header])
points_found = data[column_header].notna().sum()
percentage = points_found / total_points
summary_template = 'Points found {}. Total Points {}. % Points found {:.2%}'
print(summary_template.format(points_found, total_points, percentage))

Points found 176. Total Points 250. % Points found 70.40%


### Preview

In [25]:
# Viewing the first 10 rows of results
# Rows whose property columns are empty are points for which no Victorian polygon was returned.
data.head(10)

Unnamed: 0,longitude,latitude,SA1_MAIN16,SA1_7DIG16,SA2_MAIN16,SA2_5DIG16,SA2_NAME16,SA3_CODE16,SA3_NAME16,SA4_CODE16,SA4_NAME16,GCC_CODE16,GCC_NAME16,STE_CODE16,STE_NAME16,AREASQKM16
0,143.548814,-35.422451,21503140506,2140506,215031405,21405,Swan Hill Region,21503,Murray River - Swan Hill,215,North West,2RVIC,Rest of Vic.,2,Victoria,462.8509
1,143.715189,-37.761728,20103101502,2101502,201031015,21015,Golden Plains - North,20103,Maryborough - Pyrenees,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,140.1720
2,143.602763,-37.061301,20103101707,2101707,201031017,21017,Maryborough Region,20103,Maryborough - Pyrenees,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,237.0042
3,143.544883,-36.994654,20103101305,2101305,201031013,21013,Avoca,20103,Maryborough - Pyrenees,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,330.8280
4,143.423655,-37.614879,20103101410,2101410,201031014,21014,Beaufort,20103,Maryborough - Pyrenees,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,245.8686
5,143.645894,-34.333690,,,,,,,,,,,,,,
6,143.437587,-36.208770,21503140009,2140009,215031400,21400,Buloke,21503,Murray River - Swan Hill,215,North West,2RVIC,Rest of Vic.,2,Victoria,824.4143
7,143.891773,-36.337904,20203103305,2103305,202031033,21033,Loddon,20203,Loddon - Elmore,202,Bendigo,2RVIC,Rest of Vic.,2,Victoria,455.7779
8,143.963663,-37.847746,20301103510,2103510,203011035,21035,Golden Plains - South,20301,Barwon - West,203,Geelong,2RVIC,Rest of Vic.,2,Victoria,208.8590
9,143.383442,-35.168590,,,,,,,,,,,,,,


### Export

In [26]:
# Export to the parent of the current working directory, without writing the row index
data.to_csv('../spatial_join_results.csv', index = False)