In [192]:
import shapefile
import json
import pandas as pd
import os
import zipfile
import codecs
import io

from bokeh.plotting import figure, output_file, show, hplot
from bokeh.models import Rect, HoverTool, ColumnDataSource

from collections import OrderedDict


In [359]:
__author__ = 'James Macdonald'
__copyright__ = 'Copyright 2015, James Macdonald'
__license__ = 'GPL'
__version__ = '0.1'
__maintainer__ = 'James Macdonald'
__email__ = 'maacca@gmail.com'
__thanks__ = ['Lee Rhiannon','Freya Newman','Denise Abou Hamad']

In [148]:
def shapes_from_zip(shapezip):
    '''Reads shapefiles packaged in zip'''
    zipshape = zipfile.ZipFile(shapezip)
    ishp = [f for f in zipshape.namelist() if '.shp' in f][0]
    ishx = [f for f in zipshape.namelist() if '.shx' in f][0]
    idbf = [f for f in zipshape.namelist() if '.dbf' in f][0]
    partdic = {}
    for part in [ishp,ishx,idbf]:
        partdic[part] = io.BytesIO(zipshape.read(part))
    return shapefile.Reader(shp=partdic[ishp],shx=partdic[ishx],dbf=partdic[idbf])

def data_from_zip(datazip):
    '''Extracts a dataframe from a zipfile assuming csv format'''
    zipdata = zipfile.ZipFile(datazip)
    df_dic = {}
    for pos,title in enumerate(zipdata.namelist()):
        df_dic[pos] = pd.read_csv(io.BytesIO(zipdata.read(title)), skiprows=3)
        df_dic[pos] = df_dic[pos][[col for col in df_dic[pos].columns.values if 'Unnamed:' in col]]
    return df_dic



In [362]:
# Extract school data as dummy data 
def get_test_data():
    df = pd.read_csv('Data/School_Completions_By_Postcode.csv',skiprows=3)
    df = df[[col for col in df.columns.values if 'Unnamed' not in col]]
    df = df[pd.notnull(df['Total'])]
    df['postcode'] = df['Highest Year of School Completed (HSCP)'].str.split(", ").apply(lambda x: x[0])
    df = df[df['postcode'].apply(lambda x: x.isnumeric())]
    df['State'] = df['Highest Year of School Completed (HSCP)'].str.split(", ").apply(lambda x: x[1])
    return df

df = get_test_data()


In [181]:
def shapefile_to_dataframe(processed_shapefile):
    '''Return a pandas dataframe from a shapefile object
    
    Three columns returned
    df['lat'] - the latitude of each point of the shape
    df['lng'] - the longitude of each point of the shape
    df['name'] - the name of the shape
    '''
    df = pd.DataFrame()
    shape_records = processed_shapefile.shapeRecords()
    shapes = [i.shape for i in shape_records]
    records = [i.record for i in shape_records]
    if str(records[0][1]) == '105051100':
        record_pos = 2
    else:
        record_pos = 1
    #shapes = [i.shape for i in shape_records]
    #shapes = pd.Series(shapes)
    locs = []
    rec = []
    for shapeObj,record in zip(shapes, records):
        points = []
        try:
            num_parts = len(shapeObj.parts)
        except:
            continue
        end = len(shapeObj.points) - 1
        segments = list(shapeObj.parts) + [end]
        for i in range(num_parts):
            points.append(shapeObj.points[ segments[i]:segments[i+1]])
        for point in points:
            locs.append(point)
            for_rec = record[record_pos]
            rec.append(for_rec)
    lat = []
    lng = []
    for loc in locs:
        lat.append([pair[0] for pair in loc])
        lng.append([pair[1] for pair in loc])
    df['lat'] = lat
    df['lng'] = lng
    df['name'] = rec
    return(df)



In [194]:
def plot_from_df(output_file_name, df, size):
    '''Creates an html file with a choropleth map in a selected region

    Variables: 
    output_file_name - the location of the file to be saved
    variable - numeric variable against which to shade the postcodes
    scope - determines how large an area is covered by the generated map
    geo_filter - if a scope is given, this specifies the area
    size = how big in pixels the resulting map should be
    
    '''
    data_source = ColumnDataSource (
    data = dict(
            lat_list=df['lat'],
            lng_list=df['lng'],
            #color=colors['hex'],
            name=df['name'],
            #data=colors[variable],
            #suburbs=colors['suburb'],
            #state=colors['state']
    )
    )
    output_file(output_file_name)
    #vaariable = '$'+variable
    TOOLS="pan,wheel_zoom,box_zoom,reset,hover,save"
    #if geo_filter == 'None':
    #    title_str = 'Australia'
    #else:
    #    title_str = geo_filter
    p = figure(#title=variable.capitalize()+" in "+title_str.capitalize()+" by Postcode", 
        plot_width = size, plot_height=size, tools=TOOLS)
    #p.xgrid.grid_line_color = None
    #p.ygrid.grid_line_color = None
    p.patches('lat_list',
              'lng_list',
              fill_color = 'black', fill_alpha=0.7,
              line_color = 'green', line_width = 0.5,
             source = data_source)
    hover = p.select(dict(type=HoverTool))
    hover.point_policy = "follow_mouse"
    hover.tooltips = OrderedDict([
            ("Name", "@name"),
            #(variable, '@data'),
            #("Suburbs",'@suburbs'),
            #("State",'@state'),
    ])
    show(p)
    
    

In [352]:
def get_SA1_lookup(processed_shapefile):
    sa1_lookup = pd.DataFrame()
    a = pd.Series(processed_shapefile.records())
    for i in range(len(a[0])):
        sa1_lookup[i] = a.apply(lambda x: x[i])
    sa1_lookup.columns = ['SA1 Code','SA1 Suffix','SA1 Prefix','SA2 Code','SA2','SA3 Code','SA3','SA4 Code','SA4',
               'StatePart Code','StatePart','State Code','State','Number']
    return sa1_lookup


In [443]:
def sa1_shapefile_to_dataframe(processed_shapefile, all_levels=True, get_electorates=True):
    '''Read the shapefile relating to SA1 statistical areas'''
    df = pd.DataFrame()
    shape_records = processed_shapefile.shapeRecords()
    shapes = [i.shape for i in shape_records]
    records = [i.record for i in shape_records]
    locs = []
    rec = []
    state=[]
    for shapeObj,record in zip(shapes, records):
        points = []
        try:
            num_parts = len(shapeObj.parts)
        except:
            continue
        end = len(shapeObj.points) - 1
        segments = list(shapeObj.parts) + [end]
        for i in range(num_parts):
            points.append(shapeObj.points[ segments[i]:segments[i+1]])
        for point in points:
            locs.append(point)
            for_rec = record[0]
            for_state = record[12]
            rec.append(for_rec)
            state.append(for_state)
    lat = []
    lng = []
    for loc in locs:
        lat.append([pair[0] for pair in loc])
        lng.append([pair[1] for pair in loc])
    df['lat'] = lat
    df['lng'] = lng
    df['name'] = rec
    df['state'] = state
    
    
    #extract all the layers of the record and merge back into the dataframe
    if all_levels == True:
        df = df.merge(get_SA1_lookup(processed_shapefile).drop_duplicates(), how="left", left_on='name', right_on='SA1 Code')
    # read in the postcode and electorate files to build out the frame
    sa1_pc = pd.read_csv('Data/POA_2011_AUST.csv')
    sa1_pc['merger'] = sa1_pc['SA1_MAINCODE_2011'].apply(lambda x: str(x))
    sa1_pc = sa1_pc.drop_duplicates()
    if get_electorates == True:
        pc_2_elec = pd.read_csv('Data/pc_2_electorate.csv')
        pc_2_elec['postcode'] = pc_2_elec['postcode'].astype(str).str.zfill(4)
        pc_2_elec = pc_2_elec.drop_duplicates()
        sa1_pc = pd.merge(pc_2_elec, sa1_pc,how="left", left_on="postcode", right_on='merger').drop_duplicates()
        #df = df.merge(pc_2_elec[['postcode','Electorate']], left_on="POA_NAME_2011",right_on='postcode')
        df = df.merge(sa1_pc, how="left", left_on='name', right_on='merger')
    return df



In [311]:
def set_hue(cdf,selected_var, cmaptype = matplotlib.cm.GnBu):
    cdf = cdf[[selected_var]]
    colours = cmaptype(cdf[
            selected_var
        ]/cdf[
            selected_var
        ].max())
    for pos,val in enumerate(['R','G','B','Int']):
        cdf[val] = colours[:,pos]
    cdf['rgb'] = list(zip(cdf['R'],cdf['G'],cdf['B'],cdf['Int']))
    cdf['hex'] = cdf['rgb'].apply(lambda x: matplotlib.colors.rgb2hex(x))
    return cdf

In [447]:
def plot_sa1_from_df(output_file_name, df, variable,
                     sa1_df = sa1_frame, scope='Electorate',
                     geo_filter=['Grayndler'], size=1000, icmap=matplotlib.cm.GnBu):
    '''Creates an html file with a choropleth map in a selected region

    Variables: 
    output_file_name - the location of the file to be saved
    variable - numeric variable against which to shade the postcodes
    scope - determines how large an area is covered by the generated map
    geo_filter - if a scope is given, this specifies the area
    size = how big in pixels the resulting map should be
    
    '''
    #Trim the geodata to only those SA1s within the area specified
    if type(geo_filter)==list:
        sa1_df = sa1_df[sa1_df[scope].isin(geo_filter)]
    else:
        sa1_df = sa1_df[sa1_df[scope] == geo_filter]
    
    #trim the postcodes to only those in scope, then get rid of Lord Howe Island because it's so remote
    df = df[df['postcode'].isin(sa1_df['postcode'])]
    df = df.merge(set_hue(df,variable,cmaptype=icmap),how="left", on=variable)
    df = df.drop_duplicates()
    df = pd.merge(sa1_df, df, how="left", on="postcode")
    df = df[df['postcode'] != '2898']
    df['lat'] = df['lat'].apply(lambda x: repr(x))
    df['lng'] = df['lng'].apply(lambda x: repr(x))
    df = df.drop_duplicates()
    df['lat'] = df['lat'].apply(lambda x: literal_eval(x))
    df['lng'] = df['lng'].apply(lambda x: literal_eval(x))
    
    data_source = ColumnDataSource (
    data = dict(
            lat_list=df['lat'],
            lng_list=df['lng'],
            color=df['hex'],
            name=df['name'],
            data=df[variable],
            postcode=df['postcode'],
            electorate=df['Electorate'],
            #suburbs=colors['suburb'],
            #state=colors['state']
    )
    )
    output_file(output_file_name)
    #vaariable = '$'+variable
    TOOLS="pan,wheel_zoom,box_zoom,reset,hover,save"
    #if geo_filter == 'None':
    #    title_str = 'Australia'
    #else:
    #    title_str = geo_filter
    p = figure(#title=variable.capitalize()+" in "+title_str.capitalize()+" by Postcode", 
        plot_width = size, plot_height=size, tools=TOOLS)
    #p.xgrid.grid_line_color = None
    #p.ygrid.grid_line_color = None
    p.patches('lat_list',
              'lng_list',
              fill_color = 'color', fill_alpha=0.7,
              line_color = 'green', line_width = 0.5,
             source = data_source)
    hover = p.select(dict(type=HoverTool))
    hover.point_policy = "follow_mouse"
    hover.tooltips = OrderedDict([
            ("Name", "@name"),
            (variable, '@data'),
            ('Postcode', "@postcode"),
            ('Electorate',"@electorate")
            #("Suburbs",'@suburbs'),
            #("State",'@state'),
    ])
    legend=figure(width=100,height=size)
    legend.toolbar_location=None
    legend.rect(x=0.5,y=df[variable],color=df['hex'],height=10,width=10)
    legend.xaxis.visible = None
    legend.xgrid.grid_line_color = None
    legend.ygrid.grid_line_color = None
    layout = hplot(p,legend)
    show(layout)
    return(df)


In [446]:
if __name__ == "__main__":
    start = timeit.default_timer()
    sa1_frame = sa1_shapefile_to_dataframe(shapes_from_zip('Shapefiles/1270055001_sa1_2011_aust_shape.zip'))
    #Plotting example - Lord Howe Island excluded
    adf = plot_sa1_from_df('Grayndler_test.html', df, 'Did not go to school', 
                 sa1_df = example, scope='Electorate', geo_filter=['Sydney','Barton','Watson','Reid','Grayndler','Wentworth','Kingsford Smith'],
                 size=1000, icmap=matplotlib.cm.GnBu)
    stop = timeit.default_timer()
    print(stop-start)

62232
62232
62232
81

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



103
81
9862
54.518479819002096


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
'''
#Test filtering across 

codefilter = sa1_lookup[sa1_lookup['SA3'].str.contains("Eastern Sub")]['SA1 Suffix']
plot_from_df('sa1_test.html',example[example['name'].isin(codefilter)],1000)
'''

In [288]:
'''
#Merge the SA1 Lookups into the dataframe to retrieve the postcode and electorate information

sa1_pc = pd.read_csv('Data/POA_2011_AUST.csv')
sa1_pc['col_for_merge'] = sa1_pc['SA1_MAINCODE_2011'].apply(lambda x: str(x))

example = example.merge(sa1_pc, how="left", left_on='name', right_on='col_for_merge')

pc_2_elec = pd.read_csv('Data/pc_2_electorate.csv')
pc_2_elec['postcode'] = pc_2_elec['postcode'].astype(str).str.zfill(4)

example = example.merge(pc_2_elec[['postcode','Electorate']], left_on="POA_NAME_2011",right_on='postcode')'''

In [191]:
'''
#example = shapes_from_zip('Shapefiles/2011_POA_shape.zip')
#example = shapes_from_zip('Shapefiles/1259030001_sla11aaust_shape.zip')
#example = shapes_from_zip('Shapefiles/1259030001_ste11aaust_shape.zip')
#adf = shapefile_to_dataframe(example)
'''