# DATA2001 2021 Assignment
*An analysis of neighbourhood fire risk and median income & rent.*



In [1]:
import pandas as pd
import geopandas as gpd
import os
import numpy as np
import json

from shapely.geometry import Point, Polygon, MultiPolygon
from geopandas import GeoSeries, GeoDataFrame
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import *
from sqlalchemy import create_engine
import psycopg2
import psycopg2.extras

import matplotlib.pyplot as plt

In [2]:
# ------ Internal use MATTHEW
#For troubleshooting in the case your data isn't loading.
#os.chdir('M:\\Jupyter Notebooks\\data2001_project')
#os.getcwd()

## Task 1: Data Integration and Database Generation

### Loading datasets (standard dataframes)

In [2]:
# Statistical areas lookup table; preview the first two rows.
stat_areas_df = pd.read_csv(filepath_or_buffer='./data/StatisticalAreas.csv')
stat_areas_df.iloc[:2]

Unnamed: 0,area_id,area_name,parent_area_id
0,1,New South Wales,0
1,10,Greater Sydney,1


In [27]:
# Per-neighbourhood statistics; preview the first row.
nbhd_df = pd.read_csv(filepath_or_buffer='./data/Neighbourhoods.csv')
nbhd_df.iloc[:1]

Unnamed: 0,area_id,area_name,land_area,population,number_of_dwellings,number_of_businesses,median_annual_household_income,avg_monthly_rent
0,102011028,Avoca Beach - Copacabana,643.8,7590,2325,738.0,46996.0,1906.0


In [37]:
# Number of neighbourhood rows loaded.
nbhd_df.shape[0]

322

In [3]:
# Business counts per area; preview the first row.
busi_stat_df = pd.read_csv(filepath_or_buffer='./data/BusinessStats.csv')
busi_stat_df.iloc[:1]

Unnamed: 0,area_id,area_name,number_of_businesses,accommodation_and_food_services,retail_trade,agriculture_forestry_and_fishing,health_care_and_social_assistance,public_administration_and_safety,transport_postal_and_warehousing
0,101021007,Braidwood,629,26,27,280,11,0,35


### Loading datasets (geodataframes)
#### RFS NSW Bushfire Prone Land - shapefile

In [43]:
# Load the RFS bushfire-prone-land shapefile.
rfs_gdf = gpd.read_file('./data/RFSNSW_BFPL/RFSNSW_BFPL.shp')
# Check EPSG / CRS -- 4283 = GDA94
print(rfs_gdf.crs)
# Normalise column names to lower case.
rfs_gdf.columns = list(map(str.lower, rfs_gdf.columns))
rfs_gdf.iloc[:1]

epsg:4283


Unnamed: 0,category,shape_leng,shape_area,geometry
0,1,1.7e-05,5.3924e-12,POINT (149.11319 -33.05824)


In [16]:
# Row count, then a tally of geometry types present in the layer.
print(rfs_gdf.shape[0])
rfs_gdf.geometry.geom_type.value_counts()

516633


Point    516633
dtype: int64

In [44]:
# Recreate incrementing 'gid' (0 index) as the first column, taken from the frame's index.
# DataFrame.insert raises ValueError if the column already exists, so guard it
# to keep this cell safe to re-run.
if 'gid' not in rfs_gdf.columns:
    rfs_gdf.insert(loc=0, column='gid', value=rfs_gdf.index)
rfs_gdf.head()

Unnamed: 0,gid,category,shape_leng,shape_area,geometry
0,0,1,1.7e-05,5.3924e-12,POINT (149.11319 -33.05824)
1,1,1,0.000178,1.140005e-09,POINT (152.27536 -29.68316)
2,2,1,0.00089,4.950178e-08,POINT (152.14244 -29.68266)
3,3,1,0.000442,8.094091e-09,POINT (152.27579 -29.68259)
4,4,1,0.00089,4.950155e-08,POINT (151.99619 -29.68131)


#### ABS Statistical Area 2 (2016) - shapefile

In [6]:
# Load the ABS SA2 (2016) boundaries shapefile.
sa2_gdf = gpd.read_file('./data/1270055001_sa2_2016_aust_shape/SA2_2016_AUST.shp')
# Check EPSG / CRS -- 4283 = GDA94
print(sa2_gdf.crs)
# Normalise column names to lower case.
sa2_gdf.columns = list(map(str.lower, sa2_gdf.columns))
sa2_gdf.iloc[:1]

epsg:4283


Unnamed: 0,sa2_main16,sa2_5dig16,sa2_name16,sa3_code16,sa3_name16,sa4_code16,sa4_name16,gcc_code16,gcc_name16,ste_code16,ste_name16,areasqkm16,geometry
0,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,3418.3525,"POLYGON ((149.58423 -35.44427, 149.58444 -35.4..."


In [7]:
# Recreate incrementing 'g_id' (0 index) as the first column, taken from the frame's index.
# DataFrame.insert raises ValueError if the column already exists, so guard it
# to keep this cell safe to re-run.
if 'g_id' not in sa2_gdf.columns:
    sa2_gdf.insert(loc=0, column='g_id', value=sa2_gdf.index)
sa2_gdf.head()

Unnamed: 0,g_id,sa2_main16,sa2_5dig16,sa2_name16,sa3_code16,sa3_name16,sa4_code16,sa4_name16,gcc_code16,gcc_name16,ste_code16,ste_name16,areasqkm16,geometry
0,0,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,3418.3525,"POLYGON ((149.58423 -35.44427, 149.58444 -35.4..."
1,1,101021008,11008,Karabar,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,6.9825,"POLYGON ((149.21898 -35.36739, 149.21799 -35.3..."
2,2,101021009,11009,Queanbeyan,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,4.7634,"POLYGON ((149.21325 -35.34325, 149.21619 -35.3..."
3,3,101021010,11010,Queanbeyan - East,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,13.0034,"POLYGON ((149.24033 -35.34782, 149.24023 -35.3..."
4,4,101021011,11011,Queanbeyan Region,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,3054.4099,"POLYGON ((149.23580 -35.38738, 149.23771 -35.3..."


In [15]:
# Row count, geometry-type tally, and the number of rows with no geometry at all.
print(sa2_gdf.shape[0])
print(sa2_gdf.geometry.geom_type.value_counts())
no_geoms_sa2 = sa2_gdf.geometry.geom_type.isna().sum()
print(f'Null geometry count: {no_geoms_sa2}')

2310
Polygon         2083
MultiPolygon     209
dtype: int64
Null geometry count: 18


In [19]:
# Inspection / preliminary exploration
# Rows per state/territory: confirms this is a federal-level dataset.
print(sa2_gdf['ste_name16'].value_counts())
print("\n")
# Rows per GCCSA within NSW: confirms the Greater Sydney GCCSA is present.
print(sa2_gdf.loc[sa2_gdf['ste_name16'] == "New South Wales", 'gcc_name16'].value_counts())

New South Wales                 578
Queensland                      530
Victoria                        464
Western Australia               254
South Australia                 174
Australian Capital Territory    133
Tasmania                        101
Northern Territory               70
Other Territories                 6
Name: ste_name16, dtype: int64


Greater Sydney                           312
Rest of NSW                              264
Migratory - Offshore - Shipping (NSW)      1
No usual address (NSW)                     1
Name: gcc_name16, dtype: int64


### Connecting with Database and Creation of Tables

In [8]:
### ---------------ATTN MARKER - RUN THIS---------------
# ASSUME THIS WORKS MOST GENERALLY E.g. FOR RUNNING IN USYD SERVER

# Function for accessing Postgres DB (SOURCE: DATA2001 Lab materials) - Eugene

def pgconnect_using_credfile(credential_filepath, connect_args=None):
    """Open a SQLAlchemy engine and connection from a JSON credentials file.

    Parameters
    ----------
    credential_filepath : str
        Path to a JSON file containing the keys 'user', 'password', 'host'
        and 'database'.
    connect_args : dict, optional
        Extra DBAPI keyword arguments forwarded to ``create_engine`` (for
        example ``{'sslmode': 'disable'}``). Defaults to no extra arguments.

    Returns
    -------
    tuple or None
        ``(db, conn)`` -- the engine and an open connection -- on success.
        ``None`` if any step fails (unreadable file, invalid JSON, missing
        key, connection error); the exception is printed, not raised.
    """
    try:
        with open(credential_filepath) as f:
            db_conn_dict = json.load(f)
        # 'postgresql' is the dialect name; the legacy 'postgres' alias is
        # rejected by SQLAlchemy 1.4 and later.
        connstring = 'postgresql+psycopg2://'+db_conn_dict['user']+':'+db_conn_dict['password']+'@'+db_conn_dict['host']+'/'+db_conn_dict['database']
        db = create_engine(connstring, echo=False, connect_args=connect_args or {})
        conn = db.connect()
        print('connected')
    except Exception as e:
        print("unable to connect to the database")
        print(e)
        return None
    return db,conn

print("function setup successful.")

function setup successful.


In [9]:
### ---------------ATTN MARKER - DO *NOT* RUN---------------
# Alternative Function for accessing Postgres DB (SOURCE: DATA2001 Lab materials) - Matthew
# JUST FOR MATTHEW

def pgconnect_using_credfile(credential_filepath):
    """Connect to Postgres with SSL and GSS encryption disabled.

    Reads 'user', 'password', 'host' and 'database' from the JSON file at
    ``credential_filepath``. Returns ``(db, conn)`` on success; on any error
    prints the exception and returns ``None``.

    NOTE: this has the same name as the definition in the previous cell and
    replaces it when this cell is run.
    """
    try:
        connect_options = {
            'sslmode':'disable',
            'gssencmode':'disable'
        }
        with open(credential_filepath) as cred_file:
            creds = json.load(cred_file)
        url = (
            'postgresql+psycopg2://'
            + creds['user'] + ':' + creds['password']
            + '@' + creds['host'] + '/' + creds['database']
        )
        db = create_engine(url, echo=False, connect_args=connect_options)
        conn = db.connect()
        print('connected')
    except Exception as err:
        print("unable to connect to the database")
        print(err)
        return None
    else:
        return db, conn

In [9]:
# Connect to University server student Postgres DB

credfilepath = './data2x01_db.json' # Internal note: not tracked on Git, must be locally available. 
# Eugene's JSON to be uploaded in submission

# pgconnect_using_credfile returns None on failure. Check before unpacking so the
# cell stops with a clear message rather than a "cannot unpack NoneType" TypeError.
connection = pgconnect_using_credfile(credfilepath)
if connection is None:
    raise RuntimeError(f"Could not connect to the database using credentials in {credfilepath}")
db, conn = connection

connected


In [10]:
# Function for querying the PostgreSQL DB.
# Returns value and a converted dataframe (SOURCE: DATA2001 Lab materials)

def pgquery( conn, sqlcmd, args=None, silent=False ):
    """Run a SQL query and return ``(success, dataframe)``.

    ``args`` (optional) is passed to pandas as the query parameters.
    Unless ``silent`` is set, the result's shape and full contents are
    printed; on failure the error is printed instead. Exceptions are
    caught here: the caller gets ``False`` and whatever dataframe had been
    produced so far (an empty one if the read itself failed).
    """
    verbose = (silent == False)
    succeeded = False
    result = pd.DataFrame()
    try:
        read_kwargs = {} if args is None else {'params': args}
        result = pd.read_sql_query(sqlcmd, conn, **read_kwargs)
        if verbose:
            print(result.shape)
            print(result.to_string())
        succeeded = True
    except Exception as err:
        if verbose:
            print("db read error: ")
            print(err)
    return succeeded, result

print("function setup successful.")

function setup successful.


In [11]:
# Confirm PostGIS is available on this connection by asking for its version
# (SOURCE: DATA2001 Lab materials)

postgis_check = '''
SELECT PostGIS_Version();
'''

retval, retdf = pgquery(conn=conn, sqlcmd=postgis_check)
retdf

(1, 1)
                         postgis_version
0  3.0 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


Unnamed: 0,postgis_version
0,3.0 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


### Creation of Database Tables (from dataframes)

In [25]:
# Check existing tables in Postgres DB public schema.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0; the
# inspector API works across versions. `inspect` here is sqlalchemy.inspect,
# in scope via the notebook's `from sqlalchemy import *`.
print(inspect(db).get_table_names())

['spatial_ref_sys', 'statisticalareas', 'neighbourhoods', 'businessstats', 'sa2_2016_aust', 'rfsnsw_bfpl']


#### Statistical Areas

In [36]:
# DDL for the statistical areas table; area_id is the primary key.
stat_areas_create = '''CREATE TABLE statisticalareas (
                     area_id VARCHAR(9) PRIMARY KEY,
                     area_name VARCHAR(100),
                     parent_area_id VARCHAR(9)
                     )'''

# Drop any existing copy first so the cell can be re-run, then create the empty table.
conn.execute("DROP TABLE IF EXISTS statisticalareas")
conn.execute(stat_areas_create)

<sqlalchemy.engine.result.ResultProxy at 0x12ae5b8d0>

In [37]:
# Insert dataset
# NOTE(review): with if_exists='replace' pandas drops the existing table before
# inserting, so the table created with an explicit schema in the previous cell is
# presumably replaced by one with pandas-inferred column types and no PRIMARY KEY
# (area_id comes back as a number in the check below). The geodataframe loads use
# 'append' instead -- confirm which behaviour is intended before changing this.
stat_areas_df.to_sql('statisticalareas', con = conn, if_exists = 'replace', index=False)
print('Data inserted into Table')

# Check table: read one row back to confirm the load.
a_response, a_df = pgquery(conn, """SELECT * FROM statisticalareas
LIMIT 1;""")
a_df.head()

Data inserted into Table
(1, 3)
   area_id        area_name  parent_area_id
0        1  New South Wales               0


Unnamed: 0,area_id,area_name,parent_area_id
0,1,New South Wales,0


#### Neighbourhoods

In [38]:
# DDL for the neighbourhoods table; area_id is the primary key.
neighbourhoods_create = '''CREATE TABLE neighbourhoods (
                     area_id CHAR(9) PRIMARY KEY,
                     area_name VARCHAR(100),
                     land_area FLOAT,
                     population NUMERIC,
                     number_of_dwellings NUMERIC,
                     number_of_businesses NUMERIC,
                     median_annual_household_income NUMERIC,
                     avg_monthly_rent NUMERIC
                     )'''

# Drop any existing copy first so the cell can be re-run, then create the empty table.
conn.execute("DROP TABLE IF EXISTS neighbourhoods")
conn.execute(neighbourhoods_create)

<sqlalchemy.engine.result.ResultProxy at 0x12ae503d0>

In [39]:
# Insert dataset
# NOTE(review): with if_exists='replace' pandas drops the existing table before
# inserting, so the table created with an explicit schema in the previous cell is
# presumably replaced by one with pandas-inferred column types and no PRIMARY KEY.
# The geodataframe loads use 'append' instead -- confirm which behaviour is
# intended, and that every column fits the declared types, before changing this.
nbhd_df.to_sql('neighbourhoods', con = conn, if_exists = 'replace', index=False)
print('Data inserted into Table')

# Check table: read one row back to confirm the load.
a_response, a_df = pgquery(conn, """SELECT * FROM neighbourhoods
LIMIT 1;""")
a_df.head()

Data inserted into Table
(1, 8)
     area_id                 area_name  land_area population number_of_dwellings  number_of_businesses  median_annual_household_income  avg_monthly_rent
0  102011028  Avoca Beach - Copacabana      643.8       7590                2325                 738.0                         46996.0            1906.0


Unnamed: 0,area_id,area_name,land_area,population,number_of_dwellings,number_of_businesses,median_annual_household_income,avg_monthly_rent
0,102011028,Avoca Beach - Copacabana,643.8,7590,2325,738.0,46996.0,1906.0


#### Business Stats

In [40]:
# DDL for the business statistics table; area_id is the primary key.
business_create = '''CREATE TABLE businessstats (
                     area_id CHAR(9) PRIMARY KEY,
                     area_name VARCHAR(100),
                     number_of_businesses NUMERIC,
                     accommodation_and_food_services NUMERIC,
                     retail_trade NUMERIC,
                     agriculture_forestry_and_fishing NUMERIC,
                     health_care_and_social_assistance NUMERIC,
                     public_administration_and_safety NUMERIC,
                     transport_postal_and_warehousing NUMERIC
                     )'''

# Drop any existing copy first so the cell can be re-run, then create the empty table.
conn.execute("DROP TABLE IF EXISTS businessstats")
conn.execute(business_create)

<sqlalchemy.engine.result.ResultProxy at 0x12ae5e3d0>

In [41]:
# Insert dataset
# NOTE(review): with if_exists='replace' pandas drops the existing table before
# inserting, so the table created with an explicit schema in the previous cell is
# presumably replaced by one with pandas-inferred column types and no PRIMARY KEY.
# The geodataframe loads use 'append' instead -- confirm which behaviour is intended.
busi_stat_df.to_sql('businessstats', con = conn, if_exists = 'replace', index=False)
print('Data inserted into Table')

# Check table: read one row back to confirm the load.
a_response, a_df = pgquery(conn, """SELECT * FROM businessstats
LIMIT 1;""")
a_df.head()

Data inserted into Table
(1, 9)
     area_id  area_name  number_of_businesses  accommodation_and_food_services  retail_trade  agriculture_forestry_and_fishing  health_care_and_social_assistance  public_administration_and_safety  transport_postal_and_warehousing
0  101021007  Braidwood                   629                               26            27                               280                                 11                                 0                                35


Unnamed: 0,area_id,area_name,number_of_businesses,accommodation_and_food_services,retail_trade,agriculture_forestry_and_fishing,health_care_and_social_assistance,public_administration_and_safety,transport_postal_and_warehousing
0,101021007,Braidwood,629,26,27,280,11,0,35


### Creation of Database Tables (from geodataframes)

#### RFS NSW Bushfire Prone Land - shapefile

In [41]:
# WKT point geom creation function (SOURCE: DATA2001 Lab materials)

def create_wkt_point_element(geom,srid):
    """Wrap a geometry's WKT text in a ``WKTElement`` tagged with ``srid``."""
    wkt_text = geom.wkt
    return WKTElement(wkt_text, srid)

In [45]:
# Convert each shapely point to a WKT element in a new 'geom' column, then
# drop the original shapely 'geometry' column.
srid = 4283
rfs_gdf['geom'] = [create_wkt_point_element(geom=point, srid=srid) for point in rfs_gdf['geometry']]
rfs_gdf = rfs_gdf.drop("geometry", axis=1)
rfs_gdf.head()

Unnamed: 0,gid,category,shape_leng,shape_area,geom
0,0,1,1.7e-05,5.3924e-12,POINT (149.1131894786667 -33.05824346699998)
1,1,1,0.000178,1.140005e-09,POINT (152.2753625074807 -29.68315654934266)
2,2,1,0.00089,4.950178e-08,POINT (152.1424400005001 -29.68265650149996)
3,3,1,0.000442,8.094091e-09,POINT (152.2757861369404 -29.68259377308781)
4,4,1,0.00089,4.950155e-08,POINT (151.996189999 -29.68130649949998)


In [46]:
# DDL for the bushfire-prone-land table; geom is a PostGIS POINT column in SRID 4283.
rfs_bushfire_create = '''CREATE TABLE rfsnsw_bfpl (
                     gid INTEGER PRIMARY KEY,
                     category CHAR(1),
                     shape_leng FLOAT,
                     shape_area FLOAT,
                     geom GEOMETRY(POINT, 4283)
                     )'''

# Drop any existing copy first so the cell can be re-run, then create the empty table.
conn.execute("DROP TABLE IF EXISTS rfsnsw_bfpl")
conn.execute(rfs_bushfire_create)

<sqlalchemy.engine.result.ResultProxy at 0x13a695610>

In [47]:
# Append the rows into the table created above (keeping its schema), typing
# the 'geom' column as a POINT geometry with this SRID.
srid = 4283
rfs_gdf.to_sql(
    name='rfsnsw_bfpl',
    con=conn,
    index=False,
    if_exists='append',
    dtype={'geom': Geometry('POINT', srid)},
)

In [48]:
# Check table: read one row back to confirm the load.
a_response, a_df = pgquery(conn=conn, sqlcmd="""SELECT * FROM rfsnsw_bfpl
LIMIT 1;""")
a_df.iloc[:5]

(1, 5)
   gid category  shape_leng    shape_area                                                geom
0    0        1    0.000017  5.392400e-12  0101000020BB100000FEA38A3F9FA362403CFC9C85748740C0


Unnamed: 0,gid,category,shape_leng,shape_area,geom
0,0,1,1.7e-05,5.3924e-12,0101000020BB100000FEA38A3F9FA362403CFC9C857487...


#### ABS Statistical Area 2 (2016) - shapefile

In [13]:
# WKT polygon geom creation function (SOURCE: DATA2001 Lab materials)

def create_wkt_element(geom,srid):
    """Return a ``WKTElement`` for ``geom``, promoting a Polygon to a MultiPolygon.

    A geometry whose ``geom_type`` is 'Polygon' is wrapped in a one-member
    MultiPolygon; any other geometry is passed through unchanged.
    """
    as_multi = MultiPolygon([geom]) if geom.geom_type == 'Polygon' else geom
    return WKTElement(as_multi.wkt, srid)

In [14]:
srid = 4283

# DECISION: REMOVE THE 18 ROWS WHERE NO SPATIAL JOINS OR FUNCTIONS CAN BE PERFORMED (NULL GEOMETRIES)
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write to a slice (SettingWithCopyWarning).
sa2_gdf = sa2_gdf[sa2_gdf['geometry'].notna()].copy()

# WKT CONVERSION: polygons are promoted to multipolygons by create_wkt_element so
# every row matches the MULTIPOLYGON column type used for the database table.
sa2_gdf['geom'] = sa2_gdf['geometry'].apply(lambda x: create_wkt_element(geom=x, srid=srid))
sa2_gdf = sa2_gdf.drop(columns="geometry")
sa2_gdf.head()

Unnamed: 0,g_id,sa2_main16,sa2_5dig16,sa2_name16,sa3_code16,sa3_name16,sa4_code16,sa4_name16,gcc_code16,gcc_name16,ste_code16,ste_name16,areasqkm16,geom
0,0,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,3418.3525,MULTIPOLYGON (((149.5842329970001 -35.44426999...
1,1,101021008,11008,Karabar,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,6.9825,MULTIPOLYGON (((149.2189819130001 -35.36739402...
2,2,101021009,11009,Queanbeyan,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,4.7634,MULTIPOLYGON (((149.2132479820001 -35.34324799...
3,3,101021010,11010,Queanbeyan - East,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,13.0034,MULTIPOLYGON (((149.240332114 -35.347822616999...
4,4,101021011,11011,Queanbeyan Region,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,3054.4099,MULTIPOLYGON (((149.235800967 -35.387375302999...


In [38]:
# DDL for the SA2 boundaries table; geom is a PostGIS MULTIPOLYGON column in SRID 4283.
sa2_shape_create = '''CREATE TABLE sa2_2016_aust (
                     g_id INTEGER PRIMARY KEY,
                     sa2_main16 VARCHAR(80),
                     sa2_5dig16 VARCHAR(80),
                     sa2_name16 VARCHAR(100),
                     sa3_code16 VARCHAR(80),
                     sa3_name16 VARCHAR(100),
                     sa4_code16 VARCHAR(80),
                     sa4_name16 VARCHAR(100),
                     gcc_code16 VARCHAR(80),
                     gcc_name16 VARCHAR(100),
                     ste_code16 VARCHAR(80),
                     ste_name16 VARCHAR(100),
                     areasqkm16 FLOAT,
                     geom GEOMETRY(MULTIPOLYGON, 4283)
                     )'''

# Drop any existing copy first so the cell can be re-run, then create the empty table.
conn.execute("DROP TABLE IF EXISTS sa2_2016_aust")
conn.execute(sa2_shape_create)

<sqlalchemy.engine.result.ResultProxy at 0x1335cb990>

In [39]:
# Append the rows into the table created above (keeping its schema), typing
# the 'geom' column as a MULTIPOLYGON geometry with this SRID.
srid = 4283
sa2_gdf.to_sql(
    name='sa2_2016_aust',
    con=conn,
    index=False,
    if_exists='append',
    dtype={'geom': Geometry('MULTIPOLYGON', srid)},
)

In [40]:
# Check table: read one row back to confirm the load.
a_response, a_df = pgquery(conn=conn, sqlcmd="""SELECT * FROM sa2_2016_aust
LIMIT 1;""")
a_df.iloc[:5]

(1, 14)
   g_id sa2_main16 sa2_5dig16 sa2_name16 sa3_code16  sa3_name16 sa4_code16      sa4_name16 gcc_code16   gcc_name16 ste_code16       ste_name16  areasqkm16                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

Unnamed: 0,g_id,sa2_main16,sa2_5dig16,sa2_name16,sa3_code16,sa3_name16,sa4_code16,sa4_name16,gcc_code16,gcc_name16,ste_code16,ste_name16,areasqkm16,geom
0,0,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,3418.3525,0106000020BB100000010000000103000000010000005F...


In [49]:
# Disconnect from DB: close the open connection first, then dispose of the engine.

conn.close()
db.dispose()
print("disconnected")

disconnected


## Task 2: Fire Risk Analysis

### Fire Risk Score

$$\text{fire\_risk} = S\big(z(\text{population\_density}) + z(\text{dwelling\_\&\_business\_density}) + z(\text{bfpl\_density}) - z(\text{assistive\_service\_density})\big)$$

In [21]:
#Z-score
def z(x, avg=None, sd=None):
    """Standardise ``x`` as ``(x - avg) / sd``.

    Parameters
    ----------
    x : scalar, numpy array or pandas Series
        Value(s) to standardise.
    avg, sd : float, optional
        Mean and standard deviation to standardise against. When either is
        omitted it is computed from ``x`` itself, ignoring NaN values and
        using the population standard deviation (``ddof=0``).

    Returns
    -------
    The z-score(s), in the same container type as ``x``. If the standard
    deviation is zero (e.g. a single value with no ``sd`` given) the result
    is not a finite number.
    """
    if avg is None or sd is None:
        values = np.asarray(x, dtype=float)
        if avg is None:
            avg = np.nanmean(values)
        if sd is None:
            sd = np.nanstd(values)
    return((x-avg)/sd)

#Sigmoidal function. Did not use native exponential because fails for large negative values.
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, element-wise for arrays.

    Uses ``np.exp`` rather than ``math.exp``: for large negative ``x`` the
    latter raises OverflowError, while numpy returns ``inf`` so the result
    is 0.
    """
    return(1/(1+np.exp(-x)))

#Fire risk score formula.
def fire_risk(pop_d, dwell_bus_d, bfpl_d, ass_serv_d):
    """Fire risk score: sigmoid of the summed z-scores of the risk factors.

    Each argument is a collection (array / Series) of density values, one per
    neighbourhood, and is standardised against its own mean and standard
    deviation. Population, dwelling & business and BFPL densities add to
    the risk; assistive service density subtracts from it.

    Returns one score per neighbourhood, each between 0 and 1.
    """
    # z() previously required avg and sd, so these one-argument calls raised TypeError.
    fire_risk_score = sigmoid(z(pop_d)+z(dwell_bus_d)+z(bfpl_d)-z(ass_serv_d))
    return fire_risk_score

In [36]:
# Attach the SA2 attributes to each neighbourhood by matching area_id to the SA2 main code.
# NOTE(review): this assumes 'area_id' and 'sa2_main16' share a dtype at this point;
# confirm on a fresh kernel, since one is read from CSV and the other from a shapefile.
new_df = pd.merge(nbhd_df, sa2_gdf, left_on='area_id', right_on='sa2_main16')
new_df.shape[0]

322