# DATA2001 2021 Assignment
*An analysis of neighbourhood fire risk and median income & rent.*



In [6]:
import pandas as pd
import geopandas as gpd
import os
import numpy as np
import json

from shapely.geometry import Point, Polygon, MultiPolygon
from geopandas import GeoSeries, GeoDataFrame
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import *
from sqlalchemy import create_engine
import psycopg2
import psycopg2.extras

import matplotlib.pyplot as plt

In [None]:
# Troubleshooting aid: the loaders below use paths relative to the working
# directory (./data/...), so display it to confirm where the notebook is running.
# If the data files are not found, uncomment and adjust the path below:
#os.chdir('M:\\Jupyter Notebooks\\data2001_project')
os.getcwd()

## Task 1: Data Integration and Database Generation

### Loading the datasets

In [60]:
# Load the statistical-areas CSV (area_id / area_name / parent_area_id)
# and preview the first two rows.
stat_areas_df = pd.read_csv('./data/StatisticalAreas.csv')
stat_areas_df.head(2)

Unnamed: 0,area_id,area_name,parent_area_id
0,1,New South Wales,0
1,10,Greater Sydney,1


In [61]:
# Load the per-neighbourhood CSV (land area, population, dwellings,
# businesses, income and rent columns) and preview the first row.
nbhd_df = pd.read_csv('./data/Neighbourhoods.csv')
nbhd_df.head(1)

Unnamed: 0,area_id,area_name,land_area,population,number_of_dwellings,number_of_businesses,median_annual_household_income,avg_monthly_rent
0,102011028,Avoca Beach - Copacabana,643.8,7590,2325,738.0,46996.0,1906.0


In [62]:
# Load the per-area business counts CSV (total plus per-industry columns)
# and preview the first row.
busi_stat_df = pd.read_csv('./data/BusinessStats.csv')
busi_stat_df.head(1)

Unnamed: 0,area_id,area_name,number_of_businesses,accommodation_and_food_services,retail_trade,agriculture_forestry_and_fishing,health_care_and_social_assistance,public_administration_and_safety,transport_postal_and_warehousing
0,101021007,Braidwood,629,26,27,280,11,0,35


In [70]:
# Load the RFS NSW bush fire prone land shapefile into a GeoDataFrame,
# report its coordinate reference system and normalise the column names.
rfs_gdf = gpd.read_file('./data/RFSNSW_BFPL/RFSNSW_BFPL.shp')
print(rfs_gdf.crs) # Check EPSG / CRS -- 4283 = GDA94
rfs_gdf.columns = [x.lower() for x in rfs_gdf.columns] # lower case col names
rfs_gdf.head(1)

epsg:4283


Unnamed: 0,category,shape_leng,shape_area,geometry
0,1,1.7e-05,5.3924e-12,POINT (149.11319 -33.05824)


In [71]:
# Load the 2016 SA2 boundary shapefile into a GeoDataFrame, report its
# coordinate reference system and normalise the column names.
sa2_gdf = gpd.read_file('./data/1270055001_sa2_2016_aust_shape/SA2_2016_AUST.shp')
print(sa2_gdf.crs) # Check EPSG / CRS -- 4283 = GDA94
sa2_gdf.columns = [x.lower() for x in sa2_gdf.columns] # lower case col names
sa2_gdf.head(1)

epsg:4283


Unnamed: 0,sa2_main16,sa2_5dig16,sa2_name16,sa3_code16,sa3_name16,sa4_code16,sa4_name16,gcc_code16,gcc_name16,ste_code16,ste_name16,areasqkm16,geometry
0,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,3418.3525,"POLYGON ((149.58423 -35.44427, 149.58444 -35.4..."


In [76]:
# List the distinct geometry types present in the SA2 layer.
# NOTE(review): the recorded output contains None as well as both 'Polygon'
# and 'MultiPolygon', while the sa2_shape table is declared as
# GEOMETRY(MULTIPOLYGON, 4283) -- presumably rows without geometry need
# handling and polygons need promoting before the load; confirm.
set(sa2_gdf.geometry.type)

{'MultiPolygon', None, 'Polygon'}

### Creation of database and tables

In [13]:
# Function for accessing Postgres DB (SOURCE: DATA2001 Lab materials)

def pgconnect_using_credfile(credential_filepath):
    """Open a SQLAlchemy engine and connection from a JSON credentials file.

    Args:
        credential_filepath: path to a JSON file holding an object with the
            keys ``user``, ``password``, ``host`` and ``database``.

    Returns:
        ``(db, conn)`` -- the engine and an open connection -- on success.
        On any failure (missing file, bad JSON, missing key, connection
        error) the error is printed and ``(None, None)`` is returned, so
        callers that unpack ``db, conn = ...`` do not hit a TypeError on
        top of the real problem.
    """
    try:
        with open(credential_filepath) as f:
            db_conn_dict = json.load(f)
        # 'postgresql' is the dialect name SQLAlchemy documents; the older
        # 'postgres' alias is rejected by SQLAlchemy 1.4 and later.
        connstring = (
            'postgresql+psycopg2://'
            + db_conn_dict['user'] + ':' + db_conn_dict['password']
            + '@' + db_conn_dict['host'] + '/' + db_conn_dict['database']
        )
        db = create_engine(connstring, echo=False)
        conn = db.connect()
        print('connected')
    except Exception as e:
        print("unable to connect to the database")
        print(e)
        return None, None
    return db,conn

print("function setup successful.")

function setup successful.


In [14]:
# Connect to University server student Postgres DB

# Credentials live in a local JSON file with user / password / host /
# database keys (the keys pgconnect_using_credfile reads).
credfilepath = './data2x01_db.json' # Not tracked on Git, must be locally available

# db is the SQLAlchemy engine, conn the open connection used by later cells.
db, conn = pgconnect_using_credfile(credfilepath)

connected


In [15]:
# Function for querying the PostgreSQL DB.
# Returns value and a converted dataframe (SOURCE: DATA2001 Lab materials)

def pgquery( conn, sqlcmd, args=None, silent=False ):
    """Run a SQL query through pandas and report whether it succeeded.

    Args:
        conn: database connection handed to ``pd.read_sql_query``.
        sqlcmd: the SQL text to execute.
        args: optional query parameters, forwarded as ``params``.
        silent: when False, print the result's shape and contents, or the
            error message if the query fails.

    Returns:
        ``(ok, df)``: ``ok`` is True when the query (and any printing)
        completed; ``df`` is the result, or an empty DataFrame on failure.
    """
    succeeded = False
    result_df = pd.DataFrame()
    try:
        # params=None is read_sql_query's own default, so a single call
        # covers both the "with args" and "without args" cases.
        result_df = pd.read_sql_query(sqlcmd, conn, params=args)
        if not silent:
            print(result_df.shape)
            print(result_df.to_string())
        succeeded = True
    except Exception as err:
        if not silent:
            print("db read error: ")
            print(err)
    return succeeded, result_df

print("function setup successful.")

function setup successful.


In [16]:
# Checking we have PostGIS working on our connection (SOURCE: DATA2001 Lab materials)

# Ask the server for its PostGIS version; a successful result shows the
# spatial extension is reachable on this connection.
postgis_check = '''
SELECT PostGIS_Version();
'''

# pgquery prints the shape and rows itself (silent defaults to False);
# retdf is echoed again as the cell's value.
retval,retdf = pgquery(conn,postgis_check)
retdf

(1, 1)
                         postgis_version
0  3.0 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


Unnamed: 0,postgis_version
0,3.0 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


### Creation of Tables (Dataframes)

In [47]:
# Schema for the neighbourhoods table.
# NOTE(review): the first column is named var_id here, but the
# Neighbourhoods.csv DataFrame calls it area_id -- confirm which is intended.
neighbourhoods_create = '''CREATE TABLE neighbourhoods (
                     var_id CHAR(9),
                     area_name VARCHAR(100),
                     land_area FLOAT,
                     population NUMERIC,
                     number_of_dwellings NUMERIC,
                     number_of_businesses NUMERIC,
                     median_annual_household_income NUMERIC,
                     avg_monthly_rent NUMERIC
                     )'''

# Drop any earlier copy first so the cell can be re-run.
conn.execute("DROP TABLE IF EXISTS neighbourhoods")
conn.execute(neighbourhoods_create)

<sqlalchemy.engine.result.ResultProxy at 0x11c71d2d0>

In [48]:
# Write the DataFrame to the neighbourhoods table; index=False keeps the
# DataFrame index out of the table.
# NOTE(review): if_exists='replace' drops the existing table and recreates it
# from the DataFrame's inferred dtypes, so a hand-written CREATE TABLE schema
# does not survive this call (the check query shows area_id, not var_id).
# if_exists='append' would keep the declared schema -- confirm intent.
nbhd_df.to_sql('neighbourhoods', con = conn, if_exists = 'replace', index=False)

In [49]:
# Sanity check: read one row back from neighbourhoods. pgquery prints the
# shape and the row itself; a_df.head() echoes it as the cell's value.
a_response, a_df = pgquery(conn, """SELECT * FROM neighbourhoods
LIMIT 1;""")
a_df.head()

(1, 8)
     area_id                 area_name  land_area population number_of_dwellings  number_of_businesses  median_annual_household_income  avg_monthly_rent
0  102011028  Avoca Beach - Copacabana      643.8       7590                2325                 738.0                         46996.0            1906.0


Unnamed: 0,area_id,area_name,land_area,population,number_of_dwellings,number_of_businesses,median_annual_household_income,avg_monthly_rent
0,102011028,Avoca Beach - Copacabana,643.8,7590,2325,738.0,46996.0,1906.0


In [50]:
# Schema for the business table.
# NOTE(review): the first column is named var_id here, but the
# BusinessStats.csv DataFrame calls it area_id -- confirm which is intended.
business_create = '''CREATE TABLE business (
                     var_id CHAR(9),
                     area_name VARCHAR(100),
                     number_of_businesses NUMERIC,
                     accommodation_and_food_services NUMERIC,
                     retail_trade NUMERIC,
                     agriculture_forestry_and_fishing NUMERIC,
                     health_care_and_social_assistance NUMERIC,
                     public_administration_and_safety NUMERIC,
                     transport_postal_and_warehousing NUMERIC
                     )'''

# Drop any earlier copy first so the cell can be re-run.
conn.execute("DROP TABLE IF EXISTS business")
conn.execute(business_create)

<sqlalchemy.engine.result.ResultProxy at 0x11c728150>

In [51]:
# Write the DataFrame to the business table; index=False keeps the
# DataFrame index out of the table.
# NOTE(review): if_exists='replace' drops the existing table and recreates it
# from the DataFrame's inferred dtypes, so a hand-written CREATE TABLE schema
# does not survive this call. if_exists='append' would keep it -- confirm intent.
busi_stat_df.to_sql('business', con = conn, if_exists = 'replace', index=False)

In [52]:
# Sanity check: read one row back from business. pgquery prints the shape
# and the row itself; a_df.head() echoes it as the cell's value.
a_response, a_df = pgquery(conn, """SELECT * FROM business
LIMIT 1;""")
a_df.head()

(1, 9)
     area_id  area_name  number_of_businesses  accommodation_and_food_services  retail_trade  agriculture_forestry_and_fishing  health_care_and_social_assistance  public_administration_and_safety  transport_postal_and_warehousing
0  101021007  Braidwood                   629                               26            27                               280                                 11                                 0                                35


Unnamed: 0,area_id,area_name,number_of_businesses,accommodation_and_food_services,retail_trade,agriculture_forestry_and_fishing,health_care_and_social_assistance,public_administration_and_safety,transport_postal_and_warehousing
0,101021007,Braidwood,629,26,27,280,11,0,35


In [53]:
# Schema for the stat_areas table: each area's id and name plus the id of
# its parent area, all stored as text.
stat_areas_create = '''CREATE TABLE stat_areas (
                     area_id VARCHAR(9),
                     area_name VARCHAR(100),
                     parent_area_id VARCHAR(9)
                     )'''

# Drop any earlier copy first so the cell can be re-run.
conn.execute("DROP TABLE IF EXISTS stat_areas")
conn.execute(stat_areas_create)

<sqlalchemy.engine.result.ResultProxy at 0x11c72d350>

In [64]:
# Write the DataFrame to the stat_areas table; index=False keeps the
# DataFrame index out of the table.
# NOTE(review): if_exists='replace' drops the existing table and recreates it
# from the DataFrame's inferred dtypes, so a hand-written CREATE TABLE schema
# does not survive this call. if_exists='append' would keep it -- confirm intent.
stat_areas_df.to_sql('stat_areas', con = conn, if_exists = 'replace', index=False)

In [65]:
# Sanity check: read one row back from stat_areas. pgquery prints the shape
# and the row itself; a_df.head() echoes it as the cell's value.
a_response, a_df = pgquery(conn, """SELECT * FROM stat_areas
LIMIT 1;""")
a_df.head()

(1, 3)
   area_id        area_name  parent_area_id
0        1  New South Wales               0


Unnamed: 0,area_id,area_name,parent_area_id
0,1,New South Wales,0


### Creation of Tables (GeoDataframes)

In [73]:
# Schema for the bush fire prone land layer. The geometry column is typed
# as POINT with SRID 4283, the CRS printed when the shapefile was loaded.
rfs_bushfire_create = '''CREATE TABLE rfs_bushfire (
                     category CHAR(1),
                     shape_leng FLOAT,
                     shape_area FLOAT,
                     geom GEOMETRY(POINT, 4283)
                     )'''

# Drop any earlier copy first so the cell can be re-run.
conn.execute("DROP TABLE IF EXISTS rfs_bushfire")
conn.execute(rfs_bushfire_create)

<sqlalchemy.engine.result.ResultProxy at 0x11d2cf750>

In [None]:
# TO-DO: LOAD DATA INTO THIS TABLE WITH APPROPRIATE WKT CONVERSION PROCESS

In [80]:
# Schema for the SA2 boundary layer, mirroring the lower-cased shapefile
# columns. The geometry column is typed as MULTIPOLYGON with SRID 4283.
# NOTE(review): the loaded layer also contains plain Polygon rows and rows
# with no geometry, which presumably need converting/handling before they
# fit this column type -- confirm when writing the load step.
sa2_shape_create = '''CREATE TABLE sa2_shape (
                     sa2_main16 VARCHAR(80),
                     sa2_5dig16 VARCHAR(80),
                     sa2_name16 VARCHAR(100),
                     sa3_code16 VARCHAR(80),
                     sa3_name16 VARCHAR(100),
                     sa4_code16 VARCHAR(80),
                     sa4_name16 VARCHAR(100),
                     gcc_code16 VARCHAR(80),
                     gcc_name16 VARCHAR(100),
                     ste_code16 VARCHAR(80),
                     ste_name16 VARCHAR(100),
                     areasqkm16 FLOAT,
                     geom GEOMETRY(MULTIPOLYGON, 4283)
                     )'''

# Drop any earlier copy first so the cell can be re-run.
conn.execute("DROP TABLE IF EXISTS sa2_shape")
conn.execute(sa2_shape_create)

<sqlalchemy.engine.result.ResultProxy at 0x11d474890>

In [None]:
# TO-DO: LOAD DATA INTO THIS TABLE WITH APPROPRIATE WKT CONVERSION PROCESS

In [8]:
# Disconnect from DB: close the open connection first, then dispose of the
# engine that created it.

conn.close()
db.dispose()
print("disconnected")

disconnected


## Task 2: Fire Risk Analysis

### Fire Risk Score

$$\mathrm{fire\_risk} = S\big(z(\mathrm{population\_density}) + z(\mathrm{dwelling\_\&\_business\_density}) + z(\mathrm{bfpl\_density}) - z(\mathrm{assistive\_service\_density})\big)$$

In [21]:
#Z-score
def z(x, avg, sd):
    """Return the z-score of ``x`` for a given mean ``avg`` and standard deviation ``sd``."""
    return (x - avg) / sd


#Sigmoidal function.
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)``, elementwise for arrays.

    ``np.exp`` is used instead of ``math.exp`` because the latter raises
    OverflowError for large negative ``x``; ``np.exp`` saturates to ``inf``,
    which yields the correct limit of 0.0.
    """
    # The overflow for very negative x is expected and harmless, so keep
    # numpy from emitting a RuntimeWarning about it.
    with np.errstate(over='ignore'):
        return 1 / (1 + np.exp(-x))


def _standardise(values):
    """Z-score ``values`` against their own mean and population standard deviation."""
    arr = np.asarray(values, dtype=float)
    sd = arr.std()
    if sd == 0:
        # A measure with no variation cannot separate areas; contribute 0
        # rather than dividing by zero and turning every score into NaN.
        return np.zeros_like(arr)
    return z(arr, arr.mean(), sd)


#Fire risk score formula.
def fire_risk(pop_d, dwell_bus_d, bfpl_d, ass_serv_d):
    """Compute the fire risk score for each area.

    Each argument is a sequence of per-area values (list, array or Series);
    all four must have the same length. Every measure is standardised
    against its own mean and population standard deviation (ddof=0), the
    z-scores are combined as ``pop + dwell_bus + bfpl - ass_serv`` and the
    sum is squashed into (0, 1) with the sigmoid.

    Returns:
        A numpy array of scores, one per area.
    """
    combined = (
        _standardise(pop_d)
        + _standardise(dwell_bus_d)
        + _standardise(bfpl_d)
        - _standardise(ass_serv_d)
    )
    fire_risk_score = sigmoid(combined)
    return fire_risk_score