In [85]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt
import numpy as np
from sqlalchemy.types import String, Integer
from sqlalchemy import create_engine
from sqlalchemy import text
import psycopg2
import psycopg2.extras
import json

import geoalchemy2 
import scipy.stats as stats
import math
import plotly.express as px




In [2]:
credentials = "Credentials.json"

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        host       = db_conn_dict['host']
        db_user    = db_conn_dict['user']
        db_pw      = db_conn_dict['password']
        default_db = db_conn_dict['database']
        try:
            db = create_engine('postgresql+psycopg2://'+db_user+':'+db_pw+'@'+host+'/'+default_db, echo=False)
            conn = db.connect()
            print('Connected successfully.')
        except Exception as e:
            print("Unable to connect to the database.")
            print(e)
            db, conn = None, None
        return db,conn

def query(conn, sqlcmd, args=None, df=True):
    result = pd.DataFrame() if df else None
    try:
        if df:
            result = pd.read_sql_query(sqlcmd, conn, params=args)
        else:
            result = conn.execute(sqlcmd, args).fetchall()
            result = result[0] if len(result) == 1 else result
    except Exception as e:
        print("Error encountered: ", e, sep='\n')
    return result

db, conn = pgconnect(credentials)

Connected successfully.


In [3]:
query(conn, "CREATE EXTENSION postgis;")
query(conn, "select PostGIS_Version()")


Error encountered: 
(psycopg2.errors.DuplicateObject) extension "postgis" already exists

[SQL: CREATE EXTENSION postgis;]
(Background on this error at: http://sqlalche.me/e/14/f405)


Unnamed: 0,postgis_version
0,3.3 USE_GEOS=1 USE_PROJ=1 USE_STATS=1


In [4]:
GDA2020 = gpd.read_file('SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp', crs='EPSG:4326')
GDA2020 = GDA2020[GDA2020['GCC_NAME21'] == 'Greater Sydney']
GDA2020 = GDA2020.loc[:, ['SA2_CODE21','SA2_NAME21','geometry']]

def create_wkt_element(geom, srid):
    if geom.geom_type == 'Polygon':
        geom = MultiPolygon([geom])
    return WKTElement(geom.wkt, srid)



In [5]:
GDA2020

Unnamed: 0,SA2_CODE21,SA2_NAME21,geometry
28,102011028,Avoca Beach - Copacabana,"POLYGON ((151.41373 -33.46558, 151.41362 -33.4..."
29,102011029,Box Head - MacMasters Beach,"POLYGON ((151.37484 -33.50052, 151.37507 -33.5..."
30,102011030,Calga - Kulnura,"MULTIPOLYGON (((151.20449 -33.53280, 151.20448..."
31,102011031,Erina - Green Point,"POLYGON ((151.37194 -33.43698, 151.37288 -33.4..."
32,102011032,Gosford - Springfield,"POLYGON ((151.32349 -33.42779, 151.32342 -33.4..."
...,...,...,...
637,128021537,Royal National Park,"POLYGON ((151.07363 -34.05638, 151.07360 -34.0..."
638,128021538,Sutherland - Kirrawee,"POLYGON ((151.05006 -34.02158, 151.05008 -34.0..."
639,128021607,Engadine,"POLYGON ((150.99568 -34.05361, 150.99570 -34.0..."
640,128021608,Loftus - Yarrawarrah,"POLYGON ((151.03955 -34.04175, 151.03954 -34.0..."


In [6]:
srid = 4326
GDA2020og = GDA2020.copy()  # creating a copy of the original for later
GDA2020['geom'] = GDA2020['geometry'].apply(lambda x: create_wkt_element(geom=x,srid=srid))  # applying the function
GDA2020 = GDA2020.drop(columns="geometry")  # deleting the old copy

In [7]:
GDA2020

Unnamed: 0,SA2_CODE21,SA2_NAME21,geom
28,102011028,Avoca Beach - Copacabana,MULTIPOLYGON (((151.413733024921 -33.465580583...
29,102011029,Box Head - MacMasters Beach,MULTIPOLYGON (((151.37484081570685 -33.5005199...
30,102011030,Calga - Kulnura,MULTIPOLYGON (((151.20449037540152 -33.5328022...
31,102011031,Erina - Green Point,MULTIPOLYGON (((151.37193611462118 -33.4369790...
32,102011032,Gosford - Springfield,MULTIPOLYGON (((151.32348639265098 -33.4277852...
...,...,...,...
637,128021537,Royal National Park,MULTIPOLYGON (((151.07362997413264 -34.0563789...
638,128021538,Sutherland - Kirrawee,MULTIPOLYGON (((151.05006441218998 -34.0215774...
639,128021607,Engadine,MULTIPOLYGON (((150.99568346574816 -34.0536082...
640,128021608,Loftus - Yarrawarrah,MULTIPOLYGON (((151.03954821100714 -34.0417452...


In [8]:

statement = """
    DROP TABLE IF EXISTS GDA2020;
    CREATE TABLE GDA2020 (
        SA2_CODE21 VARCHAR(9),
        SA2_NAME21 text,
        geom GEOMETRY(MULTIPOLYGON,4326)
    );
"""


result = query(conn, statement)
print(result) 


Error encountered: 
This result object does not return rows. It has been closed automatically.
Empty DataFrame
Columns: []
Index: []


In [9]:
GDA2020.to_sql('GDA2020', conn, schema='public',method=None,if_exists='append', index=False, dtype={'geom': Geometry('MULTIPOLYGON', srid)})



373

In [10]:
#stop
scrid = 4326
stops = pd.read_csv("Stops.txt", sep=",")
stops['geom'] = gpd.points_from_xy(stops.stop_lon, stops.stop_lat)
stops['geom'] = stops['geom'].apply(lambda x: WKTElement(x.wkt, srid=srid))

columns = ['stop_id', 'geom']
stops= stops.loc[:, columns]



print(stops)

stops.to_sql('stopschema', conn, if_exists='append',schema='public', method=None,index=False, dtype={'stop_id': String, 'geom': Geometry('POINT', srid)})


        stop_id                                        geom
0        200039   POINT (151.20666465471 -33.8822064874687)
1        200054   POINT (151.20699145565 -33.8820421431408)
2        200060  POINT (151.206292455081 -33.8840842535493)
3        201510  POINT (151.198866071817 -33.8916900512711)
4        201646  POINT (151.198881722942 -33.8933293130144)
...         ...                                         ...
114713   212753   POINT (151.07879697831 -33.8220164586429)
114714  2137185  POINT (151.116926480557 -33.8406690716775)
114715  2137186  POINT (151.116898892402 -33.8407691073139)
114716    21501  POINT (151.010576673346 -33.8139042429414)
114717  2150112  POINT (151.010481768913 -33.8139523874985)

[114718 rows x 2 columns]


718

In [11]:
conn.execute(text("""
    DROP INDEX IF EXISTS indexgda;
    CREATE INDEX indexgda ON public."GDA2020" USING GIST(geom);
"""));
conn.execute(text("""
    DROP INDEX IF EXISTS indexstops;
    CREATE INDEX indexstops ON public."GDA2020" USING GIST(geom);
"""));

In [12]:
result = query(conn, text("""
SELECT g."SA2_NAME21", COUNT(*) AS count, (COUNT(*) - avg_count) / stddev_count AS z_score
FROM public."GDA2020" g
JOIN public."stopschema" s ON ST_Contains(g.geom, s.geom)
CROSS JOIN (
SELECT AVG(count) AS avg_count,
STDDEV(count) AS stddev_count
FROM (
SELECT g."SA2_NAME21", COUNT(*) AS count
FROM public."GDA2020" g
JOIN public."stopschema" s ON ST_Contains(g.geom, s.geom)
GROUP BY g."SA2_NAME21") AS subquery) AS subquery2
GROUP BY g."SA2_NAME21", avg_count, stddev_count
ORDER BY count DESC;
"""))

stopfinal = pd.DataFrame(result, columns=["SA2_NAME21", "z_score"])
print(stopfinal)

                             SA2_NAME21   z_score
0    Dural - Kenthurst - Wisemans Ferry  6.337754
1                 Springwood - Winmalee  3.303507
2                      Katoomba - Leura  2.840263
3          Umina - Booker Bay - Patonga  2.747615
4               Campbelltown - Woodbine  2.608641
..                                  ...       ...
367                         Wolli Creek -1.595296
368                         Chippendale -1.595296
369                      Badgerys Creek -1.618458
370              Blue Mountains - North -1.687944
371              Blue Mountains - South -1.711107

[372 rows x 2 columns]


In [13]:
#polls
polls = pd.read_csv('PollingPlaces2019.csv', usecols = ['polling_place_id','the_geom','longitude','latitude'])
polls.dropna(subset=['the_geom'], inplace=True)
polls['geom'] = gpd.points_from_xy(polls.longitude, polls.latitude)
polls['geom'] = polls['geom'].apply(lambda x: WKTElement(x.wkt, srid=srid))
polls = polls.loc[:, ['polling_place_id','geom']]


print(polls)



      polling_place_id                             geom
13                  58         POINT (151.081 -33.9847)
15                 392         POINT (150.817 -33.7475)
16                  31  POINT (151.1148974 -33.9767897)
17                  67         POINT (151.111 -33.9756)
18               56500         POINT (151.075 -33.9413)
...                ...                              ...
2924              2810      POINT (150.85177 -34.54724)
2925              2809         POINT (150.858 -34.5642)
2926             58798  POINT (150.8597546 -34.5508228)
2927             31242         POINT (150.424 -34.4409)
2928               564         POINT (150.866 -34.5316)

[2790 rows x 2 columns]


In [14]:


statement = """
DROP TABLE IF EXISTS polls;
CREATE TABLE polls (
    polling_place_id integer,
    geom GEOMETRY(POINT,4326));
"""


result = query(conn, statement)
polls.to_sql('polls', conn, if_exists='append', index=False, dtype={'polling_place_id': Integer, 'geom': Geometry('POINT', srid)})


Error encountered: 
This result object does not return rows. It has been closed automatically.


790

In [15]:
result = query(conn, text("""
WITH subquery AS (
    SELECT g."SA2_NAME21", COUNT(*) AS count
    FROM public."GDA2020" G
    JOIN public."polls" p ON ST_Contains(G.geom, p.geom)
    GROUP BY g."SA2_NAME21"
)
SELECT subquery."SA2_NAME21", subquery.count, (subquery.count - avg_value) / std_deviation AS z_score
FROM subquery
CROSS JOIN (
    SELECT AVG(subquery.count) AS avg_value, STDDEV(subquery.count) AS std_deviation
    FROM subquery
) AS subquery2
ORDER BY subquery.count DESC;

"""))

pollfinal = pd.DataFrame(result, columns=["SA2_NAME21", "z_score"])

print(pollfinal)

                         SA2_NAME21    z_score
0    Sydney (North) - Millers Point  14.476684
1        Sydney (South) - Haymarket   5.504492
2                Parramatta - North   3.322067
3                  Chatswood - East   2.837084
4                           Penrith   2.109609
..                              ...        ...
348                         Pyrmont  -0.800291
349              Canterbury - South  -0.800291
350                          Putney  -0.800291
351                  Bexley - North  -0.800291
352                          Berala  -0.800291

[353 rows x 2 columns]


In [16]:
school_future = gpd.read_file('catchments/catchments_future.shp')
school_primary = gpd.read_file('catchments/catchments_primary.shp')
school_secondary = gpd.read_file('catchments/catchments_secondary.shp')
datasets = [school_future,school_primary,school_secondary]

scrid = 4326
for i in range(len(datasets)):
    datasets[i].columns = datasets[i].columns.str.strip()
    datasets[i]['geom'] = datasets[i]['geometry'].apply(lambda x: create_wkt_element(geom=x, srid=srid))


In [17]:
school_future = school_future[['geom']]
school_primary = school_primary[['geom']]
school_secondary = school_secondary[['geom']]


print(len(school_future))

30


In [18]:
ypopulation = pd.read_csv('Population.csv')
ypopulation['young_people'] = ypopulation['0-4_people'] + ypopulation['5-9_people'] + ypopulation['10-14_people'] + ypopulation['15-19_people']
ypopulation = ypopulation.loc[:, ['sa2_name','young_people']]

print(ypopulation)

                        sa2_name  young_people
0       Avoca Beach - Copacabana          2121
1    Box Head - MacMasters Beach          2471
2                Calga - Kulnura           961
3            Erina - Green Point          3205
4          Gosford - Springfield          4364
..                           ...           ...
368          Royal National Park            20
369        Sutherland - Kirrawee          5078
370                     Engadine          5118
371         Loftus - Yarrawarrah          2073
372             Woronora Heights           965

[373 rows x 2 columns]


In [19]:
statement = """
DROP TABLE IF EXISTS primarys;
CREATE TABLE primarys (
    geom GEOMETRY(MULTIPOLYGON,4326));
"""
result = query(conn, statement)
school_primary.to_sql('primarys', conn, schema='public',method=None,if_exists='append', index=False, dtype={'geom': Geometry('MULTIPOLYGON', srid)})


Error encountered: 
This result object does not return rows. It has been closed automatically.


662

In [20]:
statement = """
DROP TABLE IF EXISTS secondarys;
CREATE TABLE secondarys (
    geom GEOMETRY(MULTIPOLYGON,4326));
"""
result = query(conn, statement)

school_secondary.to_sql('secondarys', conn, schema='public',method=None,if_exists='append', index=False, dtype={'geom': Geometry('MULTIPOLYGON', srid)})


Error encountered: 
This result object does not return rows. It has been closed automatically.


436

In [21]:

statement = """
DROP TABLE IF EXISTS future;
CREATE TABLE future (
    geom GEOMETRY(MULTIPOLYGON,4326));
"""
result = query(conn, statement)
school_future.to_sql('future', conn, schema='public',method=None,if_exists='append', index=False, dtype={'geom': Geometry('MULTIPOLYGON', srid)})


Error encountered: 
This result object does not return rows. It has been closed automatically.


30

In [22]:
result = query(conn, text("""
    SELECT g."SA2_NAME21", COUNT(*)
    FROM public."GDA2020" g
        JOIN public."primarys" p ON ST_Intersects(g.geom, p.geom)
    GROUP BY g."SA2_NAME21"
    ORDER BY count DESC
"""))
primdata = pd.DataFrame(result, columns=["SA2_NAME21", "count"])



In [23]:
result = query(conn, text("""
    SELECT g."SA2_NAME21", COUNT(*)
    FROM public."GDA2020" g
        JOIN public."secondarys" p ON ST_Intersects(g.geom, p.geom)
    GROUP BY g."SA2_NAME21"
    ORDER BY count DESC
"""))

print(result)

seconddata = pd.DataFrame(result, columns=["SA2_NAME21", "count"])


                             SA2_NAME21  count
0    Dural - Kenthurst - Wisemans Ferry     44
1                            Turramurra     44
2                               Padstow     40
3                             Riverwood     40
4                Blue Mountains - South     40
..                                  ...    ...
368                     Wentworth Falls      4
369                               Bargo      4
370           Bondi Beach - North Bondi      4
371                       Dover Heights      4
372                     Dee Why - North      4

[373 rows x 2 columns]


In [24]:
result = query(conn, text("""
    SELECT g."SA2_NAME21", COUNT(*)
    FROM public."GDA2020" g
        JOIN public."future" p ON ST_Intersects(g.geom, p.geom)
    GROUP BY g."SA2_NAME21"
    ORDER BY count DESC
"""))


futuredata = pd.DataFrame(result, columns=["SA2_NAME21", "count"])

print(result)

                          SA2_NAME21  count
0    Gledswood Hills - Gregory Hills     24
1                            St Ives     16
2                   Kensington (NSW)     16
3                   Randwick - North     16
4                           Waterloo     16
..                               ...    ...
98                       Surry Hills      4
99      Sydenham - Tempe - St Peters      4
100   Toongabbie - Constitution Hill      4
101                Toongabbie - West      4
102   West Hoxton - Middleton Grange      4

[103 rows x 2 columns]


In [25]:
ypopulation = ypopulation.rename(columns={'sa2_name': 'SA2_NAME21'})

concatdf = pd.concat([futuredata, seconddata, primdata], ignore_index=True)

concatdf = concatdf.groupby('SA2_NAME21')['count'].sum().reset_index()

concatdf = pd.concat([ypopulation, concatdf], ignore_index=True)

concatdf = concatdf.groupby('SA2_NAME21').sum().reset_index()

concatdf['rate'] = (concatdf['count'] / concatdf['young_people']) * 1000
concatdf = concatdf.drop(concatdf[concatdf['rate'] == 0].index)
concatdf = concatdf.drop(concatdf[concatdf['young_people'] == 0].index)

concatdf = concatdf.reset_index(drop=True)
pd.set_option("display.max_rows", None)  # To display all rows
pd.set_option("display.max_columns", None)  # To display all columns
concatdf['z scores'] = stats.zscore(concatdf['rate'])
print(concatdf)




                                      SA2_NAME21  young_people  count  \
0                                 Acacia Gardens        1062.0   20.0   
1                                Annandale (NSW)        1947.0   36.0   
2                    Arncliffe - Bardwell Valley        3501.0   56.0   
3                                       Artarmon        2290.0   36.0   
4                      Ashcroft - Busby - Miller        5850.0   64.0   
5                               Ashfield - North        1755.0   24.0   
6                               Ashfield - South        1893.0   44.0   
7                          Asquith - Mount Colah        6041.0   72.0   
8                               Auburn - Central        4323.0   48.0   
9                                 Auburn - North        1926.0   36.0   
10                                Auburn - South        2510.0   56.0   
11                           Austral - Greendale        3527.0   88.0   
12                           Avalon - Palm Beach   

In [26]:
business = pd.read_csv('Businesses.csv')
retail = business.loc[business['industry_name'] == 'Retail Trade', ['industry_name', 'sa2_code', 'total_businesses']]
health = business.loc[business['industry_name'] == 'Health Care and Social Assistance', ['industry_name', 'sa2_code','total_businesses']]
population = pd.read_csv('Population.csv', usecols = ['sa2_code', 'sa2_name','total_people'])

print(population)




      sa2_code                                     sa2_name  total_people
0    102011028                     Avoca Beach - Copacabana          7530
1    102011029                  Box Head - MacMasters Beach         11052
2    102011030                              Calga - Kulnura          4748
3    102011031                          Erina - Green Point         14803
4    102011032                        Gosford - Springfield         21346
5    102011033                                      Kariong          6518
6    102011034                  Kincumber - Picketts Valley          7628
7    102011035                                       Narara          7191
8    102011036                       Niagara Park - Lisarow          8237
9    102011037                      Point Clare - Koolewong          6575
10   102011038                         Saratoga - Davistown          7179
11   102011039                       Terrigal - North Avoca         14890
12   102011040                 Umina -

In [27]:
#population

statement = """
DROP TABLE IF EXISTS population;
CREATE TABLE population (
    sa2_code integer,
    sa2_name text,
    total_people integer   
    );
"""

result = query(conn, statement)
population.to_sql('population', conn, if_exists='append',schema='public', method=None,index=False)


Error encountered: 
This result object does not return rows. It has been closed automatically.


373

In [28]:
#health

statement = """
DROP TABLE IF EXISTS health;
CREATE TABLE health (
    industry_name text,
    sa2_code integer,
    total_businesses integer   
    );
"""
result = query(conn, statement)

health.to_sql('health', conn, if_exists='append',schema='public', method=None,index=False)


Error encountered: 
This result object does not return rows. It has been closed automatically.


643

In [29]:
#retail

statement = """
DROP TABLE IF EXISTS retail;
CREATE TABLE retail (
    industry_name text,
    sa2_code integer,
    total_businesses integer   
    );
"""
result = query(conn, statement)

retail.to_sql('retail', conn, if_exists='append',schema='public', method=None,index=False)




Error encountered: 
This result object does not return rows. It has been closed automatically.


643

In [30]:
#retail z score
result = query(conn, text("""
SELECT sa2_name, ROUND(CAST(r.total_businesses AS DECIMAL(10,2))/CAST(total_people AS DECIMAL(10,2)) * 1000, 5) as rper1000
FROM public."retail" r
INNER JOIN public."population" p ON p."sa2_code" = r."sa2_code"
WHERE total_people != 0;
"""))

retailz = pd.DataFrame(result, columns=["sa2_name", "rper1000"])
retailz['z scores retail'] = stats.zscore(retailz["rper1000"])
print(retailz)


                                        sa2_name    rper1000  z scores retail
0                       Avoca Beach - Copacabana     5.97610        -0.109739
1                    Box Head - MacMasters Beach     4.52407        -0.115541
2                                Calga - Kulnura    12.00505        -0.085650
3                            Erina - Green Point    10.26819        -0.092589
4                          Gosford - Springfield     8.43249        -0.099924
5                                        Kariong     3.06843        -0.121357
6                    Kincumber - Picketts Valley     5.50603        -0.111617
7                                         Narara     1.80782        -0.126394
8                         Niagara Park - Lisarow     3.52070        -0.119550
9                        Point Clare - Koolewong     3.04183        -0.121463
10                          Saratoga - Davistown     4.31815        -0.116364
11                        Terrigal - North Avoca     4.96978    

371                             Woronora Heights     2.53450        -0.123491


In [31]:
#health z score
result = query(conn, text("""
SELECT sa2_name, ROUND(CAST(h.total_businesses AS DECIMAL(10,2))/CAST(total_people AS DECIMAL(10,2)) * 1000, 5) as hper1000
FROM public.health h
INNER JOIN public.population p ON p.sa2_code = h.sa2_code
WHERE total_people != 0;
"""))

healthz = pd.DataFrame(result, columns=["sa2_name", "hper1000"])
healthz['z scores health'] = stats.zscore(healthz["hper1000"])
print(healthz)

                                        sa2_name    hper1000  z scores health
0                       Avoca Beach - Copacabana     9.82736        -0.047607
1                    Box Head - MacMasters Beach     4.97647        -0.079456
2                                Calga - Kulnura     9.05644        -0.052668
3                            Erina - Green Point    13.71344        -0.022092
4                          Gosford - Springfield    14.19470        -0.018932
5                                        Kariong     3.37527        -0.089969
6                    Kincumber - Picketts Valley     7.73466        -0.061347
7                                         Narara     3.33751        -0.090217
8                         Niagara Park - Lisarow     2.67088        -0.094594
9                        Point Clare - Koolewong     5.32319        -0.077180
10                          Saratoga - Davistown     3.62167        -0.088351
11                        Terrigal - North Avoca    11.61854    

371                             Woronora Heights     4.22416        -0.084396


In [32]:

healthz = healthz[['sa2_name', 'z scores health']]
retailz = retailz[['sa2_name', 'z scores retail']]
concatdf = concatdf[['SA2_NAME21', 'z scores']]

In [42]:
dataframes = [retailz, concatdf, stopfinal, pollfinal, healthz]

for array in dataframes:
    array.rename(columns={array.columns[0]: 'sa2_name', array.columns[1]: 'z scores'}, inplace=True)



In [53]:
finalzscores = healthz[['sa2_name']]

for df in dataframes:
    finalzscores = finalzscores.merge(df[['sa2_name', 'z scores']], on='sa2_name', how='outer')

  finalzscores = finalzscores.merge(df[['sa2_name', 'z scores']], on='sa2_name', how='outer')


In [54]:

finalzscores = finalzscores.dropna()
finalzscores

Unnamed: 0,sa2_name,z scores_x,z scores_y,z scores_x.1,z scores_y.1,z scores
0,Avoca Beach - Copacabana,-0.109739,-0.131228,-0.26347,-0.5578,-0.047607
1,Box Head - MacMasters Beach,-0.115541,-0.131223,0.755666,-0.5578,-0.079456
2,Calga - Kulnura,-0.08565,-0.100508,0.257679,0.654659,-0.052668
3,Erina - Green Point,-0.092589,-0.128089,1.566343,0.412167,-0.022092
4,Gosford - Springfield,-0.099924,-0.130747,2.37702,0.654659,-0.018932
5,Kariong,-0.121357,-0.132254,-0.981498,-0.5578,-0.089969
6,Kincumber - Picketts Valley,-0.111617,-0.130536,-0.112916,-0.072816,-0.061347
7,Narara,-0.126394,-0.1303,-0.529835,-0.800291,-0.090217
8,Niagara Park - Lisarow,-0.11955,-0.128194,-0.541416,-0.800291,-0.094594
9,Point Clare - Koolewong,-0.121463,-0.131458,-0.089753,-0.5578,-0.07718


In [48]:
finalzscores.to_csv('finalzscores.csv', index=False)

In [61]:
#sigmoid
def sigmoid(table):
    return 1 / (1 + np.exp(-table))
finalzscores.loc[:,'Total'] = finalzscores.sum(axis=1)
finalzscores['Sigmoid'] = ''
for index, row in finalzscores.iterrows():
    table = row['Total']
    finalzscores.loc[index, 'Sigmoid'] = sigmoid(table)
 
finalzscores

  finalzscores.loc[:,'Total'] = finalzscores.sum(axis=1)


Unnamed: 0,sa2_name,z scores_x,z scores_y,z scores_x.1,z scores_y.1,z scores,Total,Sigmoid
0,Avoca Beach - Copacabana,-0.109739,-0.131228,-0.26347,-0.5578,-0.047607,-2.219687,0.097996
1,Box Head - MacMasters Beach,-0.115541,-0.131223,0.755666,-0.5578,-0.079456,-0.256707,0.436173
2,Calga - Kulnura,-0.08565,-0.100508,0.257679,0.654659,-0.052668,1.347024,0.793643
3,Erina - Green Point,-0.092589,-0.128089,1.566343,0.412167,-0.022092,3.471478,0.969865
4,Gosford - Springfield,-0.099924,-0.130747,2.37702,0.654659,-0.018932,5.564149,0.996182
5,Kariong,-0.121357,-0.132254,-0.981498,-0.5578,-0.089969,-3.765754,0.022626
6,Kincumber - Picketts Valley,-0.111617,-0.130536,-0.112916,-0.072816,-0.061347,-0.978464,0.273197
7,Narara,-0.126394,-0.1303,-0.529835,-0.800291,-0.090217,-3.354076,0.033762
8,Niagara Park - Lisarow,-0.11955,-0.128194,-0.541416,-0.800291,-0.094594,-3.36809,0.033308
9,Point Clare - Koolewong,-0.121463,-0.131458,-0.089753,-0.5578,-0.07718,-1.955309,0.123976


In [70]:
#income

income = pd.read_csv('Income.csv', usecols = ['sa2_name', 'median_income' ])


correlationdf = finalzscores.merge(income[['sa2_name', 'median_income']], on='sa2_name', how='outer')

correlationdf= correlationdf.dropna()

correlationdf = correlationdf[['sa2_name','Sigmoid','median_income']]

correlationdf

Unnamed: 0,sa2_name,Sigmoid,median_income
0,Avoca Beach - Copacabana,0.097996,52450
1,Box Head - MacMasters Beach,0.436173,48724
2,Calga - Kulnura,0.793643,46228
3,Erina - Green Point,0.969865,48292
4,Gosford - Springfield,0.996182,51999
5,Kariong,0.022626,54900
6,Kincumber - Picketts Valley,0.273197,50106
7,Narara,0.033762,53262
8,Niagara Park - Lisarow,0.033308,52812
9,Point Clare - Koolewong,0.123976,51099


In [79]:
correlationdf['Sigmoid'] = correlationdf['Sigmoid'].astype(float)
correlationdf['median_income'] = correlationdf['median_income'].astype(float)

sa2_name          object
Sigmoid          float64
median_income    float64
dtype: object


In [86]:
#correlation graph
graph = px.scatter(correlationdf, title='Income vs Sigmoid value', x='median_income', y='Sigmoid', color='median_income', trendline='ols', height=750)
graph.show()