In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement
import matplotlib.pyplot as plt
from sqlalchemy import text

## Data Loading

In [None]:
businesses = pd.read_csv("Businesses.csv")
income = pd.read_csv("Income.csv")
population = pd.read_csv("Population.csv")
stops = pd.read_csv("Stops.txt")

In [None]:
businesses.info()
income.info()
population.info()
stops.info() 
# The stops and income df contains null values. Income has null values in form of 'np'. 

In [None]:
# schools dataset
catchments_primary = gpd.read_file("Catchments/catchments_primary.shp")
catchments_future = gpd.read_file("Catchments/catchments_future.shp")
catchments_secondary = gpd.read_file("Catchments/catchments_secondary.shp")

In [None]:
catchments_primary.info()
# Add_DATE and PRIORITY contain nulls

In [None]:
catchments_future.info()

In [None]:
catchments_secondary.info()

In [None]:
sa2_regions = gpd.read_file("SA2_2021_SHP/SA2_2021_AUST_GDA2020.shp")

In [None]:
sa2_regions.info()
#geometry and AREASQKM21 contain null values

## Dataset Cleaning

In [None]:
# clean up np values in income df 
income = income[income['median_income'] != 'np']
income

In [None]:
# Renaming the columns for latitude and longitude
stops = stops.rename(columns={'stop_lat':'lat', 'stop_lon':'lng'})

# creating the geometry column 
stops['geom'] = gpd.points_from_xy(stops.lng, stops.lat) 

# dropping the old lat and lng columns 
stops = stops.drop(columns=['lat', 'lng', 'wheelchair_boarding', 'platform_code', 'location_type'])

In [None]:
# check column names
catchments_primary.columns.tolist()

catchments_primary = catchments_primary.drop(columns = ['YEAR7','YEAR8','YEAR9','YEAR10','YEAR11','YEAR12', 'PRIORITY', 'ADD_DATE'])


In [None]:
catchments_secondary.columns.tolist()

# drop irrelevant columns 
catchments_secondary = catchments_secondary.drop(columns = ['KINDERGART','YEAR1','YEAR2','YEAR3','YEAR4','YEAR5','YEAR6', 'PRIORITY', 'ADD_DATE'])

In [None]:
catchments_secondary
catchments_future
catchments_primary

In [None]:
sa2_regions.columns.tolist()

# drop irrelevant columns
sa2_regions = sa2_regions.drop(columns=['CHG_FLAG21','CHG_LBL21'])

# drop rows with missing values
sa2_regions = sa2_regions.dropna(subset=['geometry', 'AREASQKM21'])

# filter for Greater Sydney
sa2_regions = sa2_regions[sa2_regions['GCC_NAME21']=='Greater Sydney']
sa2_regions # only 373 entries in Greater Sydney

In [None]:
# A list of SA4 names for you guys to choose from
sa2_regions.SA4_NAME21.unique().tolist()

sa2_regions[sa2_regions['SA4_NAME21'] == 'Sydney - Eastern Suburbs']

## Data Transformation

In [None]:
srid = 4326
stops['geom'] = stops['geom'].apply(lambda x: WKTElement(x.wkt, srid=srid))
stops

In [None]:
def create_wkt_element(geom, srid):
    if geom.geom_type == 'Polygon':
        geom = MultiPolygon([geom])
    return WKTElement(geom.wkt, srid)

# modify Wk 8 tutorial codes into a function 
def transform_spatial_col(df:pd.DataFrame, spatial_col:str,srid:int):
    df_og = df.copy()  # creating a copy of the original for later
    df['geom'] = df[spatial_col].apply(lambda x: create_wkt_element(geom=x,srid=srid))  # applying the create_wkt_element
    df = df.drop(columns=spatial_col)  # deleting the old copy
    return df 

In [None]:
transform_spatial_col(catchments_primary, 'geometry', 4326)

In [None]:
transform_spatial_col(catchments_future, 'geometry', 4326)

In [None]:
transform_spatial_col(catchments_secondary, 'geometry', 4326)

In [None]:

dict(population.dtypes)


In [None]:
from sqlalchemy import create_engine
import psycopg2
import psycopg2.extras
import json

credentials = "Credentials.json"

def pgconnect(credential_filepath, db_schema="public"):
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)
        host       = db_conn_dict['host']
        db_user    = db_conn_dict['user']
        db_pw      = db_conn_dict['password']
        default_db = db_conn_dict['user']
        port       = db_conn_dict['port']
        try:
            db = create_engine(f'postgresql+psycopg2://{db_user}:{db_pw}@{host}:{port}/{default_db}', echo=False)
            conn = db.connect()
            print('Connected successfully.')
        except Exception as e:
            print("Unable to connect to the database.")
            print(e)
            db, conn = None, None
        return db,conn

def query(conn, sqlcmd, args=None, df=True):
    result = pd.DataFrame() if df else None
    try:
        if df:
            result = pd.read_sql_query(sqlcmd, conn, params=args)
        else:
            result = conn.execute(text(sqlcmd), args).fetchall()
            result = result[0] if len(result) == 1 else result
    except Exception as e:
        print("Error encountered: ", e, sep='\n')
    return result

In [None]:
db, conn = pgconnect(credentials)

In [None]:
query(conn, "select PostGIS_Version()")

## Ingesting into SQL

In [None]:
conn.rollback()

conn.execute(text("""
DROP TABLE IF EXISTS income;
CREATE TABLE income (
    sa2_code21 INTEGER,
    sa2_name VARCHAR(100),
    earners INTEGER, 
    median_age INTEGER,
    median_income INTEGER,
    mean_income INTEGER
);"""
))

conn.execute(text("""
DROP TABLE IF EXISTS stops;
CREATE TABLE stops (
    stopid INTEGER UNIQUE,  
    stop_code INTEGER, 
    stop_name VARCHAR(100),
    parent_station INTEGER,
    geom GEOMETRY(POINT,4326)
);"""
))

In [None]:
conn.rollback()
income.to_sql("income", conn, if_exists='append', index=False)
df = query(conn, "select * from income")


In [None]:
df.drop_duplicates()

In [19]:

stops.to_sql('stops', conn, if_exists='append', index=False, dtype={'geom': Geometry('POINT', 4326)})
query(conn, "select * from stops")

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,location_type,parent_station,wheelchair_boarding,platform_code
0,200039,200039.0,"Central Station, Eddy Av, Stand A",-33.882206,151.206665,,200060,0,
1,200054,200054.0,"Central Station, Eddy Av, Stand D",-33.882042,151.206991,,200060,0,
2,200060,,Central Station,-33.884084,151.206292,1.0,,0,
3,201510,,Redfern Station,-33.891690,151.198866,1.0,,0,
4,201646,201646.0,"Redfern Station, Gibbons St, Stand B",-33.893329,151.198882,,201510,0,
...,...,...,...,...,...,...,...,...,...
114713,212753,212753.0,"Sydney Olympic Park Wharf, Side B",-33.822016,151.078797,,21271,1,B
114714,2137185,2137185.0,"Cabarita Wharf, Side A",-33.840669,151.116926,,21371,1,1A
114715,2137186,2137186.0,"Cabarita Wharf, Side B",-33.840769,151.116899,,21371,1,1B
114716,21501,21501.0,Parramatta Wharf,-33.813904,151.010577,,2150112,1,


In [None]:
target_url = 'https://maps.six.nsw.gov.au/arcgis/rest/services/public/NSW_POI/MapServer'

In [None]:
conn.close()
db.dispose()