This notebook aims to bring in various liveability features and aggregate them by suburb to be used to answer the 3 key questions

In [12]:
#Import Dependancies
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import Point

In [13]:
# Define Functions 

# Define a list of direction words
direction_words = ['north', 'south', 'east', 'west']

# Function to clean suburb names by removing direction words and 
# converting to lowercase 
def clean_suburb_name(suburb):
    suburb = suburb.lower()  # Convert to lowercase
    for word in direction_words:
        suburb = suburb.replace(word, '').strip()
    return suburb

In [14]:
import os
os.getcwd()

'd:\\Sampling\\real-estate-industry-project-open-source-industry-project-40\\notebooks\\preprocessing'

In [15]:

# Load Suburb data
landing_directory = '../../data/landing'
vic_suburbs = gpd.read_file(f"{landing_directory}/GDA2020/vic_localities.shp")

#Rename and clean suburb names
vic_suburbs.rename(columns={'LOC_NAME': 'suburb_name'}, inplace=True)
vic_suburbs['suburb_name_cleaned'] = \
    vic_suburbs['suburb_name'].apply(clean_suburb_name)
vic_suburbs.head()

# Check for duplicates in the 'suburb' column of aggregate_df
duplicates_in_vic_suburbs=\
    vic_suburbs[vic_suburbs.duplicated(subset='suburb_name_cleaned', 
                                       keep=False)]

# Drop duplicates
vic_suburbs = vic_suburbs.drop_duplicates(subset='suburb_name_cleaned', 
                                          keep='first')



In [16]:
# Find number of schools in each suburb

# Load Schools data
schools = gpd.read_file("../../data/landing/schools")
schools.head()

# Map geometry to determine which suburb each school is in 
schools['geometry'] = \
    schools.apply(lambda row: Point(row['X'], row['Y']), axis=1)
joined_data = gpd.sjoin(vic_suburbs, schools, op='contains')

# Group by suburb and count how many schools are in each 
school_counts = \
    joined_data.groupby('suburb_name_cleaned')['School_Name']\
        .count().reset_index()
school_counts.rename(columns={'School_Name': 'School_Count'}, inplace=True)

# Create the 'aggregate_df' by merging 'vic_suburbs' 
# and 'school_counts' on 'suburb_name_cleaned' with a left join
aggregate_df = \
    vic_suburbs[['suburb_name_cleaned']].merge(school_counts, 
                                               left_on='suburb_name_cleaned', 
                                               right_on='suburb_name_cleaned',
                                                 how='left')

# Rename columns as per your requirement
aggregate_df.rename(columns=
                    {'suburb_name_cleaned': 'suburb', 
                     'School_Count': 'num_schools'}, inplace=True)

# Fill NaN values in 'num_schools' with 0
aggregate_df['num_schools'].fillna(0, inplace=True)

# Print or use 'aggregate_df' as needed
print(aggregate_df)


  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:7844
Right CRS: None

  joined_data = gpd.sjoin(vic_suburbs, schools, op='contains')


          suburb  num_schools
0       abbeyard          0.0
1     abbotsford          2.0
2     aberfeldie          4.0
3      aberfeldy          0.0
4        acheron          0.0
...          ...          ...
2632     yundool          0.0
2633      yuroke          0.0
2634     yuulong          0.0
2635     zeerust          1.0
2636   zumsteins          0.0

[2637 rows x 2 columns]


In [17]:
# Load Healthcare data
healthcare = gpd.read_file(f"{landing_directory}/healthcare_data.csv")
healthcare.head()

# Clean suburb names in healthcare data
healthcare['suburb_cleaned'] = healthcare['Suburb'].apply(clean_suburb_name)

# manually map the 4 hospitals with discrepancies
# Define the mapping dictionary for manual corrections
corrections = {
    'albury': 'Wodonga',
    'koo wee rup': 'Koo Wee Rup',
    'lower templestowe': 'Templestowe Lower',
    'mt eliza': 'Mount Eliza'
}

# Apply the corrections and create final 'suburb_name' column to use for mapping 
healthcare['suburb_name'] = healthcare['suburb_cleaned'].replace(corrections)

# Group by suburb and count how many hospitals are in each
hospital_counts = \
    healthcare.groupby('suburb_name')['Formal Name'].count().reset_index()
hospital_counts.rename(columns={'Formal Name': 'Hospital_Count'}, inplace=True)
print(hospital_counts.head())

# Create the 'final_df' by merging 'aggregate_df' and 'hospital_counts' 
# on the 'suburb' column with a left join
aggregate_df = \
    aggregate_df.merge(hospital_counts, 
                       left_on='suburb', right_on='suburb_name', how='left')

# Drop the 'suburb_name' column from 'hospital_counts' 
# (if not needed in the final result)
hospital_counts.drop('suburb_name', axis=1, inplace=True)

# Fill NaN values in 'Hospital_Count' with 0
aggregate_df['Hospital_Count'].fillna(0, inplace=True)
aggregate_df.drop(columns=['suburb_name'], inplace=True)

# Print or use 'final_df' as needed
print(aggregate_df)

         suburb_name  Hospital_Count
0        Mount Eliza               1
1  Templestowe Lower               1
2            Wodonga               1
3          alexandra               1
4             altona               1
          suburb  num_schools  Hospital_Count
0       abbeyard          0.0             0.0
1     abbotsford          2.0             0.0
2     aberfeldie          4.0             0.0
3      aberfeldy          0.0             0.0
4        acheron          0.0             0.0
...          ...          ...             ...
2632     yundool          0.0             0.0
2633      yuroke          0.0             0.0
2634     yuulong          0.0             0.0
2635     zeerust          1.0             0.0
2636   zumsteins          0.0             0.0

[2637 rows x 3 columns]


In [18]:
# Bring in Open Space Data

sf = \
    gpd.read_file(f"{landing_directory}/openspace_locations/\
VPA_Draft_Open_Space_Data.shp")

# Spatial join open spaces with suburbs
open_space_with_suburbs = gpd.sjoin(sf, vic_suburbs, how='left', op='within')

# Extract relevant columns (e.g., 'OpenSpaceName' and 'Suburb') from the result
open_space_with_suburbs = open_space_with_suburbs[['FID', 
                                                   'suburb_name_cleaned']]
suburb_open_space_counts = open_space_with_suburbs\
    .groupby('suburb_name_cleaned').size().reset_index(name='OpenSpaceCount')

# Merge suburb_open_space_counts with aggregate_df
aggregate_df = aggregate_df.merge(suburb_open_space_counts, left_on='suburb', 
                                  right_on='suburb_name_cleaned', how='left')

# Fill NaN values in 'OpenSpaceCount' with 0
aggregate_df['OpenSpaceCount'].fillna(0, inplace=True)
aggregate_df.drop(columns=['suburb_name_cleaned'], inplace=True)

  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: EPSG:7844

  open_space_with_suburbs = gpd.sjoin(sf, vic_suburbs, how='left', op='within')


In [19]:
# Bring in PTV Data
LANDING_DATA_DIR_PTV = "../../data/landing/ptv"
RAW_DATA_DIR_PTV = "../../data/raw/ptv"

# Load Shapefiles

metro_buses_stops_gdf = \
    gpd.read_file(f"{RAW_DATA_DIR_PTV}/metro_buses_stops.shp")
metro_trains_stops_gdf = \
    gpd.read_file(f"{RAW_DATA_DIR_PTV}/metro_trains_stops.shp")
metro_trams_stops_gdf = \
    gpd.read_file(f"{RAW_DATA_DIR_PTV}/metro_trams_stops.shp")
reg_buses_stops_gdf = \
    gpd.read_file(f"{RAW_DATA_DIR_PTV}/reg_buses_stops.shp")
reg_coaches_stops_gdf = \
    gpd.read_file(f"{RAW_DATA_DIR_PTV}/reg_coaches_stops.shp")
reg_trains_stops_gdf = \
    gpd.read_file(f"{RAW_DATA_DIR_PTV}/reg_trains_stops.shp")

# Count different types of stops within each suburb

metro_buses_stops_with_suburbs = \
    gpd.sjoin(metro_buses_stops_gdf, vic_suburbs, how='left', op='within')
metro_trains_stops_with_suburbs = \
    gpd.sjoin(metro_trains_stops_gdf, vic_suburbs, how='left', op='within')
metro_trams_stops_with_suburbs = \
    gpd.sjoin(metro_trams_stops_gdf, vic_suburbs, how='left', op='within')
reg_buses_stops_with_suburbs = \
    gpd.sjoin(reg_buses_stops_gdf, vic_suburbs, how='left', op='within')
reg_coaches_stops_with_suburbs = \
    gpd.sjoin(reg_coaches_stops_gdf, vic_suburbs, how='left', op='within')
reg_trains_stops_with_suburbs = \
    gpd.sjoin(reg_trains_stops_gdf, vic_suburbs, how='left', op='within')

# Group by suburb and count the number of stops for each type of public trans
metro_buses_counts = \
    metro_buses_stops_with_suburbs\
        .groupby('suburb_name_cleaned').size().\
            reset_index(name='MetroBusesCount')
metro_trains_counts = \
    metro_trains_stops_with_suburbs.groupby('suburb_name_cleaned')\
        .size().reset_index(name='MetroTrainsCount')
metro_trams_counts =\
      metro_trams_stops_with_suburbs.groupby('suburb_name_cleaned')\
        .size().reset_index(name='MetroTramsCount')
reg_buses_counts = \
    reg_buses_stops_with_suburbs.groupby('suburb_name_cleaned')\
        .size().reset_index(name='RegBusesCount')
reg_coaches_counts = \
    reg_coaches_stops_with_suburbs.groupby('suburb_name_cleaned')\
        .size().reset_index(name='RegCoachesCount')
reg_trains_counts = \
    reg_trains_stops_with_suburbs.groupby('suburb_name_cleaned')\
        .size().reset_index(name='RegTrainsCount')


# Merge the counts for each type of stop into a single DataFrame
transport_stops_counts = metro_buses_counts.merge(metro_trains_counts, 
                                                  on='suburb_name_cleaned', 
                                                  how='left')
transport_stops_counts = transport_stops_counts.merge(metro_trams_counts, 
                                                      on='suburb_name_cleaned',
                                                        how='left')
transport_stops_counts = transport_stops_counts.merge(reg_buses_counts, 
                                                      on='suburb_name_cleaned',
                                                        how='left')
transport_stops_counts = transport_stops_counts.merge(reg_coaches_counts, 
                                                      on='suburb_name_cleaned', 
                                                      how='left')
transport_stops_counts = transport_stops_counts.merge(reg_trains_counts, 
                                                      on='suburb_name_cleaned',
                                                        how='left')

# Sum the counts for each type of stop to get the total public transport stops
transport_stops_counts['TotalPublicTransportStops'] = \
    transport_stops_counts[['MetroBusesCount', 'MetroTrainsCount', 
                            'MetroTramsCount', 'RegBusesCount', 
                            'RegCoachesCount', 'RegTrainsCount']].sum(axis=1)

# Define the columns to include in the TotalBusStops and TotalTrainStops
bus_columns = ['MetroBusesCount', 'RegBusesCount', 'RegCoachesCount']
train_columns = ['MetroTrainsCount', 'RegTrainsCount']

# Create the TotalBusStops and TotalTrainStops columns and drop sub categories
transport_stops_counts['TotalBusStops'] = \
    transport_stops_counts[bus_columns].sum(axis=1)
transport_stops_counts['TotalTrainStops'] = \
    transport_stops_counts[train_columns].sum(axis=1)
transport_stops_counts = \
    transport_stops_counts.drop(columns=bus_columns)
transport_stops_counts = \
    transport_stops_counts.drop(columns=train_columns)

# Merge transport_stops_counts with aggregate_df
aggregate_df = \
    aggregate_df.merge(transport_stops_counts, 
                       left_on='suburb', right_on='suburb_name_cleaned', 
                       how='left')

# Fill NaN values in count columns with 0
columns_to_fill = ['TotalBusStops', 'TotalTrainStops', 
                   'TotalPublicTransportStops']
aggregate_df[columns_to_fill] = aggregate_df[columns_to_fill]\
    .fillna(0).astype(int)
aggregate_df.drop(columns=['suburb_name_cleaned'], inplace=True)

  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:7844

  gpd.sjoin(metro_buses_stops_gdf, vic_suburbs, how='left', op='within')
  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:7844

  gpd.sjoin(metro_trains_stops_gdf, vic_suburbs, how='left', op='within')
  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:7844

  gpd.sjoin(metro_trams_stops_gdf, vic_suburbs, how='left', op='within')
  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:7844

  gpd.sjoin(reg_buses_stops_gdf, vic_suburbs, how='left', op='within')
 

In [20]:
#Load crime data

crime_path = '../../data/landing/crime_data.xlsx'
crime_df = pd.read_excel(crime_path,sheet_name='Table 03')
crime_df['crime_suburb'] = crime_df['Suburb/Town Name']\
    .apply(clean_suburb_name)

# Group crime by suburb
crime_counts = crime_df.groupby('crime_suburb').size()\
    .reset_index(name='crime_count')

# Join to aggregate df
aggregate_df = aggregate_df.merge( crime_counts, left_on='suburb',
                                   right_on='crime_suburb', how='left')
aggregate_df = aggregate_df.drop(columns=['crime_suburb'])


In [22]:
aggregate_df[aggregate_df['suburb'] == 'melbourne']

Unnamed: 0,suburb,num_schools,Hospital_Count,OpenSpaceCount,MetroTramsCount,TotalPublicTransportStops,TotalBusStops,TotalTrainStops,crime_count
754,melbourne,1.0,14.0,53.0,22.0,32,9,1,4517.0


In [23]:
# Rename columns for consistency
# Create a mapping of old column names to new column names
column_mapping = {
    'num_schools': 'schools_count',
    'Hospital_Count': 'hospital_count',
    'OpenSpaceCount': 'open_space_count',
    'MetroTramsCount': 'trams_count',
    'TotalPublicTransportStops': 'public_transport_stops_count',
    'TotalBusStops': 'bus_stops_count',
    'TotalTrainStops': 'train_stops_count',
    'crime_count': 'crime_count'
}

# Use the rename() method to rename the columns
aggregate_df = aggregate_df.rename(columns=column_mapping)

# Save to curated data folder 

CURATED_DATA_DIR = "../../data/curated"
filename = "liveability_features.csv"
aggregate_df.to_csv(f"{CURATED_DATA_DIR}/{filename}", index=False)