## Preliminaries

In [16]:
# Import necessary libraries
import geopandas as gpd
import pandas as pd
import json
import os

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook_connected' # For plotly graphs to render in this environment

In [17]:
# Set directory
# Defaults to the original local checkout; set the CAPSTONE_REPO_DIR environment
# variable to run the notebook from a different clone without editing this cell.
os.chdir(os.environ.get("CAPSTONE_REPO_DIR", "C:/Users/emshe/Desktop/BRAINSTATION/CAPSTONE/GIT_REPO"))

## Helper functions

In [18]:
# Define function to examine dataframes

def examine_df(name,df):
    """
    Check basic info about a dataframe df

    Prints the record count, the column index and the df.info() summary,
    then displays the first five rows.

    Parameters
    ----------
    name : label used in the printed messages.
    df : dataframe (or geodataframe) to summarise.

    Returns
    -------
    None
    """

    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:")
    # info() prints its own summary and returns None, so it is called directly;
    # passing its result to display() would render a stray "None".
    df.info()
    print(f"\n\nSample of records in the {name}:")
    display(df.head(5))

In [19]:
# Define function to map geographic boundaries for some geographic dataframe

def map_boundaries(gdf, name_col, geo_col, title, center=None, zoom=8):
    """
    Map geographic regions with boundaries

    Parameters
    ----------
    gdf : geodataframe with one row per region. It is copied, not modified.
    name_col : column whose values identify each region; used to link the
        dataframe rows to the GeoJSON features.
    geo_col : name of the geometry column to draw.
    title : figure title.
    center : optional dict with "lat" and "lon" keys for the initial map
        center. Defaults to {"lat": 49.25, "lon": -123.1}.
    zoom : initial zoom level. Defaults to 8.

    Returns
    -------
    None. The figure is rendered with fig.show().
    """

    if center is None:
        center = {"lat": 49.25, "lon": -123.1}

    # Prepare geographic dataframe for conversion into json format
    new_gdf = gdf.copy()
    new_gdf = new_gdf.set_geometry(geo_col)
    # Row position is used as the color value so that every region is shaded
    new_gdf['dummy'] = list(range(len(new_gdf)))
    # Reproject to EPSG:4326 (lon/lat), the same units as the map center
    new_gdf = new_gdf.to_crs("EPSG:4326")

    # Use geopandas to generate a valid geojson; name_col stays a regular
    # column so it appears under each feature's "properties"
    geojson = json.loads(new_gdf.to_json())

    # Plot
    fig = px.choropleth_map(
                data_frame = new_gdf,
                   geojson = geojson,
                 locations = name_col,
                     color = "dummy",
              featureidkey = f"properties.{name_col}",  # GeoJSON path
                 map_style = "carto-positron",
                    center = center,
                      zoom = zoom,
                   opacity = 0.6,
                     title = title
    )
    fig.update_traces(marker_line_width=0.5, marker_line_color='black')
    fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})

    fig.show()

## Load and examine geographic files

In [20]:
# Read the zone and neighborhood shapefiles (paths are relative to the working directory)
zones_gdf, nbhds_gdf = (
    gpd.read_file(shapefile_path)
    for shapefile_path in (
        "DATA/GEOGRAPHIC/ZONE_2023/Zone_2023/Zone_2023.shp",
        "DATA/GEOGRAPHIC/NBHD_2023/Nbhd_2023/Nbhd_2023.shp",
    )
)

In [21]:
# Import the processed rent-by-neighborhood csv and obtain the lists of
# Vancouver zones and neighborhoods used to filter the shapefiles

# Path is relative to the working directory set by os.chdir above, the same
# way the shapefiles are located
rent_by_nbhd = pd.read_csv("DATA/ECONOMIC/PROCESSED/avg_rent_by_neigh.csv")

# Obtain lists of Vancouver zones and neighborhoods

van_zones = ['West End/Stanley Park', 'English Bay', 'Downtown',
       'South Granville/Oak', 'Kitsilano/Point Grey',
       'Westside/Kerrisdale', 'Marpole', 'Mount Pleasant/Renfrew Heights',
       'East Hastings', 'Southeast Vancouver',
       'University Endowment Lands', 'Central Park/Metrotown',
       'Southeast Burnaby', 'North Burnaby', 'New Westminster',
       'North Vancouver CY', 'North Vancouver DM', 'West Vancouver',
       'Richmond', 'Delta', 'Surrey', 'White Rock',
       'Langley City and Langley DM', 'Tri-Cities',
       'Maple Ridge/Pitt Meadows'] # copy and pasted from 02_economic_data_loading_cleaning.ipynb

# Distinct neighborhood names present in the rent data
van_nbhds = rent_by_nbhd['Neigh'].unique()


In [22]:
# Summarise each freshly loaded geodataframe

gdfs = dict([
    ("Zones Dataframe", zones_gdf),
    ("Neighborhood dataframe", nbhds_gdf),
])
for name in gdfs:
    df = gdfs[name]
    examine_df(name, df)



Number of records in the Zones Dataframe is: 521

The columns in the Zones Dataframe are: Index(['OBJECTID', 'METZONE_UI', 'METCODE', 'ZONECODE', 'ZONENAME_E',
       'ZONENAME_F', 'ZONENAMELO', 'ZONENAME_1', 'YEAR', 'SHAPE_Leng',
       'SHAPE_Area', 'geometry'],
      dtype='object')


 Other info about Zones Dataframe:
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 521 entries, 0 to 520
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   OBJECTID    521 non-null    int64         
 1   METZONE_UI  521 non-null    object        
 2   METCODE     521 non-null    object        
 3   ZONECODE    521 non-null    object        
 4   ZONENAME_E  521 non-null    object        
 5   ZONENAME_F  521 non-null    object        
 6   ZONENAMELO  521 non-null    object        
 7   ZONENAME_1  521 non-null    object        
 8   YEAR        521 non-null    datetime64[ms]
 9   SHAPE_Leng  521 non-null    

None



Sample of records in the Zones Dataframe:


Unnamed: 0,OBJECTID,METZONE_UI,METCODE,ZONECODE,ZONENAME_E,ZONENAME_F,ZONENAMELO,ZONENAME_1,YEAR,SHAPE_Leng,SHAPE_Area,geometry
0,1,11001,110,1,Abbotsford,Abbotsford,Abbotsford,Abbotsford,2023-02-01,170020.862842,906834500.0,"POLYGON ((-13618744.996 6293386.567, -13618650..."
1,2,11002,110,2,Mission,Mission,Mission,Mission,2023-02-01,118557.171064,614785700.0,"POLYGON ((-13621492.351 6336339.105, -13621109..."
2,3,12001,120,1,Barrie,Barrie,Barrie,Barrie,2023-02-01,241445.304719,1896428000.0,"POLYGON ((-8881653.78 5564338.457, -8881238.81..."
3,4,12201,122,1,City of Belleville,Ville de Belleville,City of Belleville,Ville de Belleville,2023-02-01,171216.88293,1137410000.0,"POLYGON ((-8586806.986 5526375.747, -8585636.5..."
4,5,12202,122,2,City of Quinte West,Ville de Quinte West,City of Quinte West,Ville de Quinte West,2023-02-01,204383.860705,1595469000.0,"POLYGON ((-8633033.476 5539563.948, -8632904.4..."




Number of records in the Neighborhood dataframe is: 986

The columns in the Neighborhood dataframe are: Index(['OBJECTID', 'METNBHD_UI', 'METCODE', 'NBHDCODE', 'NBHDNAME_E',
       'NBHDNAME_F', 'NBHDNAMELO', 'NBHDNAME_1', 'ZONECODE', 'SHAPE_Leng',
       'SHAPE_Area', 'geometry'],
      dtype='object')


 Other info about Neighborhood dataframe:
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   OBJECTID    986 non-null    int64   
 1   METNBHD_UI  986 non-null    object  
 2   METCODE     986 non-null    object  
 3   NBHDCODE    986 non-null    object  
 4   NBHDNAME_E  986 non-null    object  
 5   NBHDNAME_F  986 non-null    object  
 6   NBHDNAMELO  986 non-null    object  
 7   NBHDNAME_1  986 non-null    object  
 8   ZONECODE    986 non-null    object  
 9   SHAPE_Leng  986 non-null    float64 
 10  SHAPE_Area  986 non-null   

None



Sample of records in the Neighborhood dataframe:


Unnamed: 0,OBJECTID,METNBHD_UI,METCODE,NBHDCODE,NBHDNAME_E,NBHDNAME_F,NBHDNAMELO,NBHDNAME_1,ZONECODE,SHAPE_Leng,SHAPE_Area,geometry
0,1,110150,110,150,Mill Lake,Mill Lake,Mill Lake,Mill Lake,1,14520.6777,7132281.0,"POLYGON ((-13614863.669 6283280.399, -13614864..."
1,2,110300,110,300,Abbotsford/McMillan,Abbotsford/McMillan,Abbotsford/McMillan,Abbotsford/McMillan,1,15496.072027,10240460.0,"POLYGON ((-13613246.325 6283847.766, -13613059..."
2,3,110450,110,450,Townline/Clearbrook,Townline/Clearbrook,Townline/Clearbrook,Townline/Clearbrook,1,55574.561193,73185530.0,"POLYGON ((-13613010.778 6288767.654, -13612878..."
3,4,110600,110,600,Mt. Lehman/Aberdeen/Poplar,Mt. Lehman/Aberdeen/Poplar,Mt. Lehman/Aberdeen/Poplar,Mt. Lehman/Aberdeen/Poplar,1,111489.388041,298487300.0,"POLYGON ((-13621010.048 6293378.663, -13621041..."
4,5,110750,110,750,Whatcom/Sumas,Whatcom/Sumas,Whatcom/Sumas,Whatcom/Sumas,1,151290.228325,517789000.0,"POLYGON ((-13604657.482 6297077.561, -13604631..."


## Clean and Preprocess GDFs

In [23]:
# Keep only the Vancouver rows (METCODE '2410'), keep only the name and
# geometry columns, rename them, and register the renamed geometry column

# Zones
zone_columns = ['ZONENAME_E', 'geometry']
zones_gdf = (
    zones_gdf.loc[zones_gdf['METCODE'] == '2410', zone_columns]
    .rename(columns={'ZONENAME_E': 'zone', 'geometry': 'zone_geometry'})
    # the geometry column was renamed, so it has to be set as active again
    .set_geometry('zone_geometry')
)

# Neighborhoods
nbhd_columns = ['NBHDNAME_E', 'geometry']
nbhds_gdf = (
    nbhds_gdf.loc[nbhds_gdf['METCODE'] == '2410', nbhd_columns]
    .rename(columns={'NBHDNAME_E': 'nbhd', 'geometry': 'nbhd_geometry'})
    # the geometry column was renamed, so it has to be set as active again
    .set_geometry('nbhd_geometry')
)

In [24]:
# Replace selected neighborhood names in nbhds_gdf with the spellings given below

nbhd_renames = {
    'Hastings/Grandview/Woodlands': 'Hastings/Sunrise/Grandview/Woodlands',
    'Dundrave/W Vancouver Remainder': 'Dundrave/West Vancouver Remainder'
}

# Names that are not keys of nbhd_renames are left unchanged
nbhds_gdf = nbhds_gdf.assign(nbhd=nbhds_gdf['nbhd'].replace(nbhd_renames))

## Restrict zones and neighborhoods to match CMHC economic data

In [25]:
# Restrict to zones appearing in CMHC economic data (desired number is 25)

zones_gdf = zones_gdf[zones_gdf['zone'].isin(van_zones)]

# isin() only keeps exact name matches, so a CMHC zone spelled differently in
# the shapefile would go missing without notice; report any such zone
matched_zones = set(zones_gdf['zone'])
unmatched_zones = [zone_name for zone_name in van_zones if zone_name not in matched_zones]
if unmatched_zones:
    print(f"WARNING: CMHC zones with no matching geometry: {unmatched_zones}")

examine_df('Vancouver Zones GDF',zones_gdf)



Number of records in the Vancouver Zones GDF is: 25

The columns in the Vancouver Zones GDF are: Index(['zone', 'zone_geometry'], dtype='object')


 Other info about Vancouver Zones GDF:
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 25 entries, 253 to 277
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   zone           25 non-null     object  
 1   zone_geometry  25 non-null     geometry
dtypes: geometry(1), object(1)
memory usage: 600.0+ bytes


None



Sample of records in the Vancouver Zones GDF:


Unnamed: 0,zone,zone_geometry
253,West End/Stanley Park,"POLYGON ((-13709954.808 6328768.906, -13708979..."
254,English Bay,"POLYGON ((-13708084.955 6323746.439, -13707920..."
255,Downtown,"POLYGON ((-13703925.052 6324911.504, -13704352..."
256,South Granville/Oak,"POLYGON ((-13706574.186 6320718, -13706430.27 ..."
257,Kitsilano/Point Grey,"POLYGON ((-13707703.297 6322228.01, -13707671...."


In [26]:
# Restrict to neighborhoods appearing in CMHC economic data (desired number is 68)

nbhds_gdf = nbhds_gdf[nbhds_gdf['nbhd'].isin(van_nbhds)]

# isin() only keeps exact name matches, so a CMHC neighborhood spelled
# differently in the shapefile would go missing without notice; report any
# such name (missing values in the CMHC list are skipped)
matched_nbhds = set(nbhds_gdf['nbhd'])
unmatched_nbhds = [
    nbhd_name for nbhd_name in van_nbhds
    if pd.notna(nbhd_name) and nbhd_name not in matched_nbhds
]
if unmatched_nbhds:
    print(f"WARNING: CMHC neighborhoods with no matching geometry: {unmatched_nbhds}")

examine_df('Vancouver Neighborhoods GDF',nbhds_gdf)



Number of records in the Vancouver Neighborhoods GDF is: 68

The columns in the Vancouver Neighborhoods GDF are: Index(['nbhd', 'nbhd_geometry'], dtype='object')


 Other info about Vancouver Neighborhoods GDF:
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 68 entries, 753 to 822
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   nbhd           68 non-null     object  
 1   nbhd_geometry  68 non-null     geometry
dtypes: geometry(1), object(1)
memory usage: 1.6+ KB


None



Sample of records in the Vancouver Neighborhoods GDF:


Unnamed: 0,nbhd,nbhd_geometry
753,West End/Stanley Park North,"POLYGON ((-13707903.991 6324277.893, -13707728..."
754,West End/Stanley Park South,"POLYGON ((-13709954.808 6328768.906, -13708979..."
755,English Bay,"POLYGON ((-13708084.955 6323746.439, -13707920..."
756,Downtown Central,"POLYGON ((-13706769.27 6325491.913, -13706673...."
757,North False Creek,"POLYGON ((-13705341.854 6321743.569, -13705338..."


## Use geometry columns to merge shapefile datasets

In [27]:
# Spatial join: attach to each neighborhood the zone whose polygon contains it

# Both layers must share a CRS before they can be compared
nbhds_gdf = nbhds_gdf.to_crs(zones_gdf.crs)

# Left join keeps every neighborhood; a neighborhood that is not within any
# zone polygon is kept with a missing zone
nbhds_zones_gdf = nbhds_gdf.sjoin(
    zones_gdf[['zone', 'zone_geometry']],
    how='left',
    predicate='within',
)

examine_df('Neighborhoods with Zones GDF', nbhds_zones_gdf)



Number of records in the Neighborhoods with Zones GDF is: 68

The columns in the Neighborhoods with Zones GDF are: Index(['nbhd', 'nbhd_geometry', 'index_right', 'zone'], dtype='object')


 Other info about Neighborhoods with Zones GDF:
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 68 entries, 753 to 822
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   nbhd           68 non-null     object  
 1   nbhd_geometry  68 non-null     geometry
 2   index_right    64 non-null     float64 
 3   zone           64 non-null     object  
dtypes: float64(1), geometry(1), object(2)
memory usage: 2.7+ KB


None



Sample of records in the Neighborhoods with Zones GDF:


Unnamed: 0,nbhd,nbhd_geometry,index_right,zone
753,West End/Stanley Park North,"POLYGON ((-13707903.991 6324277.893, -13707728...",253.0,West End/Stanley Park
754,West End/Stanley Park South,"POLYGON ((-13709954.808 6328768.906, -13708979...",253.0,West End/Stanley Park
755,English Bay,"POLYGON ((-13708084.955 6323746.439, -13707920...",254.0,English Bay
756,Downtown Central,"POLYGON ((-13706769.27 6325491.913, -13706673....",255.0,Downtown
757,North False Creek,"POLYGON ((-13705341.854 6321743.569, -13705338...",255.0,Downtown


In [30]:
# Manually add missing zones for four neighborhoods

# Manual zone assignment for unmatched neighborhoods
manual_zone_assignments = {
    'White Rock': 'White Rock',
    'South Surrey': 'Surrey',
    'North Vancouver DM East': 'North Vancouver DM',
    'Lonsdale East': 'North Vancouver CY'
}

# Apply manual assignments: look up each neighborhood in the manual table and
# use that value only where the spatial join left the zone missing
manual_zones = nbhds_zones_gdf['nbhd'].map(manual_zone_assignments)
nbhds_zones_gdf['zone'] = nbhds_zones_gdf['zone'].fillna(manual_zones)

# Every neighborhood is expected to have a zone after the manual assignments
unassigned_nbhds = nbhds_zones_gdf.loc[nbhds_zones_gdf['zone'].isna(), 'nbhd'].tolist()
assert not unassigned_nbhds, f"Neighborhoods still without a zone: {unassigned_nbhds}"

In [31]:
# Re-examine the joined geodataframe after the manual zone assignments
examine_df('Neighborhoods with Zones', nbhds_zones_gdf)



Number of records in the Neighboorhods with Zones is: 68

The columns in the Neighboorhods with Zones are: Index(['nbhd', 'nbhd_geometry', 'index_right', 'zone'], dtype='object')


 Other info about Neighboorhods with Zones:
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 68 entries, 753 to 822
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   nbhd           68 non-null     object  
 1   nbhd_geometry  68 non-null     geometry
 2   index_right    64 non-null     float64 
 3   zone           68 non-null     object  
dtypes: float64(1), geometry(1), object(2)
memory usage: 2.7+ KB


None



Sample of records in the Neighboorhods with Zones:


Unnamed: 0,nbhd,nbhd_geometry,index_right,zone
753,West End/Stanley Park North,"POLYGON ((-13707903.991 6324277.893, -13707728...",253.0,West End/Stanley Park
754,West End/Stanley Park South,"POLYGON ((-13709954.808 6328768.906, -13708979...",253.0,West End/Stanley Park
755,English Bay,"POLYGON ((-13708084.955 6323746.439, -13707920...",254.0,English Bay
756,Downtown Central,"POLYGON ((-13706769.27 6325491.913, -13706673....",255.0,Downtown
757,North False Creek,"POLYGON ((-13705341.854 6321743.569, -13705338...",255.0,Downtown


In [32]:
# Process and export neighborhoods with zones geo-dataframe

# index_right is a by-product of the spatial join; errors='ignore' lets this
# cell be re-run after the column has already been dropped
nbhds_zones_gdf = nbhds_zones_gdf.drop(columns = ['index_right'], errors = 'ignore')

# Path is relative to the working directory set by os.chdir near the top of
# the notebook, the same way the input shapefiles are located
nbhds_zones_gdf.to_file("DATA/GEOGRAPHIC/PROCESSED/nbhds_with_zones.geojson", driver="GeoJSON")

## Visualize map of zones and neighborhoods

In [14]:
# Draw the neighborhood polygons on an interactive map
map_boundaries(
    gdf=nbhds_gdf,
    name_col="nbhd",
    geo_col="nbhd_geometry",
    title="Map of Vancouver Neighborhoods",
)

In [15]:
# Draw the survey zone polygons on an interactive map
map_boundaries(
    gdf=zones_gdf,
    name_col="zone",
    geo_col="zone_geometry",
    title="Map of Vancouver Survey Zones",
)