## Preliminaries

In [1]:
# Import necessary libraries
import geopandas as gpd
import pandas as pd
import os

In [2]:
# Work from the repository root so the relative "DATA/..." paths used when
# loading the shapefiles resolve. Set the CAPSTONE_REPO_DIR environment
# variable to run the notebook from another checkout; the original location
# is kept as the fallback so existing behaviour is unchanged.
os.chdir(os.environ.get("CAPSTONE_REPO_DIR", "C:/Users/emshe/Desktop/BRAINSTATION/CAPSTONE/GIT_REPO"))

## Helper functions

In [3]:
# Define function to examine dataframes

def examine_df(name,df):
    """
    Print and display a quick overview of a dataframe.

    Parameters
    ----------
    name : str
        Human-readable label interpolated into the printed messages.
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Frame to summarise.

    Returns
    -------
    None
        The record count, column index, ``df.info()`` report and the first
        five rows are written to the cell output.
    """

    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() rendered a stray "None".
    df.info()
    print(f"\n\nSample of records in the {name}:")
    display(df.head(5))

## Load and examine geographic files

In [24]:
# Read the zone, neighborhood and census-tract boundary layers.
# Paths are relative to the current working directory.
zones_gdf, nbhds_gdf, tracts_gdf = (
    gpd.read_file(shapefile_path)
    for shapefile_path in (
        "DATA/GEOGRAPHIC/ZONE_2023/Zone_2023/Zone_2023.shp",
        "DATA/GEOGRAPHIC/NBHD_2023/Nbhd_2023/Nbhd_2023.shp",
        "DATA/GEOGRAPHIC/TRACT_2021/tract_2021.shp",
    )
)

In [10]:
# Import the processed rent CSVs and obtain lists of Vancouver tracts and nbhds.
# Paths are relative to the working directory (the repository root), matching
# how the shapefiles are loaded, instead of a user-specific absolute path.
# Tract IDs are read as strings so they are not parsed as floats.
rent_by_tract = pd.read_csv("DATA/ECONOMIC/PROCESSED/avg_rent_by_tract.csv", dtype={'Tract': str})
rent_by_nbhd = pd.read_csv("DATA/ECONOMIC/PROCESSED/avg_rent_by_neigh.csv")

# Distinct neighborhood names and tract IDs present in the rent data
van_nbhds = rent_by_nbhd['Neigh'].unique()
van_tracts = rent_by_tract['Tract'].unique()

In [11]:
# Print a summary of each geographic layer, keyed by the label to show

gdfs = {
    "Zones Dataframe": zones_gdf,
    "Neighborhood dataframe": nbhds_gdf,
    "Census tract dataframe": tracts_gdf,
}
for name in gdfs:
    df = gdfs[name]
    examine_df(name, df)



Number of records in the Zones Dataframe is: 521

The columns in the Zones Dataframe are: Index(['zone', 'zone_geometry'], dtype='object')


 Other info about Zones Dataframe:
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 521 entries, 0 to 520
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   zone           521 non-null    object  
 1   zone_geometry  521 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 8.3+ KB


None



Sample of records in the Zones Dataframe:


Unnamed: 0,zone,zone_geometry
0,Abbotsford,"POLYGON ((-13618744.996 6293386.567, -13618650..."
1,Mission,"POLYGON ((-13621492.351 6336339.105, -13621109..."
2,Barrie,"POLYGON ((-8881653.78 5564338.457, -8881238.81..."
3,City of Belleville,"POLYGON ((-8586806.986 5526375.747, -8585636.5..."
4,City of Quinte West,"POLYGON ((-8633033.476 5539563.948, -8632904.4..."




Number of records in the Neighborhood dataframe is: 986

The columns in the Neighborhood dataframe are: Index(['nbhd', 'nbhd_geometry'], dtype='object')


 Other info about Neighborhood dataframe:
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   nbhd           986 non-null    object  
 1   nbhd_geometry  986 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 15.5+ KB


None



Sample of records in the Neighborhood dataframe:


Unnamed: 0,nbhd,nbhd_geometry
0,Mill Lake,"POLYGON ((-13614863.669 6283280.399, -13614864..."
1,Abbotsford/McMillan,"POLYGON ((-13613246.325 6283847.766, -13613059..."
2,Townline/Clearbrook,"POLYGON ((-13613010.778 6288767.654, -13612878..."
3,Mt. Lehman/Aberdeen/Poplar,"POLYGON ((-13621010.048 6293378.663, -13621041..."
4,Whatcom/Sumas,"POLYGON ((-13604657.482 6297077.561, -13604631..."




Number of records in the Census tract dataframe is: 6247

The columns in the Census tract dataframe are: Index(['tract', 'tract_geometry'], dtype='object')


 Other info about Census tract dataframe:
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 6247 entries, 0 to 6246
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   tract           6247 non-null   object  
 1   tract_geometry  6247 non-null   geometry
dtypes: geometry(1), object(1)
memory usage: 97.7+ KB


None



Sample of records in the Census tract dataframe:


Unnamed: 0,tract,tract_geometry
0,5370001.08,"POLYGON ((7196507.366 869787.991, 7196501.617 ..."
1,10002.0,"POLYGON ((8980216.643 2151065.36, 8980377.609 ..."
2,5370001.09,"POLYGON ((7196437.003 869160.246, 7196434.403 ..."
3,5370120.02,"POLYGON ((7189475.703 865662.849, 7189448.943 ..."
4,10006.0,"POLYGON ((8980091.143 2152478.609, 8980100.254..."


## Clean and Preprocess GDFs

In [25]:
# Drop unnecessary columns from gdfs and rename remaining columns
#
# Each layer is reduced to a name/ID column plus its geometry, and both are
# given layer-specific names. The renames are chained (no inplace=True) so no
# frame produced by slicing is mutated in place, and the geometry column is
# renamed with rename_geometry(), which renames the active geometry column and
# keeps it registered as the active geometry; a plain rename(columns=...) is
# not guaranteed to do that across geopandas versions.

# Zones: keep the zone name and geometry
zone_columns = ['ZONENAME_E', 'geometry']
zones_gdf = (
    zones_gdf[zone_columns]
    .rename(columns={'ZONENAME_E': 'zone'})
    .rename_geometry('zone_geometry')
)


# Restrict neighborhoods to Vancouver before dropping columns
# (METCODE is needed for the filter and is not kept afterwards)
nbhds_gdf = nbhds_gdf[nbhds_gdf['METCODE'] == '2410']

# Neighborhoods: keep the neighborhood name and geometry
nbhd_columns = ['NBHDNAME_E', 'geometry']
nbhds_gdf = (
    nbhds_gdf[nbhd_columns]
    .rename(columns={'NBHDNAME_E': 'nbhd'})
    .rename_geometry('nbhd_geometry')
)


# Census tracts: keep the tract ID and geometry
tract_columns = ['CTUID', 'geometry']
tracts_gdf = (
    tracts_gdf[tract_columns]
    .rename(columns={'CTUID': 'tract'})
    .rename_geometry('tract_geometry')
)


In [26]:
# Rename neighborhoods in nbhds_gdf
#
# Maps two shapefile neighborhood labels to alternative spellings.
# NOTE(review): presumably these are the spellings used in the
# rent-by-neighborhood CSV that the frame is filtered against afterwards -
# confirm against that file.

nbhd_renames = {
    'Hastings/Grandview/Woodlands': 'Hastings/Sunrise/Grandview/Woodlands',
    'Dundrave/W Vancouver Remainder': 'Dundrave/West Vancouver Remainder'
}

# Apply the replacements. assign() returns a new frame, so the column is not
# written into a frame that was produced by filtering (which can trigger
# pandas' chained-assignment warning).
nbhds_gdf = nbhds_gdf.assign(nbhd=nbhds_gdf['nbhd'].replace(nbhd_renames))

## Restrict neighborhoods and census tracts to match CMHC economic data

In [27]:
# Keep only the neighborhoods whose name also appears in the CMHC economic data

nbhd_in_cmhc = nbhds_gdf['nbhd'].isin(van_nbhds)
nbhds_gdf = nbhds_gdf.loc[nbhd_in_cmhc]
examine_df('Vancouver Neighborhoods GDF', nbhds_gdf)



Number of records in the CMHC Neighborhood DF is: 68

The columns in the CMHC Neighborhood DF are: Index(['nbhd', 'nbhd_geometry'], dtype='object')


 Other info about CMHC Neighborhood DF:
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 68 entries, 753 to 822
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   nbhd           68 non-null     object  
 1   nbhd_geometry  68 non-null     geometry
dtypes: geometry(1), object(1)
memory usage: 1.6+ KB


None



Sample of records in the CMHC Neighborhood DF:


Unnamed: 0,nbhd,nbhd_geometry
753,West End/Stanley Park North,"POLYGON ((-13707903.991 6324277.893, -13707728..."
754,West End/Stanley Park South,"POLYGON ((-13709954.808 6328768.906, -13708979..."
755,English Bay,"POLYGON ((-13708084.955 6323746.439, -13707920..."
756,Downtown Central,"POLYGON ((-13706769.27 6325491.913, -13706673...."
757,North False Creek,"POLYGON ((-13705341.854 6321743.569, -13705338..."


In [29]:

# Number of distinct tract IDs taken from the rent-by-tract CSV; a reference
# point for how many rows remain once the tract shapefile is filtered to them.
len(van_tracts)

366

In [28]:
# Keep only the census tracts whose ID also appears in the CMHC economic data

tract_in_cmhc = tracts_gdf['tract'].isin(van_tracts)
tracts_gdf = tracts_gdf.loc[tract_in_cmhc]
examine_df('Vancouver Tracts GDF', tracts_gdf)



Number of records in the Vancouver Tracts GDF is: 327

The columns in the Vancouver Tracts GDF are: Index(['tract', 'tract_geometry'], dtype='object')


 Other info about Vancouver Tracts GDF:
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 327 entries, 564 to 6136
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   tract           327 non-null    object  
 1   tract_geometry  327 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 7.7+ KB


None



Sample of records in the Vancouver Tracts GDF:


Unnamed: 0,tract,tract_geometry
564,9330101.03,"POLYGON ((4023460.017 2008667.226, 4023365.523..."
572,9330101.04,"POLYGON ((4024026.294 2009125.22, 4023968.611 ..."
578,9330014.01,"POLYGON ((4019507.649 1999133.317, 4019495.531..."
591,9330014.02,"POLYGON ((4020090.98 2000003.797, 4020068.209 ..."
635,9330292.03,"POLYGON ((4042685.489 1992879.183, 4042679.686..."


## Use geometry columns to merge shapefile datasets

In [27]:
# Perform spatial join: each neighborhood gets its parent zone's info
#
# This cell previously read `van_nbhds_df` / `van_zones_df`, which no cell in
# the notebook defines, and selected 'ZONECODE' / 'ZONENAME_E', which the
# cleaning step drops or renames - so it failed on a fresh kernel. It now
# starts from the cleaned `nbhds_gdf` / `zones_gdf` frames and the 'zone'
# column. The old names are still bound in case later cells refer to them.
# NOTE(review): `zones_gdf` is not restricted to Vancouver here; with a left
# join that only means more candidate zones are tested - confirm that no
# Vancouver-only zone frame was intended.
van_zones_df = zones_gdf

# Ensure CRS matches
van_nbhds_df = nbhds_gdf.to_crs(van_zones_df.crs)


# how='left' keeps every neighborhood row; predicate='within' attaches a zone
# only when the neighborhood polygon lies within that zone polygon.
van_nbhds_with_zone = gpd.sjoin(
    van_nbhds_df,
    van_zones_df[['zone', 'zone_geometry']],
    how='left',
    predicate='within'
)