In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import folium


COVID_PATH = '/Users/nonthachaiplodthong/Documents/Nons_work/non-1b/non-1b/geo-folium/data/COVID19 cases.csv'
WBGEO_PATH = '/Users/nonthachaiplodthong/Documents/Nons_work/non-1b/non-1b/geo-folium/data/WB-demographic.csv'

# There are 135 neighbourhood names that match between the two datasets. 
However, there are 6 neighbourhood names in the COVID-19 cases dataset and 5 names in the WB-demographic dataset that do not match. This indicates that there might be some neighbourhoods that are named differently between the two datasets or that are present in one dataset but not the other.

In [4]:
df_covid = pd.read_csv(COVID_PATH)
df_demograp = pd.read_csv(WBGEO_PATH)

In [5]:
covid_case_head = df_covid.head()
wb_demographic_head = df_demograp.head()

(covid_case_head, wb_demographic_head)

(   _id  Assigned_ID Outbreak Associated       Age Group  \
 0    1            1                  NO  50 to 59 Years   
 1    2            2                  NO  50 to 59 Years   
 2    3            3                  NO  20 to 29 Years   
 3    4            4                  NO  60 to 69 Years   
 4    5            5                  NO  60 to 69 Years   
 
       Neighbourhood Name  FSA Source of Infection Classification Episode Date  \
 0        Willowdale East  M2N              Travel      CONFIRMED   2020-01-22   
 1        Willowdale East  M2N              Travel      CONFIRMED   2020-01-21   
 2      Parkwoods-Donalda  M3A              Travel      CONFIRMED   2020-02-05   
 3  Church-Yonge Corridor  M4W              Travel      CONFIRMED   2020-02-16   
 4  Church-Yonge Corridor  M4W              Travel      CONFIRMED   2020-02-20   
 
   Reported Date Client Gender   Outcome Ever Hospitalized Ever in ICU  \
 0    2020-01-23        FEMALE  RESOLVED                No          No

In [6]:
covid_neighbourhoods = df_covid['Neighbourhood Name'].unique()
wb_neighbourhoods = df_demograp['Neighbourhood'].unique()


In [7]:
# Check for any mismatches or differences in the lists
matching_neighbourhoods = set(covid_neighbourhoods).intersection(wb_neighbourhoods)
non_matching_covid = set(covid_neighbourhoods) - matching_neighbourhoods
non_matching_wb = set(wb_neighbourhoods) - matching_neighbourhoods

(len(matching_neighbourhoods), len(non_matching_covid), len(non_matching_wb))

(135, 6, 5)

In [8]:
merged_df = pd.merge(df_covid, df_demograp, 
                     left_on='Neighbourhood Name', 
                     right_on='Neighbourhood', 
                     how='inner')

# Now let's count the number of COVID cases for each neighbourhood
covid_cases_by_area = merged_df.groupby('Neighbourhood Name').size().reset_index(name='Number of Cases')

# Display the result
covid_cases_by_area.head()

Unnamed: 0,Neighbourhood Name,Number of Cases
0,Agincourt North,3021
1,Agincourt South-Malvern West,2703
2,Alderwood,1434
3,Annex,3738
4,Banbury-Don Mills,3202


In [9]:
# Perform the merge operation
merged_df = pd.merge(df_covid, df_demograp, left_on='Neighbourhood Name', right_on='Neighbourhood', how='inner')

# Calculate the total number of COVID cases per neighbourhood
covid_cases_count = merged_df.groupby('Neighbourhood Name')['Assigned_ID'].count().reset_index(name='Total COVID Cases')

# Merge this count with the demographic data to calculate the percentage
merged_population_cases = pd.merge(covid_cases_count, df_demograp, left_on='Neighbourhood Name', right_on='Neighbourhood')

# Calculate the percentage of COVID cases in each area relative to the population
merged_population_cases['COVID Cases Percentage'] = (merged_population_cases['Total COVID Cases'] / merged_population_cases['Total Population']) * 100

# Select only the relevant columns to display
result_df = merged_population_cases[['Neighbourhood Name', 'Total Population', 'Total COVID Cases', 'COVID Cases Percentage']]

result_df.head()


Unnamed: 0,Neighbourhood Name,Total Population,Total COVID Cases,COVID Cases Percentage
0,Agincourt North,30280,3021,9.976882
1,Agincourt South-Malvern West,21990,2703,12.291951
2,Alderwood,11900,1434,12.05042
3,Annex,29180,3738,12.810144
4,Banbury-Don Mills,26910,3202,11.898922


In [10]:
shapefile = '/Users/nonthachaiplodthong/Documents/Nons_work/non-1b/non-1b/geo-folium/data/NEIGHBORHOODS_WGS84_2.shp'
gdf = gpd.read_file(shapefile)
gdf.head()


Unnamed: 0,AREA_S_CD,AREA_NAME,geometry
0,97,Yonge-St.Clair (97),"POLYGON ((-79.39119 43.68108, -79.39141 43.680..."
1,27,York University Heights (27),"POLYGON ((-79.50529 43.75987, -79.50488 43.759..."
2,38,Lansing-Westgate (38),"POLYGON ((-79.43998 43.76156, -79.44004 43.761..."
3,31,Yorkdale-Glen Park (31),"POLYGON ((-79.43969 43.70561, -79.44011 43.705..."
4,16,Stonegate-Queensway (16),"POLYGON ((-79.49262 43.64744, -79.49277 43.647..."


In [11]:
covid_neighbourhoods = df_covid['Neighbourhood Name'].unique()
shapefile_neighbourhoods = gdf['AREA_NAME'].unique()

# Find the common and non-matching neighbourhoods
common_nbh = set(covid_neighbourhoods).intersection(shapefile_neighbourhoods)
# intersection() method returns a set that contains the similarity between two or more sets
non_matching_nbh = set(covid_neighbourhoods) - common_nbh
non_matching_shapefile = set(shapefile_neighbourhoods) - common_nbh

#(gdf.head(), covid_neighbourhoods, shapefile_neighbourhoods, common_nbh, non_matching_nbh, non_matching_shapefile)
(gdf.head(), common_nbh, non_matching_nbh, non_matching_shapefile)


(  AREA_S_CD                     AREA_NAME  \
 0       097           Yonge-St.Clair (97)   
 1       027  York University Heights (27)   
 2       038         Lansing-Westgate (38)   
 3       031       Yorkdale-Glen Park (31)   
 4       016      Stonegate-Queensway (16)   
 
                                             geometry  
 0  POLYGON ((-79.39119 43.68108, -79.39141 43.680...  
 1  POLYGON ((-79.50529 43.75987, -79.50488 43.759...  
 2  POLYGON ((-79.43998 43.76156, -79.44004 43.761...  
 3  POLYGON ((-79.43969 43.70561, -79.44011 43.705...  
 4  POLYGON ((-79.49262 43.64744, -79.49277 43.647...  ,
 set(),
 {'Agincourt North',
  'Agincourt South-Malvern West',
  'Alderwood',
  'Annex',
  'Banbury-Don Mills',
  'Bathurst Manor',
  'Bay Street Corridor',
  'Bayview Village',
  'Bayview Woods-Steeles',
  'Bedford Park-Nortown',
  'Beechborough-Greenbrook',
  'Bendale',
  'Birchcliffe-Cliffside',
  'Black Creek',
  'Blake-Jones',
  'Briar Hill - Belgravia',
  'Bridle Path-Sunnybro

# For COVID-19 data: 
Briar Hill - Belgravia, Cabbagetown-South St. James Town, Danforth-East York, Mimico (includes Humber Bay Shores), North St. James Town, and some entries are NaN (which likely indicates missing data).

# For the shapefile data: 
Briar Hill-Belgravia, Cabbagetown-South St.James Town, Danforth East York, Mimico, North St.James Town.

In [12]:
def clean_area_name(area_name):
    if '(' in area_name:
        name = area_name.split('(')[0].strip() # .strip removes leading and trailing spaces
    else:
        name = area_name.strip()
    return name

In [13]:
gdf['Cleaned_AREA_NAME'] = gdf['AREA_NAME'].apply(clean_area_name)
gdf.head()

Unnamed: 0,AREA_S_CD,AREA_NAME,geometry,Cleaned_AREA_NAME
0,97,Yonge-St.Clair (97),"POLYGON ((-79.39119 43.68108, -79.39141 43.680...",Yonge-St.Clair
1,27,York University Heights (27),"POLYGON ((-79.50529 43.75987, -79.50488 43.759...",York University Heights
2,38,Lansing-Westgate (38),"POLYGON ((-79.43998 43.76156, -79.44004 43.761...",Lansing-Westgate
3,31,Yorkdale-Glen Park (31),"POLYGON ((-79.43969 43.70561, -79.44011 43.705...",Yorkdale-Glen Park
4,16,Stonegate-Queensway (16),"POLYGON ((-79.49262 43.64744, -79.49277 43.647...",Stonegate-Queensway


In [15]:
gdf_cleaned_nbh = gdf['Cleaned_AREA_NAME'].unique() # .unique() returns an array of unique values

In [16]:
common_nbh_cleaned = set(covid_neighbourhoods).intersection(gdf_cleaned_nbh)
covid_after_cleaned = set(covid_neighbourhoods) - common_nbh_cleaned
non_matching_shapefile_cleaned = set(gdf_cleaned_nbh) - common_nbh_cleaned

gdf['Cleaned_AREA_NAME'].head(), common_nbh_cleaned, covid_after_cleaned, non_matching_shapefile_cleaned

(0             Yonge-St.Clair
 1    York University Heights
 2           Lansing-Westgate
 3         Yorkdale-Glen Park
 4        Stonegate-Queensway
 Name: Cleaned_AREA_NAME, dtype: object,
 {'Agincourt North',
  'Agincourt South-Malvern West',
  'Alderwood',
  'Annex',
  'Banbury-Don Mills',
  'Bathurst Manor',
  'Bay Street Corridor',
  'Bayview Village',
  'Bayview Woods-Steeles',
  'Bedford Park-Nortown',
  'Beechborough-Greenbrook',
  'Bendale',
  'Birchcliffe-Cliffside',
  'Black Creek',
  'Blake-Jones',
  'Bridle Path-Sunnybrook-York Mills',
  'Broadview North',
  'Brookhaven-Amesbury',
  'Caledonia-Fairbank',
  'Casa Loma',
  'Centennial Scarborough',
  'Church-Yonge Corridor',
  'Clairlea-Birchmount',
  'Clanton Park',
  'Cliffcrest',
  'Corso Italia-Davenport',
  'Danforth',
  'Don Valley Village',
  'Dorset Park',
  'Dovercourt-Wallace Emerson-Junction',
  'Downsview-Roding-CFB',
  'Dufferin Grove',
  'East End-Danforth',
  'Edenbridge-Humber Valley',
  'Eglinton East',
  '

# For the COVID-19 data:
    - Cabbagetown-South St James Town
    - Danforth-East York
    - Mimico (includes Humber Bay Shores)
    - North St James Town
    - Some entries are NaN (which likely indicates missing data).

In [17]:
def harmonize_neighbourhood_names(name):
    if pd.isna(name):
        return name
    name = name.replace(' - ', '-').replace(' ', '_').strip()
    # Replace common abbrevations
    name = name.replace('St.', 'St').replace('Saint', 'St')
    return name


In [18]:
df_covid['Harmonized_Neighbourhood'] = df_covid['Neighbourhood Name'].apply(harmonize_neighbourhood_names)
gdf['Harmonized_AREA_NAME'] = gdf['Cleaned_AREA_NAME'].apply(harmonize_neighbourhood_names)

# Now check for matching names again
harmonized_covid_neighbourhoods = df_covid['Harmonized_Neighbourhood'].unique()
harmonized_shapefile_neighbourhoods = gdf['Harmonized_AREA_NAME'].unique()

# Find the common and non-matching neighbourhood names
common_neighbourhoods_harmonized = set(harmonized_covid_neighbourhoods).intersection(harmonized_shapefile_neighbourhoods)
non_matching_neighbourhoods_covid_harmonized = set(harmonized_covid_neighbourhoods) - common_neighbourhoods_harmonized
non_matching_neighbourhoods_shapefile_harmonized = set(harmonized_shapefile_neighbourhoods) - common_neighbourhoods_harmonized

common_neighbourhoods_harmonized, non_matching_neighbourhoods_covid_harmonized, non_matching_neighbourhoods_shapefile_harmonized



({'Agincourt_North',
  'Agincourt_South-Malvern_West',
  'Alderwood',
  'Annex',
  'Banbury-Don_Mills',
  'Bathurst_Manor',
  'Bay_Street_Corridor',
  'Bayview_Village',
  'Bayview_Woods-Steeles',
  'Bedford_Park-Nortown',
  'Beechborough-Greenbrook',
  'Bendale',
  'Birchcliffe-Cliffside',
  'Black_Creek',
  'Blake-Jones',
  'Briar_Hill-Belgravia',
  'Bridle_Path-Sunnybrook-York_Mills',
  'Broadview_North',
  'Brookhaven-Amesbury',
  'Caledonia-Fairbank',
  'Casa_Loma',
  'Centennial_Scarborough',
  'Church-Yonge_Corridor',
  'Clairlea-Birchmount',
  'Clanton_Park',
  'Cliffcrest',
  'Corso_Italia-Davenport',
  'Danforth',
  'Don_Valley_Village',
  'Dorset_Park',
  'Dovercourt-Wallace_Emerson-Junction',
  'Downsview-Roding-CFB',
  'Dufferin_Grove',
  'East_End-Danforth',
  'Edenbridge-Humber_Valley',
  'Eglinton_East',
  'Elms-Old_Rexdale',
  'Englemount-Lawrence',
  'Eringate-Centennial-West_Deane',
  'Etobicoke_West_Mall',
  'Flemingdon_Park',
  'Forest_Hill_North',
  'Forest_Hill_S

# Final step
step has successfully matched almost all the neighbourhood names between the COVID-19 data and the shapefile data. The only remaining discrepancies are:

In the COVID-19 data: mimicoincludeshumberbayshores and some entries are NaN.
In the shapefile data: mimico.
The mimicoincludeshumberbayshores in the COVID-19 data seems to correspond to mimico in the shapefile data, with additional descriptive text in parentheses. We can now consider these two to be a match as well.




In [19]:
import re
# re is a regular expression operation

def final_harmonization(name):
    if pd.isna(name):
        return name
    name = re.sub(r'[^\w]', '', name.replace(' ', '').lower())
    return name

In [20]:
df_covid['Final_Harmonized_Neighbourhood'] = df_covid['Harmonized_Neighbourhood'].apply(final_harmonization)
gdf['Final_Harmonized_AREA_NAME'] = gdf['Harmonized_AREA_NAME'].apply(final_harmonization)

# Now check for matching names again
final_harmonized_covid_neighbourhoods = df_covid['Final_Harmonized_Neighbourhood'].unique()
final_harmonized_shapefile_neighbourhoods = gdf['Final_Harmonized_AREA_NAME'].unique()

# Find the common and non-matching neighbourhood names
final_common_neighbourhoods = set(final_harmonized_covid_neighbourhoods).intersection(final_harmonized_shapefile_neighbourhoods)
final_non_matching_neighbourhoods_covid = set(final_harmonized_covid_neighbourhoods) - final_common_neighbourhoods
final_non_matching_neighbourhoods_shapefile = set(final_harmonized_shapefile_neighbourhoods) - final_common_neighbourhoods

final_common_neighbourhoods, final_non_matching_neighbourhoods_covid, final_non_matching_neighbourhoods_shapefile



({'agincourt_north',
  'agincourt_southmalvern_west',
  'alderwood',
  'annex',
  'banburydon_mills',
  'bathurst_manor',
  'bay_street_corridor',
  'bayview_village',
  'bayview_woodssteeles',
  'bedford_parknortown',
  'beechboroughgreenbrook',
  'bendale',
  'birchcliffecliffside',
  'black_creek',
  'blakejones',
  'briar_hillbelgravia',
  'bridle_pathsunnybrookyork_mills',
  'broadview_north',
  'brookhavenamesbury',
  'caledoniafairbank',
  'casa_loma',
  'centennial_scarborough',
  'churchyonge_corridor',
  'clairleabirchmount',
  'clanton_park',
  'cliffcrest',
  'corso_italiadavenport',
  'danforth',
  'don_valley_village',
  'dorset_park',
  'dovercourtwallace_emersonjunction',
  'downsviewrodingcfb',
  'dufferin_grove',
  'east_enddanforth',
  'edenbridgehumber_valley',
  'eglinton_east',
  'elmsold_rexdale',
  'englemountlawrence',
  'eringatecentennialwest_deane',
  'etobicoke_west_mall',
  'flemingdon_park',
  'forest_hill_north',
  'forest_hill_south',
  'glenfieldjane_h

In [21]:
from folium.features import GeoJsonTooltip

merged_data = gdf.merge(df_covid, left_on='Final_Harmonized_AREA_NAME', right_on='Final_Harmonized_Neighbourhood')
geojs_data = merged_data.to_json()
map = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

In [22]:
total_cases_per_neighbourhood = df_covid.groupby('Final_Harmonized_Neighbourhood').size().reset_index(name='Total COVID Cases')

# Merge this count back with your geographic data
merged_data = gdf.merge(total_cases_per_neighbourhood, left_on='Final_Harmonized_AREA_NAME', right_on='Final_Harmonized_Neighbourhood')


In [23]:
print(merged_data.columns)
# Simplify the geometries to improve performance
gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.001, preserve_topology=True).buffer(0)


Index(['AREA_S_CD', 'AREA_NAME', 'geometry', 'Cleaned_AREA_NAME',
       'Harmonized_AREA_NAME', 'Final_Harmonized_AREA_NAME',
       'Final_Harmonized_Neighbourhood', 'Total COVID Cases'],
      dtype='object')


In [27]:
if 'Total COVID Cases' not in gdf.columns:
    covid_cases_counts = df_covid.groupby(
        'Final_Harmonized_Neighbourhood').size()
    covid_cases_counts = covid_cases_counts.reset_index(
        name='Total COVID Cases')
    # Merge the counts with the geographic data
    gdf = gdf.merge(covid_cases_counts, left_on='Final_Harmonized_AREA_NAME',
                    right_on='Final_Harmonized_Neighbourhood')

    # Now create the tooltip column
gdf['tooltip'] = gdf.apply(
    lambda x: f"{x['Final_Harmonized_AREA_NAME']} - {x['Total COVID Cases']} cases", axis=1)

# Check the first few entries in the tooltip column
gdf[['Final_Harmonized_AREA_NAME', 'Total COVID Cases', 'tooltip']].head()

Unnamed: 0,Final_Harmonized_AREA_NAME,Total COVID Cases,tooltip
0,yongestclair,1273,yongestclair - 1273 cases
1,york_university_heights,5626,york_university_heights - 5626 cases
2,lansingwestgate,1605,lansingwestgate - 1605 cases
3,yorkdaleglen_park,2983,yorkdaleglen_park - 2983 cases
4,stonegatequeensway,2868,stonegatequeensway - 2868 cases


In [28]:
# Base map
tdot_map = folium.Map(location=[43.7, -79.4], zoom_start=12)

# Choropleth
choropleth = folium.Choropleth(geo_data=geojs_data,
                               data=merged_data,
                               columns=['Final_Harmonized_Neighbourhood',
                                        'Total COVID Cases'],
                               key_on="feature.properties.Final_Harmonized_Neighbourhood",).add_to(tdot_map)

# Add tooltip
folium.LayerControl().add_to(tdot_map)
choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(['tooltip'], labels=False)
)


tdot_map.save('covid-19-cases.html')