# Summary
This notebook loads some of the previously converted Ookla speed test data from the Canada region 
and calculates new aggregates (average, totals) of these tiles against new geometric boundaries.
In this case, the averages are calculated based on "Dissemination Areas", which are 
smaller administrative boundaries than census sub-divisions as defined by Statistics Canada. 
This has the advantage of being the most granular level at which Statistics Canada also 
reports information on demographics (e.g. populations, ages, income, etc.). 

Running this notebook requires the quarterly Ookla tiles filtered to the Canada region (under `data/ookla-canada-tiles`)
as well as the `data/census-2021` data. This data is stored on swift, or can be regenerated with 
the notebooks in this repository (see Readme.md for more information).

In [1]:
import sys

sys.path.append("../..")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gp

import src.datasets.loading.ookla
import src.datasets.loading.statcan
import src.datasets.joins
from src.config import DATA_DIRECTORY



In [43]:
# Show the absolute data directory that this run reads from and writes to.
# DATA_DIRECTORY is imported together with the other project modules in the
# first (imports) cell, so every dependency is declared in one place.
DATA_DIRECTORY.resolve()

PosixPath('/Users/zacharyshand/ookla-statcan-analysis/data')

# Data Loading

In [2]:
# Locations of the pre-built inputs, all resolved relative to DATA_DIRECTORY.
tiles_csv_file = DATA_DIRECTORY.joinpath("ookla-canada-tiles/all-tile-labels.csv")
tile_geoms_file = DATA_DIRECTORY.joinpath("ookla-canada-tiles/canada-tiles")

census_data_file = DATA_DIRECTORY.joinpath(
    "census-2021/Census-Subset-FlatAllRegions.csv"
)

In [4]:
# --- Geometries --------------------------------------------------------------
# Every layer is reprojected to EPSG:4326 so that they all share one CRS.
# Join keys are cast to str so they compare equal across tables.
tile_geom = gp.read_file(tile_geoms_file).to_crs("EPSG:4326")
tile_geom["quadkey"] = tile_geom["quadkey"].astype(str)

das_geom = src.datasets.loading.statcan.boundary("dissemination_areas").to_crs(
    "EPSG:4326"
)
das_geom["DAUID"] = das_geom["DAUID"].astype(str)

# Population centres: keep only the identifying columns and the geometry.
pc_columns = ["PCUID", "PCNAME", "PCTYPE", "PCPUID", "PCCLASS", "geometry"]
pc_geom = src.datasets.loading.statcan.boundary("population_centres")
pc_geom = pc_geom.to_crs("EPSG:4326").loc[:, pc_columns]

# --- Purely tabular data -----------------------------------------------------
census_data = pd.read_csv(census_data_file)
census_data["ALT_GEO_CODE"] = census_data["ALT_GEO_CODE"].astype(str)

speed_data = src.datasets.loading.ookla.speed_data()
# NOTE(review): this assumes the loader and the tile geometry file spell
# quadkeys identically once cast to str (e.g. leading zeros) — confirm.
speed_data["quadkey"] = speed_data["quadkey"].astype(str)

# Geometric Overlaps and Spatial Joins
These joins determine where the speed tiles fall relative to Statistics Canada administrative boundaries (dissemination areas and population centres).

In [5]:
# Pair every dissemination area with each speed tile it is spatially joined to.
# No predicate is passed, so geopandas' default ("intersects") applies; the
# result holds one row per DA/tile pair.
da_tile_join = gp.sjoin(das_geom, tile_geom, how="inner")

In [6]:
# Assign each dissemination area (DA) to the population centre it is
# primarily in.
#
# The overlay is computed in the CRS of the population-centre boundary file
# (presumably a metre-based conic projection — confirm) to avoid warnings
# related to area calculations.
conic_crs = src.datasets.loading.statcan.boundary("population_centres").crs
da_pc_overlay = src.datasets.joins.overlays.overlay(
    das_geom, pc_geom, conic_crs
).dropna(subset=["DAUID"])  # drop overlay pieces that belong to no DA

# Keep, for every DA, the overlay piece with the largest `left_frac`
# (presumably the share of the DA's area covered by that piece — confirm).
# DAs whose retained piece lies outside every population centre keep empty
# PC* columns.
pc_class_columns = ["DAUID", "PCUID", "PCNAME", "PCTYPE", "PCPUID", "PCCLASS"]
largest_piece_first = da_pc_overlay.sort_values(
    by=["DAUID", "left_frac"], ascending=[True, False]
)
da_pc_classes = largest_piece_first.drop_duplicates(
    subset=["DAUID"], keep="first"
).loc[:, pc_class_columns]

  ol = gp.overlay(left, right, how="union")


Unnamed: 0,DAUID,PCUID,PCNAME,PCTYPE,PCPUID,PCCLASS
33,10010165,0792,St. John's,1,100792,4
34,10010166,0792,St. John's,1,100792,4
35,10010167,0792,St. John's,1,100792,4
36,10010168,0792,St. John's,1,100792,4
37,10010169,0792,St. John's,1,100792,4
...,...,...,...,...,...,...
44359,62080024,1392,Cambridge Bay,4,621392,2
61835,62080025,,,,,
61836,62080026,,,,,
61842,62080027,,,,,


In [12]:
# Re-aggregate the tile-level speed tests to dissemination areas (DAs).
group_keys = ["DAUID", "year", "quarter", "conn_type"]

# Attach to every tile record the DA(s) its tile was spatially joined to.
# The join key is spelled out so that an unexpected shared column can never
# silently become part of the key.
tile_speeds = pd.merge(
    left=speed_data,
    right=da_tile_join.loc[:, ["DAUID", "quadkey"]],
    how="inner",
    on="quadkey",
)

# One row per DA / year / quarter / connection type.
# NOTE: the speed and latency columns are plain means over tiles; they are not
# weighted by the number of tests recorded in each tile.
grp = tile_speeds.groupby(group_keys)
das_speeds = grp.agg(
    avg_d_kbps=("avg_d_kbps", "mean"),
    avg_u_kbps=("avg_u_kbps", "mean"),
    avg_lat_ms=("avg_lat_ms", "mean"),
    tests=("tests", "sum"),
    devices=("devices", "sum"),
).astype("float64")  # counts are floats too, so they can hold NaN below

# Adjust the index to NA-fill DAs with no speed test information in any of
# the conn_type or year/quarter combinations.
full_index = pd.MultiIndex.from_product(
    [
        das_geom["DAUID"],
        speed_data["year"].drop_duplicates(),
        speed_data["quarter"].drop_duplicates(),
        speed_data["conn_type"].drop_duplicates(),
    ],
    names=group_keys,
)
# Only keep year/quarter pairs that actually occur in the speed data.
# Crossing years and quarters independently would otherwise invent all-NaN
# rows for periods that do not exist (e.g. the missing quarters of a partial
# year). When every year is complete this filter removes nothing.
observed_periods = pd.MultiIndex.from_frame(
    speed_data[["year", "quarter"]].drop_duplicates()
)
is_observed = full_index.droplevel(["DAUID", "conn_type"]).isin(observed_periods)
cross_index = full_index[is_observed]

das_speeds = das_speeds.reindex(cross_index).reset_index()

# Data Samples
Four intermediate data tables are made. Three of them have a 1-to-1 mapping on DAUID: `das_geom`, `census_data`, and `da_pc_classes`.
The fourth table, `das_speeds`, has a many-to-1 relationship to the DAUID values in the other three tables.
Only the `das_geom` table contains geometry information.

In [16]:
# Preview: one row per DA x year x quarter x connection type.
das_speeds.head()

Unnamed: 0,DAUID,year,quarter,conn_type,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices
0,10010244,2019,1,fixed,67011.8,44715.0,8.4,58.0,36.0
1,10010244,2019,1,mobile,68257.0,51528.0,64.0,1.0,1.0
2,10010244,2019,2,fixed,81122.8,58796.0,9.6,90.0,40.0
3,10010244,2019,2,mobile,147485.0,18384.0,66.5,3.0,3.0
4,10010244,2019,3,fixed,63661.888889,45392.0,12.777778,184.0,88.0


In [17]:
# Preview: dissemination-area boundary attributes and geometry.
das_geom.head(n=3)

Unnamed: 0,DAUID,PRUID,PRNAME,CDUID,CDNAME,CDTYPE,CCSUID,CCSNAME,CSDUID,CSDNAME,...,SACCODE,SACTYPE,CMAUID,CMAPUID,CMANAME,CMATYPE,CTUID,CTNAME,ADAUID,geometry
0,10010244,10,Newfoundland and Labrador / Terre-Neuve-et-Lab...,1001,Division No. 1,CDR,1001519,St. John's,1001519,St. John's,...,1,1,1,10001,St. John's,B,10004.0,4.0,10010014,"POLYGON ((-52.76815 47.55803, -52.76902 47.557..."
1,10010245,10,Newfoundland and Labrador / Terre-Neuve-et-Lab...,1001,Division No. 1,CDR,1001519,St. John's,1001519,St. John's,...,1,1,1,10001,St. John's,B,10016.02,16.02,10010010,"POLYGON ((-52.75476 47.56543, -52.75689 47.564..."
2,10010246,10,Newfoundland and Labrador / Terre-Neuve-et-Lab...,1001,Division No. 1,CDR,1001519,St. John's,1001519,St. John's,...,1,1,1,10001,St. John's,B,10016.02,16.02,10010010,"POLYGON ((-52.75059 47.56404, -52.75143 47.564..."


In [18]:
# Preview: census profile rows (all geographic levels present in the file).
census_data.head()

Unnamed: 0,CENSUS_YEAR,DGUID,ALT_GEO_CODE,GEO_LEVEL,GEO_NAME,Pop_2021,Pop_2016,Pop_change_percent,Private_Dwellings,Pop_density_square_km,...,Income_count_25_percent,Mean_income_count,Mean_income,After-tax_mean_income_count,After-tax_mean_income,Household_income_count,Median_household_income,Median_household_after-tax_income,Indigenous_identity_count_total,Indigenous_identity_count
0,2021,2021A000011124,1,Country,Canada,36991981.0,35151728.0,5.2,16284235.0,4.2,...,30335920.0,29242935.0,54450.0,29257325.0,44920.0,14978940.0,84000.0,73000.0,36328480.0,1807250.0
1,2021,2021A000224,24,Province,Quebec,8501833.0,8164361.0,4.1,4050164.0,6.5,...,6918730.0,6738410.0,51160.0,6740360.0,41840.0,3749035.0,72500.0,63200.0,8308480.0,205010.0
2,2021,2021A00032401,2401,Census division,"Communauté maritime des Îles-de-la-Madeleine, ...",12654.0,12475.0,1.4,6665.0,67.5,...,11040.0,10955.0,48680.0,10955.0,40480.0,5830.0,75500.0,66000.0,12455.0,255.0
3,2021,2021A00052401023,2401023,Census subdivision,"Les Îles-de-la-Madeleine, Municipalité (MÉ)",12190.0,12010.0,1.5,6413.0,78.6,...,10555.0,10465.0,48240.0,10470.0,40240.0,5610.0,75000.0,65500.0,11915.0,225.0
4,2021,2021S051224010019,24010019,Dissemination area,24010019,536.0,,,283.0,75.0,...,490.0,490.0,53800.0,490.0,43600.0,255.0,74000.0,65000.0,565.0,0.0


In [20]:
# Preview: population centre assigned to each dissemination area.
da_pc_classes.head(n=2)

Unnamed: 0,DAUID,PCUID,PCNAME,PCTYPE,PCPUID,PCCLASS
33,10010165,792,St. John's,1,100792,4
34,10010166,792,St. John's,1,100792,4


# Merge Data

In [22]:
# Build one wide table per dissemination area: boundary attributes and
# geometry, then the assigned population centre, then the census profile.
# DAUID is the only column shared by das_geom and da_pc_classes.
das_information = das_geom.merge(da_pc_classes, on="DAUID")
# Left join: DAs without a matching census row are kept with empty census columns.
das_information = das_information.merge(
    census_data, how="left", left_on="DAUID", right_on="ALT_GEO_CODE"
)
das_information.columns

Index(['DAUID', 'PRUID', 'PRNAME', 'CDUID', 'CDNAME', 'CDTYPE', 'CCSUID',
       'CCSNAME', 'CSDUID', 'CSDNAME', 'CSDTYPE', 'ERUID', 'ERNAME', 'SACCODE',
       'SACTYPE', 'CMAUID', 'CMAPUID', 'CMANAME', 'CMATYPE', 'CTUID', 'CTNAME',
       'ADAUID', 'geometry', 'PCUID', 'PCNAME', 'PCTYPE', 'PCPUID', 'PCCLASS',
       'CENSUS_YEAR', 'DGUID', 'ALT_GEO_CODE', 'GEO_LEVEL', 'GEO_NAME',
       'Pop_2021', 'Pop_2016', 'Pop_change_percent', 'Private_Dwellings',
       'Pop_density_square_km', 'Land_area_square_km', 'Income_count_total',
       'Median_income_count', 'Median_income', 'After-tax_income-count',
       'After-tax_income', 'Income_count_25_percent', 'Mean_income_count',
       'Mean_income', 'After-tax_mean_income_count', 'After-tax_mean_income',
       'Household_income_count', 'Median_household_income',
       'Median_household_after-tax_income', 'Indigenous_identity_count_total',
       'Indigenous_identity_count'],
      dtype='object')

In [24]:
# Expand the per-DA table to one row per DA x year x quarter x connection type
# by joining the aggregated speeds on DAUID (the only shared column).
speed_data_annotated = das_information.merge(das_speeds, on="DAUID")
speed_data_annotated.columns

Index(['DAUID', 'PRUID', 'PRNAME', 'CDUID', 'CDNAME', 'CDTYPE', 'CCSUID',
       'CCSNAME', 'CSDUID', 'CSDNAME', 'CSDTYPE', 'ERUID', 'ERNAME', 'SACCODE',
       'SACTYPE', 'CMAUID', 'CMAPUID', 'CMANAME', 'CMATYPE', 'CTUID', 'CTNAME',
       'ADAUID', 'geometry', 'PCUID', 'PCNAME', 'PCTYPE', 'PCPUID', 'PCCLASS',
       'CENSUS_YEAR', 'DGUID', 'ALT_GEO_CODE', 'GEO_LEVEL', 'GEO_NAME',
       'Pop_2021', 'Pop_2016', 'Pop_change_percent', 'Private_Dwellings',
       'Pop_density_square_km', 'Land_area_square_km', 'Income_count_total',
       'Median_income_count', 'Median_income', 'After-tax_income-count',
       'After-tax_income', 'Income_count_25_percent', 'Mean_income_count',
       'Mean_income', 'After-tax_mean_income_count', 'After-tax_mean_income',
       'Household_income_count', 'Median_household_income',
       'Median_household_after-tax_income', 'Indigenous_identity_count_total',
       'Indigenous_identity_count', 'year', 'quarter', 'conn_type',
       'avg_d_kbps', 'avg_u_kbp

# Save Data
The fully merged table (`speed_data_annotated`) is deliberately not saved: saving/loading geographic boundaries is slow, and 
the merged table unnecessarily duplicates the boundary geometry for every quarter and connection type.

In [45]:
# Write the per-DA table (with geometry) and the long-format speed table to
# the "hackathon" folder, creating the folder if it does not exist yet.
output_dir = DATA_DIRECTORY / "hackathon"
output_dir.mkdir(exist_ok=True)
das_information.to_file(output_dir / "geometry.gpkg", driver="GPKG")
das_speeds.to_csv(output_dir / "speeds.csv", index=False)