# Summary
Calculates aggregate internet speed test statistics (Ookla) over a modified version of the Canada hexagons. The speed test 
statistics are contrasted with the 50/10 Mbps access levels reported in the national broadband map. 

In particular, access levels are calculated as the percentage of speed tests that meet the 50/10 Mbps speed threshold. 
In some areas this level of "experienced" internet speed is lower than the bucket (0%, 25%, 50%, 75%, 100%) assigned 
to the area in the broadband map. 

The speed test statistics do not match 100% with the 50/10 levels from the broadband map for a few reasons:

- Broadband map is nominal "access" to internet while Ookla speed tests are measured current speeds, and 
- Speed tests are quarterly _averages_ from small (~500m X 500m) areas, meaning we have averages of averages.

Together, these points mean the two datasets cannot be compared directly; however, the trends in the speed test data, 
which is significantly skewed, suggest that many individuals or households have "realized" internet speeds 
below the level indicated in the national broadband map.   

In [2]:
import sys
sys.path.append("..")

%load_ext autoreload
%autoreload 1
%aimport src.datasets.joins
%aimport src.datasets.loading.statcan

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from src.datasets.loading import statcan
from src.datasets.loading import ookla
from src.datasets import overlays

import statsmodels as sm
# import statsmodels.stats.weightstats
from scipy.stats import lognorm

In [5]:
from src.datasets import joins

In [6]:
# Fetch the StatCan map data through the project loader before any boundary
# files are read (presumably a no-op when already downloaded — TODO confirm).
statcan.download_map_data()

In [7]:
# Population-centre boundaries; their CRS is reused as the target CRS for the
# speed tiles, and PCNAME is used to locate a centre for mapping.
popctrs = statcan.boundary('population_centres')

In [8]:
# Combined hexagon / population-centre geometry (the "modified" hexagons from
# the summary). The recorded output shows this goes through a union overlay
# and emits a pandas SettingWithCopy warning.
o = joins.hexagons_popctrs_combined()

  ol = gp.overlay(left, right, how="union")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [9]:
# Keep only 2022 fixed-connection speed tiles, then reproject them to the
# population centres' CRS so later spatial joins share one coordinate system.
tiles = ookla.canada_speed_tiles().loc[lambda s:(s.year==2022 ) & (s.conn_type=='fixed')]
tiles = tiles.to_crs(popctrs.crs)

In [10]:
# Attach speed-test statistics from the tiles to each HEXUID_PCPUID area, then
# the 50/10 statistics, and finally left-merge the StatCan hexagon attributes
# on HEXuid_HEXidu.
# NOTE(review): the last recorded run of this cell ended with
# "TypeError: unhashable type: 'list'"; which of the three calls raised it is
# not visible in the captured output — confirm before relying on later cells.
o_statted = joins.add_simple_stats(o, tiles, 'HEXUID_PCPUID')       
o_statted = joins.add_50_10_stats(o_statted, tiles, 'HEXUID_PCPUID')
o_statted = o_statted.merge(statcan.hexagon_data(), on='HEXuid_HEXidu', how='left')

TypeError: unhashable type: 'list'

In [None]:
# Join the PHH geometries to their attribute table on PHH_ID, then attach the
# PHH data to each HEXUID_PCPUID area (presumably as population counts, given
# the helper's name — TODO confirm in joins.add_phh_pop).
phh = statcan.phh_geometry().merge(statcan.phh_data(), on='PHH_ID')
o_statted = joins.add_phh_pop(o_statted, phh, 'HEXUID_PCPUID')

In [None]:
def breakpoint(value):
    """Return the lower bound of a broadband-map availability bucket.

    Args:
        value: Bucket label as found in ``Avail_50_10_Gradient_Dispo``.

    Returns:
        The bucket's lower bound as a fraction (0.75, 0.5, 0.25 or 0.0).
        Any label that is not recognised also yields 0.0.
    """
    # NOTE(review): this name shadows Python's built-in breakpoint(); it is
    # kept because other cells refer to it.
    lower_bounds = (
        ('>75% -  100%', 0.75),
        ('>50% - 75%', 0.5),
        ('>25% - 50%', 0.25),
        ('>0% - 25%', 0.0),
    )
    for label, bound in lower_bounds:
        if value == label:
            return bound
    return 0.0

# Numeric lower bound (as a fraction) of each row's broadband-map 50/10
# availability bucket; unrecognised labels become 0.0.
o_statted['access_level_numerical'] = o_statted.Avail_50_10_Gradient_Dispo.apply(breakpoint)

In [None]:
# Convert every kbps column to Mbps and rename it accordingly.
# The column list is snapshotted first so the frame is not renamed while it
# is being iterated, and the rename happens once instead of once per column.
# After this cell no "kbps" columns remain, so re-running it changes nothing.
kbps_cols = [col for col in o_statted.columns if "kbps" in col]
for col in kbps_cols:
    o_statted[col] /= 1000
o_statted.rename(
    columns={col: col.replace('kbps', 'Mbps') for col in kbps_cols},
    inplace=True,
)

In [None]:
def calculate_50_10_level(p50down, p10up):
    """Bucket the share of speed tests meeting 50/10 Mbps into an access level.

    The smaller of the two percentages is the limiting one, so it alone
    decides the bucket.

    Args:
        p50down: Percentage (0-100) for the download side of the threshold.
        p10up: Percentage (0-100) for the upload side of the threshold.

    Returns:
        One of '0% - Unavailable', '0%-25%', '25%-50%', '50%-75%',
        '75%-100%' or '100%'. 'Unknown' is returned when the limiting value
        is above 100 or cannot be ordered (e.g. NaN).
    """
    p = min(p50down, p10up)

    if p == 0:
        return '0% - Unavailable'
    elif p < 25:
        return '0%-25%'
    elif p < 50:
        # Upper bound was previously 5, which made this bucket unreachable
        # and pushed everything in [25, 50) into '50%-75%'.
        return '25%-50%'
    elif p < 75:
        return '50%-75%'
    elif p < 100:
        return '75%-100%'
    elif p == 100:
        return '100%'
    else:
        return 'Unknown'
    
def ookla_numerical(v):
    """Convert an Ookla access-level label to a numeric fraction.

    Args:
        v: Label such as '100%' or '50%-75%'.

    Returns:
        1.0 for '100%', otherwise the lower bound of the labelled bucket
        (0.75, 0.5 or 0.25). Any other label yields 0.0.
    """
    known_levels = (
        ('100%', 1.0),
        ('75%-100%', 0.75),
        ('50%-75%', 0.5),
        ('25%-50%', 0.25),
    )
    # First matching label wins; fall back to 0.0 when nothing matches.
    return next((bound for label, bound in known_levels if v == label), 0.0)

def _row_50_10_level(row):
    """Access-level label for one row, from its two percentile columns."""
    return calculate_50_10_level(row['50_down_percentile'], row['10_up_percentile'])


def _row_limiting_percentile(row):
    """Smaller of the row's download and upload percentile values."""
    return min(row['50_down_percentile'], row['10_up_percentile'])


# Label column: bucketed Ookla-derived 50/10 access level per row.
o_statted['Ookla_Avail_50_10'] = o_statted.apply(_row_50_10_level, axis=1)
#o_statted['Ookla_Avail_50_10_numerical'] = o_statted['Ookla_Avail_50_10'].apply(ookla_numerical)
# Numeric column: the raw limiting percentile rather than the bucket bound.
o_statted['Ookla_Avail_50_10_numerical'] = o_statted.apply(_row_limiting_percentile, axis=1)


In [None]:
# Flag rows where the Ookla-derived value is below the lower bound of the
# bucket reported in the broadband map.
# NOTE(review): access_level_numerical holds fractions (0.0-0.75), while
# calculate_50_10_level buckets the same percentile columns at 25/75/100 —
# confirm both sides of this comparison are on the same scale.
o_statted['is_overestimated'] = o_statted['Ookla_Avail_50_10_numerical'] < o_statted['access_level_numerical']

In [None]:
# Bounding box of a 50_000-unit buffer around the "Brooks" population centre
# (units follow popctrs' CRS — presumably metres; TODO confirm).
brooks = popctrs.loc[popctrs.PCNAME == "Brooks"]
xmin, ymin, xmax, ymax = brooks.buffer(50_000).total_bounds

# Columns shown when a feature is clicked on the map.
popup = [
    'HEXUID_PCPUID',
    'Pop2016',
    'Avail_50_10_Gradient_Dispo',
    'Ookla_Avail_50_10',
    '50_down_percentile',
    '10_up_percentile',
    'is_overestimated',
    'avg_d_Mbps', 'min_d_Mbps', '25p_d_Mbps', '50p_d_Mbps', '75p_d_Mbps', 'max_d_Mbps',
    'avg_u_Mbps', 'min_u_Mbps', '25p_u_Mbps', '50p_u_Mbps', '75p_u_Mbps', 'max_u_Mbps',
    'avg_lat_ms',
    'tests',
    'unique_devices',
    'num_tiles',  #'SumPop_2016_SommePop', 'SumURD_2016_SommeRH', 'SumTD_2016_SommeTL',
]

# Restrict to the bounding box, then keep rows that have either measured
# download speed or 2016 population.
in_view = o_statted.cx[xmin:xmax, ymin:ymax]
has_speed_or_people = (in_view.avg_d_Mbps > 0) | (in_view.Pop2016 > 0)
subset = in_view.loc[has_speed_or_people]

# Columns shown on hover.
hover_columns = [
    'HEXUID_PCPUID',
    'Pop2016',
    'tests',
    'Avail_50_10_Gradient_Dispo',
    'Ookla_Avail_50_10',
]

# Left as the cell's final expression so the notebook renders the map.
subset.explore(
    'Ookla_Avail_50_10_numerical',
    scheme='equalinterval',
    k=4,
    tooltip=hover_columns,
    popup=popup,
)  #cmap=['orange','green','yellow','blue','red','gray'])#, vmin=0, vmax=300)