# Summary

This notebook creates box plots of upload and download speeds, averaged over time, for rural and non-rural areas in each province.


## Start Running From Here:

In [4]:
import sys
sys.path.append("..")

In [5]:
import src.config

In [6]:
from src.datasets.loading import statcan, ookla

In [50]:
import numpy as np 
import pandas as pd
import geopandas as gp
import os
import plotly.graph_objects as go
import plotly.express as px

In [9]:
import matplotlib.pyplot as plt 

## Load 
Load some of the available data. The census population data and StatCan boundaries are automatically loaded from 
the StatCan website. The overlays and tile geometries/speeds need to be pre-computed and saved to the overlays and data directories. 

### Load All Unique Tile Geometries

In [19]:
# All unique Ookla tile geometries for Canada.
# Per the original author's note, this is a GeoPandas dataframe of shape (670292, 2).
ookla_tiles = ookla.canada_tiles()

### Load Census Population Information

In [20]:
# Census populations per dissemination area (DA): a pandas dataframe containing 3 columns.
# NOTE(review): presumably DAUID, DAPOP and GEO_NAME, judging by how the merge cell uses them; confirm.
da_pops = statcan.dissemination_areas_populations()

### Labelling Tiles
Generate labels from geometric overlay of the Ookla tiles and Statistics Canada Dissemination Areas (DA). 
Label each tile with the information from the StatCan areas based on which DA the tile overlaps the most with.

In [22]:
# Read the pre-computed tile/DA overlay (this can take a few minutes to load).
o = gp.read_file(src.config.OVERLAYS_DIR / 'tile_das_overlay')

# Keep one row per tile: after a descending sort on (quadkey, tile_frac) the
# first row of each quadkey is the DA with the largest tile_frac.
tile_da_label = (
    o.dropna(subset=['DAUID', 'quadkey'])
    .sort_values(by=['quadkey', 'tile_frac'], ascending=False)
    .drop_duplicates(subset='quadkey', keep='first')
)

# Cast both identifier columns to int.
tile_da_label['DAUID'] = tile_da_label['DAUID'].astype(int)
tile_da_label['quadkey'] = tile_da_label['quadkey'].astype(int)

### Saving the CSVs so far

In [19]:
def save_df(dataframe, df_type:str, base_path: str, file_name: str):
    """
    Write a dataframe to the file ``file_name`` inside ``base_path``.

    Parameters:
        dataframe: the object to save; must provide ``to_csv`` when
            df_type is "pd" and ``to_file`` when df_type is "gp".
        df_type: "pd" writes a CSV (with a header row), "gp" writes a
            GeoPackage (driver 'GPKG').
        base_path: directory the file is written to.
        file_name: name of the output file, joined onto base_path.

    Returns:
        None.

    Raises:
        ValueError: if df_type is neither "pd" nor "gp". Without this
            check a mistyped df_type would silently write nothing.
    """
    full_path = os.path.join(base_path, file_name)
    if df_type == "pd":
        dataframe.to_csv(full_path, header=True)
    elif df_type == "gp":
        dataframe.to_file(full_path, driver='GPKG')
    else:
        raise ValueError(f'df_type must be "pd" or "gp", got {df_type!r}')

### Loading in the csv files for faster runtime

In [2]:
def load_csv_file(df_type:str, base_path: str, file_name:str):
    """
    Load a previously saved dataframe, whether pandas or geopandas.

    Parameters:
        df_type: "pd" reads the file with ``pd.read_csv``, "gp" reads it
            with ``gp.read_file``.
        base_path: directory containing the file.
        file_name: name of the file, joined onto base_path.

    Returns:
        The dataframe produced by the selected reader.

    Raises:
        ValueError: if df_type is neither "pd" nor "gp". Previously this
            case surfaced as an UnboundLocalError on the return statement.
    """
    full_path = os.path.join(base_path, file_name)
    if df_type == "pd":
        return pd.read_csv(full_path)
    if df_type == "gp":
        return gp.read_file(full_path)
    raise ValueError(f'df_type must be "pd" or "gp", got {df_type!r}')
    

### Speed Test Data
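Load in the previous 4 quarters of data. Since we're currently in Q3 of 2022, the most recent quarter is Q2, 
so we can slice the files listed to grab those. (Note: the call below actually requests every file from `('fixed', 2019, 1)` through `('fixed', 2023, 1)`, which is a wider range than 4 quarters.) Subsequently, we'll calculate test-weighted averages for individual tiles and use those as representative speeds for our model.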

In [11]:
def get_speed_data_from_to(from_date:tuple, to_date:tuple):
    """
    Aggregate Ookla speed data per tile over a span of files.

    The table returned by ``ookla.available_files()`` is sliced with
    ``.loc[from_date:to_date]`` (label slicing, so both ends are included)
    and the selected paths are loaded with ``ookla.speed_data``.

    Returns a dataframe indexed by ``quadkey`` with four columns:
    ``avg_d_kbps`` and ``avg_u_kbps`` (means weighted by the number of
    tests) plus the summed ``tests`` and ``devices``.
    """

    def _weighted_by_tests(column):
        # Build the per-group reducer for one speed column.
        return lambda grp: np.average(grp[column], weights=grp.tests)

    # Resolve the files in the requested range and load them.
    selected_paths = ookla.available_files().loc[from_date:to_date].path
    per_tile = ookla.speed_data(selected_paths).groupby('quadkey')

    # One series per output column, all keyed by quadkey.
    columns = [
        per_tile.apply(_weighted_by_tests('avg_d_kbps')).rename('avg_d_kbps'),
        per_tile.apply(_weighted_by_tests('avg_u_kbps')).rename('avg_u_kbps'),
        per_tile['tests'].sum(),
        per_tile['devices'].sum(),
    ]
    return pd.concat(columns, axis=1)

In [12]:
# Per-tile averages over every available file from ('fixed', 2019, 1) through ('fixed', 2023, 1).
# NOTE(review): the tuples look like (connection type, year, quarter); confirm against ookla.available_files().
time_subset = get_speed_data_from_to(from_date=('fixed',2019,1), to_date=('fixed',2023,1))

### Merge All The Data
It's a bit messy, but we're merging several tables and removing a few of the redundant or non-useful 
columns as we go. At the end, the `features_table` variable will contain all of the 
tiles within census areas, labelled by the type of Census Subdivision, Dissemination Area, Population Centre, etc. they are in, together with population information for the DA (the smallest area with populations available) and the speed test averages over the quarters loaded above.

In [15]:
## merge dissemination area (DA) populations with ookla tiles (already combined with other statcan data)
# Left merge: every labelled tile is kept even when its DA has no population row.
features_table = tile_da_label.merge(da_pops, on='DAUID', how='left')
# Tiles without a matching population row get a population of 0.
features_table['DAPOP'] = features_table['DAPOP'].fillna(0).astype(int)
del features_table['GEO_NAME']
# Convert to a plain pandas DataFrame and drop the overlay geometry; the tile
# geometry is brought back in by the merge with ookla_tiles below.
features_table = pd.DataFrame(features_table)
del features_table['geometry']
# NOTE(review): the 1000**2 factor implies das_area is in square metres; confirm.
features_table['POP_DENSITY'] = features_table['DAPOP']/features_table['das_area']*1000**2 #people per square kilometer

# take all ookla tiles, merge the speeds data and tile labels and populations
# Both merges use the pandas default (inner) join, so only tiles present in all three tables remain.
features_table = ookla_tiles.merge(time_subset, on='quadkey').merge(features_table, on='quadkey')

# compute spatial joins to identify if area is a population centre
# Population-centre boundaries are reprojected to EPSG:4326 before the spatial join.
pop_info = statcan.boundary('population_centres').to_crs('epsg:4326')
pop_info = pop_info[['PCUID', 'PCNAME', 'PCTYPE', 'PCPUID', 'PCCLASS', 'geometry']] ##removes some redundant cols from DAs
# Left spatial join: tiles outside every population centre are kept, with empty PC columns.
features_table = features_table.sjoin(pop_info, how='left')
del features_table['index_right']
# A tile matching several population centres appears once per match. Sorting by
# PCUID puts missing values last (pandas default), so drop_duplicates keeps a
# matched row whenever one exists.
features_table = features_table.sort_values(by=['PCUID','quadkey']).drop_duplicates(subset=['quadkey']) #keep tiles where overlap was true

### Categorize All the Columns
All the columns from our joins above can be roughly split into categories based on the type of 
data and how you might use them in a simple supervised learning problem. These are broken down as follows:

In [16]:
# Key column identifying a tile, and the column holding its geometry.
pkey = 'quadkey'
geometry = 'geometry'

# Identifier and name columns.
id_and_names = [
    'DAUID',
    'CDUID',
    'CDNAME',
    'CCSUID',
    'CSDNAME',
    'CMAUID',
    'CMAPUID',
    'CMANAME',
    'CCSNAME',
    'CSDUID',
    'ERUID',
    'ERNAME',
    'CTUID',
    'CTNAME',
    'ADAUID',
    'PCUID',
    'PCNAME',
    'PCPUID',
    'SACCODE',  # SACCODE is half a category, half ID values
]

# Categorical label columns. 'PRUID' is deliberately left out because it is
# redundant with 'PRNAME'.
categorical_labels = [
    'PRNAME',
    'CDTYPE',
    'CSDTYPE',
    'SACTYPE',
    'CMATYPE',
    'PCTYPE',
    'PCCLASS',
]

# Numeric feature columns.
numerical_vars = [
    'tests',
    'devices',
    'das_area',
    'tile_area',
    'tile_frac',
    'das_frac',
    'DAPOP',
    'POP_DENSITY',
]

# Target columns: the averaged download and upload speeds.
target_vars = [
    'avg_d_kbps',
    'avg_u_kbps',
]

In [None]:
# Columns of interest: the tile key, categorical labels, numeric features and targets.
col_subset = [pkey] + categorical_labels + numerical_vars + target_vars
# Preview only: the selection indexed by quadkey is not assigned, so features_table is unchanged.
features_table.loc[:,col_subset].set_index('quadkey')

In [None]:
# Save the full features table to ../data/features_table.gpkg using the "gp" (GeoPackage) writer.
path_to_files = "../data/"
save_df(features_table, df_type="gp", base_path=path_to_files, file_name="features_table.gpkg")

# The end