## Generating Data For Time Series Model (Alberta)

For simplicity, I picked the Alberta data to build a time series forecasting model for the upcoming years.

Parts of the code are copied from the *Simple Tile Speed Model.ipynb* notebook.

### 1. Packages

In [2]:
import sys
sys.path.append("..")
import src.config
from src.datasets.loading import statcan, ookla
import numpy as np 
import pandas as pd
import geopandas as gp

#from sklearn import preprocessing, pipeline, compose
#from sklearn import linear_model, model_selection, svm
#from sklearn import metrics

import matplotlib.pyplot as plt 

### 2. Data Loading

The following code is copied from the *Simple Tile Speed Model.ipynb* notebook.

In [3]:
# Load the result of the project's `ookla.canada_tiles()` loader
# (presumably the Ookla tile geometries for Canada — confirm against src.datasets.loading).
# NOTE(review): `ookla_tiles` is not referenced by any later cell visible in
# this notebook — confirm it is still needed before paying for the load.
ookla_tiles = ookla.canada_tiles()

In [4]:
# Load the output of the project's `statcan.dissemination_areas_populations()`
# loader (presumably population figures per dissemination area — confirm).
# NOTE(review): `da_pops` is not referenced by any later cell visible in this
# notebook — confirm it is still needed.
da_pops = statcan.dissemination_areas_populations()

  return pd.read_csv(POP_FILE)


In [5]:
# Load the tile / dissemination-area overlay (this can take a few minutes).
o = gp.read_file(src.config.OVERLAYS_DIR / 'tile_das_overlay')

# Label every tile with a single DA: rows missing either key are dropped, then
# the descending sort on (quadkey, tile_frac) puts the row with the largest
# tile_frac first within each quadkey, which keep='first' retains.
tile_da_label = (
    o.dropna(subset=['DAUID', 'quadkey'])
    .sort_values(by=['quadkey', 'tile_frac'], ascending=False)
    .drop_duplicates(subset='quadkey', keep='first')
    # Explicit int64: plain `int` can resolve to a 32-bit type on some
    # platforms (e.g. Windows with NumPy < 2), which is too narrow for the
    # 15-digit quadkeys and would not match the speed data's quadkey column.
    .astype({'quadkey': 'int64', 'DAUID': 'int64'})
)

In [6]:
# Fixed-broadband speed files from 2019 Q1 through 2023 Q1, selected with a
# label slice on the (conn_type, year, quarter) index of the file listing.
first_quarter = ('fixed', 2019, 1)
last_quarter = ('fixed', 2023, 1)
quarter_files = ookla.available_files().loc[first_quarter:last_quarter]
all_quarters = ookla.speed_data(quarter_files.path)

In [37]:
all_quarters.head()

Unnamed: 0,quadkey,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,conn_type,year,quarter
0,302303331012303,130704,26290,11,8,6,fixed,2019,1
1,212122230130321,100966,55007,8,118,23,fixed,2019,1
2,212120311013031,890,2432,362,5,2,fixed,2019,1
3,212102012132221,36347,11509,14,44,1,fixed,2019,1
4,212122232013120,59481,23797,10,45,13,fixed,2019,1


### 3. Data Wrangling

In [67]:
# Attach each tile's province code (PRUID) to the speed records. The left join
# keeps every speed record; tiles without a label end up with a missing PRUID.
tile_da_label_subset = tile_da_label.loc[:, ['quadkey', 'PRUID']]
merged_df = pd.merge(all_quarters, tile_da_label_subset, how='left', on='quadkey')

In [68]:
merged_df.columns

Index(['quadkey', 'avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'tests', 'devices',
       'conn_type', 'year', 'quarter', 'PRUID'],
      dtype='object')

In [69]:
# Distinct province codes present after the merge (including missing values).
merged_df['PRUID'].unique()

array(['24', '59', '35', '48', '11', '46', '12', '13', '47', nan, '10',
       '61', '60', '62'], dtype=object)

In [70]:
# Inspect the speed records that did not receive a province code.
missing_province = merged_df['PRUID'].isna()
merged_df[missing_province]

Unnamed: 0,quadkey,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,conn_type,year,quarter,PRUID
473,212122221131202,184386,15984,10,1,1,fixed,2019,1,
4924,212122221131323,17384,14256,10,1,1,fixed,2019,1,
6897,302232121000001,165698,22965,28,88,2,fixed,2019,1,
7202,212122203210202,7124,4201,19,1,1,fixed,2019,1,
8468,302303321312313,74341,21353,15,2,2,fixed,2019,1,
...,...,...,...,...,...,...,...,...,...,...
3114375,303301203231010,62133,18841,64,2,1,fixed,2023,1,
3114376,303301203231112,24915,10131,97,1,1,fixed,2023,1,
3115076,303301220133003,301130,212592,8,6,2,fixed,2023,1,
3115148,303301221001202,328713,143597,6,1,1,fixed,2023,1,


In [71]:
# Discard the speed records that have no province code (PRUID) after the merge.
merged_df = merged_df.dropna(axis='index', subset=['PRUID'])

In [72]:
# Numeric province/territory code (PRUID) -> two-letter abbreviation.
prov_dict = {
    10: 'NL',
    11: 'PE',
    12: 'NS',
    13: 'NB',
    24: 'QC',
    35: 'ON',
    46: 'MB',
    47: 'SK',
    48: 'AB',
    59: 'BC',
    60: 'YT',
    61: 'NT',
    62: 'NU',
}

In [73]:
# Translate the string PRUID codes into province abbreviations via prov_dict.
# `assign` returns a new frame instead of writing a column into the frame
# produced by the earlier dropna, which avoids pandas' SettingWithCopyWarning
# and keeps the step free of in-place mutation.
merged_df = merged_df.assign(PR=merged_df['PRUID'].astype(int).map(prov_dict))

In [79]:
# Keep only the province label, the two speed measures and the period columns.
model_columns = ['PR', 'avg_d_kbps', 'avg_u_kbps', 'quarter', 'year']
merged_df = merged_df.loc[:, model_columns]

In [80]:
# Mean download/upload speed per province, year and quarter. This is a plain
# (unweighted) mean over the remaining rows of merged_df.
group_keys = ['PR', 'year', 'quarter']
df_grouped = merged_df.groupby(group_keys, as_index=False).mean()
df_grouped.head()

Unnamed: 0,PR,year,quarter,avg_d_kbps,avg_u_kbps
0,AB,2019,1,64501.807089,15061.175467
1,AB,2019,2,65839.564439,16230.529401
2,AB,2019,3,69426.308808,18790.990094
3,AB,2019,4,71961.254161,21125.670596
4,AB,2020,1,73054.660884,21226.205331


Creating Province DataFrames

In [83]:
# Alberta-only slice of the per-province quarterly averages.
AB_df = df_grouped.loc[df_grouped['PR'].eq('AB')]

In [84]:
AB_df.shape

(17, 5)

In [90]:
AB_df

Unnamed: 0,PR,year,quarter,avg_d_kbps,avg_u_kbps
0,AB,2019,1,64501.807089,15061.175467
1,AB,2019,2,65839.564439,16230.529401
2,AB,2019,3,69426.308808,18790.990094
3,AB,2019,4,71961.254161,21125.670596
4,AB,2020,1,73054.660884,21226.205331
5,AB,2020,2,70591.225705,22041.207882
6,AB,2020,3,77523.109218,25959.033187
7,AB,2020,4,87178.815007,32084.795266
8,AB,2021,1,87970.058126,32130.989513
9,AB,2021,2,101021.247794,34229.928305


In [112]:
# Quarter-end timestamps from 2019 Q1 through 2023 Q1, used as the index of the
# Alberta download-speed series selected below. The range is hard-coded, so it
# must contain exactly one timestamp per row of AB_df.
time_index = pd.date_range('2019-Q1', '2023-Q2', freq='Q')
AB_avg_d_kbps = AB_df.loc[:, ['avg_d_kbps']]

In [113]:
# Replace the positional index with the quarter-end timestamps.
AB_avg_d_kbps = AB_avg_d_kbps.set_index(time_index)

In [115]:
AB_avg_d_kbps.head()

Unnamed: 0,avg_d_kbps
2019-03-31,64501.807089
2019-06-30,65839.564439
2019-09-30,69426.308808
2019-12-31,71961.254161
2020-03-31,73054.660884


### 4. Storing Data For Future Use

In [116]:
import os

# Output directory for the model-ready CSV files (relative to the notebook).
folder_name = "../data/model_data"

# exist_ok=True creates the directory when needed and is a no-op otherwise,
# so the cell can be re-run safely without a separate existence check.
os.makedirs(folder_name, exist_ok=True)

# These two frames carry a plain positional index, so it is not written.
df_grouped.to_csv(f'{folder_name}/speed_data_grouped.csv', index=False)
merged_df.to_csv(f'{folder_name}/speed_data_merged.csv', index=False)

# The Alberta series is indexed by quarter-end date. Writing it with
# index=False would discard those dates and leave a bare column of values,
# so the index is saved as a 'date' column.
# NOTE(review): readers of this file should load it with
# index_col='date', parse_dates=True.
AB_avg_d_kbps.to_csv(
    f'{folder_name}/Alberta_Speed_Data_Grouped.csv',
    index=True,
    index_label='date',
)