# Summary

Starting point for a supervised learning model for Ookla speed tiles. The data comes from a combination of 
Ookla Open Data speed tests and Statistics Canada information, including 2016 census population data and census boundaries (shapefiles). 


REQUIRED: Loading the features table generated after running the feature_table_generate.py. 

In [None]:
#Loading the pickle file

ifile = bz2.BZ2File("../data/feature_table.pickle",'rb')
features_table = pickle.load(ifile)

#As we want to wokr with the MBps speed adding a column with MBPS of up and down streams.
features_table["avg_d_mbps"]=features_table["avg_d_kbps"]//1000.0
features_table["avg_u_mbps"]=features_table["avg_u_kbps"]//1000.0
features_table.describe()
#features_table.to_csv("Features.csv")

In [None]:
#Finding unique values for each columns
# for i in range(0,38):
#     print("-------Finding unique values in column ----------- "+ features_table.columns[i])
#     print(features_table[features_table.columns[i]].unique())

In [None]:
import numpy as np 
from pandas import DataFrame
import seaborn as sns
#features_table_realtion = features_table.drop(columns=["quadkey","geometry"])
features_table_realtion = features_table.filter(['avg_d_mbps','avg_u_mbps','tests','devices','POP_DENSITY'], axis=1).head(1000)
sns.heatmap(features_table_realtion, annot=True)

In [None]:
#Mean and Standard dev for each provience along with total size of location in each provience0) 
Feature_all_downspeed = features_table.groupby("PRNAME")["avg_d_mbps"].agg(['size','mean','std']).reset_index()
Feature_all_downspeed.columns = ["Proviences","Size_Total","Mean_Download_Speed","std_Download_Speed"]
Feature_all_downspeed.head(13)

In [None]:
Feature_all_upspeed = features_table.groupby("PRNAME")["avg_u_mbps"].agg(['size','mean','std']).reset_index()
Feature_all_upspeed.columns = ["Proviences","Size_Total","Mean_Upload_Speed","STD_Upload_Speed"]
Feature_all_upspeed.head(13)

#TODO: Size total will be used to find the gap between total location having internet vs Total location not meeting the speed criteria the GAP.

frames=[Feature_all_downspeed,Feature_all_upspeed]
result = pd.concat(frames,axis=1)
result = result.T.drop_duplicates().T
result.head(13)

In [None]:
#Similarly repeating process for internet criteria
#Finding how many locations does not meet the criteria of min up/download speed by provience
Query_up_down_speed = features_table.query('avg_d_mbps < 50 | avg_u_mbps < 10')

#For Download speeed
Query_downspeed = Query_up_down_speed.groupby("PRNAME")["avg_d_mbps"].agg(['size','mean','std',]).reset_index()
Query_downspeed.columns = ["Proviences","Crt_Size_Total","Crt_Mean_Download_Speed","Crt_std_Download_Speed"]
Query_downspeed.head(30)

In [None]:
#For upload speed
Query_upspeed = Query_up_down_speed.groupby("PRNAME")["avg_u_mbps"].agg(['size','mean','std']).reset_index()
Query_upspeed.columns = ["Proviences","Crt_Size_Total","Crt_Mean_Up_Speed","Crt_std_Up_Speed"]
Query_upspeed.head(30)

In [None]:
frames_crt=[Query_downspeed,Query_upspeed]
result_crt = pd.concat(frames_crt,axis=1)
result_crt = result_crt.T.drop_duplicates().T
result_crt.head(13)

In [None]:
#Merging boeth the result tables having all the mean,std,size details for actual and expected criteria
frames_final=[result,result_crt]
result_find_gap = pd.concat(frames_final,axis=1)
result_find_gap = result_find_gap.T.drop_duplicates().T
result_find_gap.head(13)

In [None]:
#Adding percentage column to find the gap and better understanding
result_find_gap["Percentage_gap"] = (result_find_gap["Crt_Size_Total"]/result_find_gap["Size_Total"])*100
result_find_gap.to_csv("Gap Analysis.csv")
result_find_gap.sort_values(by="Percentage_gap", ascending=False,inplace=True)
result_find_gap.head(13)

import plotly.express as px
fig = px.bar(result_find_gap, x='Proviences',y='Percentage_gap')
fig.show()

In [28]:
for_visualization_gap = pd.read_csv("../data/Gap_Analysis.csv")
for_visualization_gap.head()

Unnamed: 0.1,Unnamed: 0,Proviences,Size_Total,Mean_Download_Speed,std_Download_Speed,Mean_Upload_Speed,STD_Upload_Speed,Crt_Size_Total,Crt_Mean_Download_Speed,Crt_std_Download_Speed,Crt_Mean_Up_Speed,Crt_std_Up_Speed
0,0,Alberta,46933,88.96022,99.179564,33.174589,51.976972,28645,27.33587,29.772997,6.957689,8.556512
1,1,British Columbia / Colombie-Britannique,37060,150.566838,124.436208,64.355235,72.662968,11520,30.925174,34.779705,9.908247,12.759966
2,2,Manitoba,20194,81.436318,86.908821,31.959493,59.413703,12260,30.634502,33.202787,6.107259,11.330834
3,3,New Brunswick / Nouveau-Brunswick,13229,141.915715,138.299939,39.408723,58.84547,5370,28.741527,38.468404,5.584171,11.586166
4,4,Newfoundland and Labrador / Terre-Neuve-et-Lab...,6747,141.556544,120.608244,43.766859,63.201958,2912,56.487294,81.856997,5.619849,8.362949


In [29]:
#Dumping the generated table to pickle file
import bz2
import pickle
ofile = bz2.BZ2File("../data/Gap_Analysis.pickle",'wb')
pickle.dump(for_visualization_gap,ofile)

In [30]:
ifile = bz2.BZ2File("../data/Gap_Analysis.pickle",'rb')
for_visualization = pickle.load(ifile)

In [31]:
for_visualization

Unnamed: 0.1,Unnamed: 0,Proviences,Size_Total,Mean_Download_Speed,std_Download_Speed,Mean_Upload_Speed,STD_Upload_Speed,Crt_Size_Total,Crt_Mean_Download_Speed,Crt_std_Download_Speed,Crt_Mean_Up_Speed,Crt_std_Up_Speed
0,0,Alberta,46933,88.96022,99.179564,33.174589,51.976972,28645,27.33587,29.772997,6.957689,8.556512
1,1,British Columbia / Colombie-Britannique,37060,150.566838,124.436208,64.355235,72.662968,11520,30.925174,34.779705,9.908247,12.759966
2,2,Manitoba,20194,81.436318,86.908821,31.959493,59.413703,12260,30.634502,33.202787,6.107259,11.330834
3,3,New Brunswick / Nouveau-Brunswick,13229,141.915715,138.299939,39.408723,58.84547,5370,28.741527,38.468404,5.584171,11.586166
4,4,Newfoundland and Labrador / Terre-Neuve-et-Lab...,6747,141.556544,120.608244,43.766859,63.201958,2912,56.487294,81.856997,5.619849,8.362949
5,5,Northwest Territories / Territoires du Nord-Ouest,381,67.425197,52.31411,19.385827,32.127963,188,29.989362,28.431546,7.648936,9.970767
6,6,Nova Scotia / Nouvelle-Écosse,15962,129.114522,115.591789,46.401704,65.876107,6678,43.063193,63.232726,6.681342,10.638982
7,7,Nunavut,87,15.252874,29.994658,3.54023,9.317157,83,10.192771,8.200919,2.819277,8.744538
8,8,Ontario,108300,89.650194,101.872474,29.222068,56.631427,64688,25.32009,31.405652,4.632668,7.76883
9,9,Prince Edward Island / Île-du-Prince-Édouard,4358,96.897659,101.607281,42.96994,78.797689,2438,34.650943,50.58211,5.567268,9.595865
