
# Speedtest Data of Philippines - Datasets from Ookla, GADM Maps and NTC


### Import libraries to be used for this project: numpy, pandas and geopandas

In [11]:
from datetime import datetime

import geopandas as gp
import pandas as pd
import numpy as np
import sweetviz as sv

### Load the Philippine Boundaries from GADM.org as a GeoDataFrame (DataFrame with geometry column). For this project, we are going to use the Shapefile with Provinces Layer. We checked the DataFrame for null values and found no null values on the NAME_1 and geometry columns which is going to be used to be merged later with the Ookla data

In [12]:
pinas = gp.GeoDataFrame.from_file("gadm36_PHL_shp.zip", layer="gadm36_PHL_1")
pinas

Unnamed: 0,GID_0,NAME_0,GID_1,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,geometry
0,PHL,Philippines,PHL.1_1,Abra,,,Lalawigan|Probinsya,Province,1401,PH.AB,"POLYGON ((120.77473 17.16318, 120.76637 17.159..."
1,PHL,Philippines,PHL.2_1,Agusan del Norte,,,Lalawigan|Probinsya,Province,1602,PH.AN,"POLYGON ((125.44627 8.66631, 125.44199 8.66333..."
2,PHL,Philippines,PHL.3_1,Agusan del Sur,,,Lalawigan|Probinsya,Province,1603,PH.AS,"POLYGON ((125.90923 7.99845, 125.90176 7.99794..."
3,PHL,Philippines,PHL.4_1,Aklan,,,Lalawigan|Probinsya,Province,604,PH.AK,"MULTIPOLYGON (((122.42083 11.63194, 122.42000 ..."
4,PHL,Philippines,PHL.5_1,Albay,,,Lalawigan|Probinsya,Province,505,PH.AL,"MULTIPOLYGON (((123.28764 13.04923, 123.28686 ..."
...,...,...,...,...,...,...,...,...,...,...,...
76,PHL,Philippines,PHL.77_1,Tawi-Tawi,,,Lalawigan|Probinsya,Province,1570,PH.TT,"MULTIPOLYGON (((119.46694 4.58694, 119.46639 4..."
77,PHL,Philippines,PHL.78_1,Zambales,,,Lalawigan|Probinsya,Province,155,PH.PN,"MULTIPOLYGON (((120.08285 14.75048, 120.08222 ..."
78,PHL,Philippines,PHL.79_1,Zamboanga del Norte,,,Lalawigan|Probinsya,Province,972,PH.ZN,"MULTIPOLYGON (((122.09467 7.53152, 122.09467 7..."
79,PHL,Philippines,PHL.80_1,Zamboanga del Sur,,,Lalawigan|Probinsya,Province,973,PH.ZS,"MULTIPOLYGON (((122.29816 6.87506, 122.29816 6..."


In [13]:
pinas.isnull().sum()

GID_0         0
NAME_0        0
GID_1         0
NAME_1        0
VARNAME_1    78
NL_NAME_1    81
TYPE_1        0
ENGTYPE_1     0
CC_1          0
HASC_1        0
geometry      0
dtype: int64

# Load the data from Ookla for Mobile Broadband. For this project, we are going to use a single file from a quarter of the year 2020 for the purpose of simulating the process. We also checked for nulls in this DataFrame and found no nulls.  

In [4]:
q3m2020 = gp.GeoDataFrame.from_file("2020-07-01_performance_mobile_tiles.zip")
print(q3m2020.head())
print(q3m2020.shape)

            quadkey  avg_d_kbps  avg_u_kbps  avg_lat_ms  tests  devices  \
0  1203101011003021       31204       15767          23    180       24   
1  1202301033011222       78512        7859          19      1        1   
2  1202302310211330       12586        6035          44     30        4   
3  0231321020321002       55903       22395          32     16       10   
4  1203101011223332       25298       14883          21     21       18   

                                            geometry  
0  POLYGON ((37.66663 55.74567, 37.67212 55.74567...  
1  POLYGON ((15.24902 48.19905, 15.25452 48.19905...  
2  POLYGON ((13.52417 45.92823, 13.52966 45.92823...  
3  POLYGON ((-98.21777 26.18995, -98.21228 26.189...  
4  POLYGON ((37.69409 55.58145, 37.69958 55.58145...  
(4340413, 7)


In [14]:
q3m2020.isnull().sum()

NAME_1         0
geometry       0
index_right    0
quadkey        0
avg_d_kbps     0
avg_u_kbps     0
avg_lat_ms     0
tests          0
devices        0
dtype: int64

### Join the Ookla Datasets with Philippine Boundaries to get the geometries of Philippine Provinces with at least 1 tile. 

In [31]:
ph_q3m2020 = gp.sjoin(pinas[['NAME_1','geometry']], q3m2020, how="inner", predicate='intersects')
print(ph_q3m2020.head())
print(ph_q3m2020.shape)

     NAME_1_left                                           geometry  \
0           Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...   
6         Apayao  POLYGON ((121.43056 17.72600, 121.43214 17.724...   
17       Cagayan  MULTIPOLYGON (((121.41695 18.46305, 121.41695 ...   
33  Ilocos Norte  MULTIPOLYGON (((120.41228 17.91453, 120.41189 ...   
37       Kalinga  POLYGON ((121.46043 17.29165, 121.44605 17.293...   

    index_right NAME_1_right           quadkey  avg_d_kbps  avg_u_kbps  \
0             6       Apayao  1323013201022111         457        1071   
6             6       Apayao  1323013201022111         457        1071   
17            6       Apayao  1323013201022111         457        1071   
33            6       Apayao  1323013201022111         457        1071   
37            6       Apayao  1323013201022111         457        1071   

    avg_lat_ms  tests  devices  
0           48      1        1  
6           48      1        1  
17          48      1        

In [32]:
ph_q3m2020.isnull().sum()

NAME_1_left     0
geometry        0
index_right     0
NAME_1_right    0
quadkey         0
avg_d_kbps      0
avg_u_kbps      0
avg_lat_ms      0
tests           0
devices         0
dtype: int64

### Next step would be to merge the data with the list of provinces and regions to in order to match a certain province to its corresponding region. 

In [40]:
phregions = pd.read_excel("Regions.xlsx")
phregions

Unnamed: 0,NAME_1_left,Region
0,,
1,Abra,CAR
2,Agusan del Norte,Region XIII
3,Agusan del Sur,Region XIII
4,Aklan,Region VI
...,...,...
78,Zambales,Region III
79,Zamboanga del Norte,Region IX
80,Zamboanga del Sur,Region IX
81,Zamboanga Sibugay,Region IX


In [41]:
ph_q3m2020_regions = pd.merge(ph_q3m2020,phregions, on="NAME_1_left", how="inner")
ph_q3m2020_regions

Unnamed: 0,NAME_1_left,geometry,index_right,NAME_1_right,quadkey,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,Region
0,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",6,Apayao,1323013201022111,457,1071,48,1,1,CAR
1,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",6,Apayao,1323013023001321,3992,4544,38,8,2,CAR
2,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",6,Apayao,1323013023011200,776,2279,57,1,1,CAR
3,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",6,Apayao,1323013023001313,8051,1631,62,2,1,CAR
4,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",6,Apayao,1323013023010131,5638,2481,30,4,3,CAR
...,...,...,...,...,...,...,...,...,...,...,...
226997,Tawi-Tawi,"MULTIPOLYGON (((119.46694 4.58694, 119.46639 4...",76,Tawi-Tawi,1323230102303120,1923,3169,49,11,4,ARMM
226998,Tawi-Tawi,"MULTIPOLYGON (((119.46694 4.58694, 119.46639 4...",76,Tawi-Tawi,1323230102302110,557,5210,65,15,5,ARMM
226999,Tawi-Tawi,"MULTIPOLYGON (((119.46694 4.58694, 119.46639 4...",76,Tawi-Tawi,1323230103200022,229,364,87,1,1,ARMM
227000,Tawi-Tawi,"MULTIPOLYGON (((119.46694 4.58694, 119.46639 4...",76,Tawi-Tawi,1323230102303102,103,797,51,3,1,ARMM


### We added columns to convert the average upload and download speeds from KBPS to MBPS for the purpose of data simplification.

In [44]:
ph_q3m2020_regions['avg_d_mbps'] = ph_q3m2020_regions['avg_d_kbps'] / 1000
ph_q3m2020_regions['avg_u_mbps'] = ph_q3m2020_regions['avg_u_kbps'] / 1000
ph_q3m2020_regions.head()
ph_q3m2020_regions.shape

(227002, 13)

### We used groupby "Region" to further simplify the data into regions, and aggregate statistics to get the Mean of Upload Speed, Download Speeds and Latency, and the Sum of Number of Tests and Devices per Province. Using the aggregate function also drops the unnecessary columns in the process such as geometry and index columns and would only return specified columns in the expression. We then renamed the columns for proper presentation using array and rename function.

In [45]:
d = {'tests': 'Tests (Q3-2020)', 'devices': 'Devices (Q3-2020)', 'avg_lat_ms':'Ave Latency(ms) (Q3-2020)','avg_d_mbps':'Ave Download(Mbps) (Q3-2020)','avg_u_mbps':'Ave Upload(Mbps) (Q3-2020)'}
ph_q3m2020_regions_grp = ph_q3m2020_regions.groupby('Region').agg({'tests':'sum', 'devices':'sum','avg_lat_ms':'mean','avg_d_mbps':'mean', 'avg_u_mbps':'mean'}).rename(columns=d)
ph_q3m2020_regions_grp

Unnamed: 0_level_0,Tests (Q3-2020),Devices (Q3-2020),Ave Latency(ms) (Q3-2020),Ave Download(Mbps) (Q3-2020),Ave Upload(Mbps) (Q3-2020)
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARMM,30751,9882,64.233102,8.055386,5.461908
CAR,108501,41961,41.163116,10.406235,5.16361
NCR,495537,176301,36.164086,15.306313,5.491619
Region I,123724,46885,39.619337,10.68578,4.975243
Region II,94669,37066,40.703603,10.694391,5.139716
Region III,925238,337437,37.585523,12.494809,5.13763
Region IV-A,1661234,590875,38.525333,13.664159,5.158357
Region IV-B,22164,6762,44.51878,9.752096,5.693409
Region IX,36057,13065,53.847745,10.245202,6.385814
Region V,77703,28035,43.325467,10.161774,5.412788


In [46]:
ph_q3m2020_regions_grp.to_csv("ph_q3m2020_regions_grp.csv")
ph_q3m2020_regions_grp.dtypes

Tests (Q3-2020)                   int64
Devices (Q3-2020)                 int64
Ave Latency(ms) (Q3-2020)       float64
Ave Download(Mbps) (Q3-2020)    float64
Ave Upload(Mbps) (Q3-2020)      float64
dtype: object