
# Speedtest Data of the Philippines - Datasets from Ookla, GADM Maps and NTC


### Import the libraries used in this project: datetime, geopandas, pandas, numpy and sweetviz

In [1]:
from datetime import datetime

import geopandas as gp
import pandas as pd
import numpy as np
import sweetviz as sv

### Load the Philippine boundaries from GADM.org as a GeoDataFrame (a DataFrame with a geometry column). For this project, we use the shapefile's provinces layer. We checked the GeoDataFrame for null values and found none in the NAME_1 and geometry columns, which are the columns used later to join with the Ookla data.

In [2]:
# Read the "gadm36_PHL_1" layer out of the zipped GADM shapefile into a
# GeoDataFrame, then show it as the cell's output.
pinas = gp.read_file(
    "gadm36_PHL_shp.zip",
    layer="gadm36_PHL_1",
)
pinas

Unnamed: 0,GID_0,NAME_0,GID_1,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,geometry
0,PHL,Philippines,PHL.1_1,Abra,,,Lalawigan|Probinsya,Province,1401,PH.AB,"POLYGON ((120.77473 17.16318, 120.76637 17.159..."
1,PHL,Philippines,PHL.2_1,Agusan del Norte,,,Lalawigan|Probinsya,Province,1602,PH.AN,"POLYGON ((125.44627 8.66631, 125.44199 8.66333..."
2,PHL,Philippines,PHL.3_1,Agusan del Sur,,,Lalawigan|Probinsya,Province,1603,PH.AS,"POLYGON ((125.90923 7.99845, 125.90176 7.99794..."
3,PHL,Philippines,PHL.4_1,Aklan,,,Lalawigan|Probinsya,Province,604,PH.AK,"MULTIPOLYGON (((122.42083 11.63194, 122.42000 ..."
4,PHL,Philippines,PHL.5_1,Albay,,,Lalawigan|Probinsya,Province,505,PH.AL,"MULTIPOLYGON (((123.28764 13.04923, 123.28686 ..."
...,...,...,...,...,...,...,...,...,...,...,...
76,PHL,Philippines,PHL.77_1,Tawi-Tawi,,,Lalawigan|Probinsya,Province,1570,PH.TT,"MULTIPOLYGON (((119.46694 4.58694, 119.46639 4..."
77,PHL,Philippines,PHL.78_1,Zambales,,,Lalawigan|Probinsya,Province,155,PH.PN,"MULTIPOLYGON (((120.08285 14.75048, 120.08222 ..."
78,PHL,Philippines,PHL.79_1,Zamboanga del Norte,,,Lalawigan|Probinsya,Province,972,PH.ZN,"MULTIPOLYGON (((122.09467 7.53152, 122.09467 7..."
79,PHL,Philippines,PHL.80_1,Zamboanga del Sur,,,Lalawigan|Probinsya,Province,973,PH.ZS,"MULTIPOLYGON (((122.29816 6.87506, 122.29816 6..."


In [3]:
# Count the missing values in every column of the boundaries GeoDataFrame.
pinas.isna().sum()

GID_0         0
NAME_0        0
GID_1         0
NAME_1        0
VARNAME_1    78
NL_NAME_1    81
TYPE_1        0
ENGTYPE_1     0
CC_1          0
HASC_1        0
geometry      0
dtype: int64

### Load the data from Ookla for mobile networks. For this project, we use a single file covering one quarter (Q2 2020) to simulate the process. We also check this DataFrame for nulls; the joined Philippine subset further below contains none.

In [4]:
# Read the zipped Ookla tile file into a GeoDataFrame, then print a preview
# of the first rows followed by the (rows, columns) shape.
q2m2020 = gp.read_file("2020mobile_q2.zip")
for preview in (q2m2020.head(), q2m2020.shape):
    print(preview)

            quadkey  avg_d_kbps  avg_u_kbps  avg_lat_ms  tests  devices  \
0  1203022122320032       28772        3165          34      8        1   
1  0313113213321131       20782       10180          54      2        2   
2  1221210331312333       22690       22416         449      6        2   
3  1200312211223323       54493        4635          21      2        2   
4  0302233220203221       90669        6576          21      1        1   

                                            geometry  
0  POLYGON ((24.09302 49.88402, 24.09851 49.88402...  
1  POLYGON ((-1.49963 52.95526, -1.49414 52.95526...  
2  POLYGON ((30.88806 29.92161, 30.89355 29.92161...  
3  POLYGON ((18.00110 59.35840, 18.00659 59.35840...  
4  POLYGON ((-81.51306 41.31908, -81.50757 41.319...  
(4075861, 7)


In [None]:
# Count the missing values in every column of the Ookla tile GeoDataFrame.
q2m2020.isna().sum()

### Spatially join the Ookla dataset with the Philippine boundaries to keep only the tiles that intersect a Philippine province. The result has one row per province–tile pair, carrying the province's name and geometry.

In [6]:
# Inner spatial join: keep every (province, tile) pair whose geometries
# intersect. Only the province name and geometry are carried over from the
# boundaries frame; all tile attributes come from the Ookla frame.
# NOTE(review): with the 'intersects' predicate a tile that touches more than
# one province is matched to each of them - confirm this is intended.
ph_q2m2020 = gp.sjoin(
    left_df=pinas[['NAME_1', 'geometry']],
    right_df=q2m2020,
    how="inner",
    predicate='intersects',
)
for preview in (ph_q2m2020.head(), ph_q2m2020.shape):
    print(preview)

  NAME_1                                           geometry  index_right  \
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1261541   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1967053   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1253810   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1398167   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1085273   

            quadkey  avg_d_kbps  avg_u_kbps  avg_lat_ms  tests  devices  
0  1323012313010210         659          89          74      1        1  
0  1323012311223331         115          73          82      2        1  
0  1323012311233012         567        5522          33      5        1  
0  1323012311223110       46888       17430          32      8        2  
0  1323012311233010        1767       14049          33      3        2  
(43610, 9)


In [7]:
# Count the missing values in every column of the joined province/tile frame.
ph_q2m2020.isna().sum()

NAME_1         0
geometry       0
index_right    0
quadkey        0
avg_d_kbps     0
avg_u_kbps     0
avg_lat_ms     0
tests          0
devices        0
dtype: int64

### The next step is to merge the data with the list of provinces and regions in order to match each province to its corresponding region.

In [8]:
# Load the province -> region lookup table (columns NAME_1 and Region) and
# show it as the cell's output.
phregions = pd.read_csv(filepath_or_buffer="provreg.csv")
phregions

Unnamed: 0,NAME_1,Region
0,Abra,CAR
1,Agusan del Norte,Region XIII
2,Agusan del Sur,Region XIII
3,Aklan,Region VI
4,Albay,Region V
...,...,...
77,Zambales,Region III
78,Zamboanga del Norte,Region IX
79,Zamboanga del Sur,Region IX
80,Zamboanga Sibugay,Region IX


In [9]:
# Attach each province's region with an inner merge on NAME_1.
# An inner merge drops every row whose NAME_1 has no counterpart in the lookup
# table, so list those provinces first instead of losing their tiles silently
# (e.g. when a province is spelled differently in the two sources).
provinces_without_region = sorted(
    set(ph_q2m2020["NAME_1"]) - set(phregions["NAME_1"])
)
if provinces_without_region:
    dropped_rows = int(ph_q2m2020["NAME_1"].isin(provinces_without_region).sum())
    print(
        f"WARNING: {dropped_rows} rows from {len(provinces_without_region)} "
        f"province(s) have no region in provreg.csv and will be dropped: "
        f"{provinces_without_region}"
    )

ph_q2m2020_regions = pd.merge(ph_q2m2020, phregions, on="NAME_1", how="inner")


### We added columns that convert the average download and upload speeds from kbps to Mbps to make the figures easier to read.

In [10]:
# Add download/upload speed columns expressed in Mbps, derived from the kbps
# columns. The cell is safe to re-run: the new columns are always recomputed
# from the untouched kbps columns.
KBPS_PER_MBPS = 1000

ph_q2m2020_regions['avg_d_mbps'] = ph_q2m2020_regions['avg_d_kbps'] / KBPS_PER_MBPS
ph_q2m2020_regions['avg_u_mbps'] = ph_q2m2020_regions['avg_u_kbps'] / KBPS_PER_MBPS

# Only the last expression of a cell is displayed, so the former bare
# `.head()` call before this line produced no output and has been removed.
ph_q2m2020_regions.shape

(42572, 12)

### We used groupby "Region" to further simplify the data into regions, and aggregate statistics to get the mean of the upload speed, download speed and latency, and the sum of the number of tests and devices per region. Using the aggregate function also drops the unnecessary columns in the process, such as the geometry and index columns, and returns only the columns specified in the expression. We then renamed the columns for proper presentation using a dictionary and the rename function.

In [11]:
# Presentation labels for the aggregated columns (source column -> label).
d = {
    'tests': 'Tests (Q2-2020)',
    'devices': 'Devices (Q2-2020)',
    'avg_lat_ms': 'Ave Latency(ms) (Q2-2020)',
    'avg_d_mbps': 'Ave Download(Mbps) (Q2-2020)',
    'avg_u_mbps': 'Ave Upload(Mbps) (Q2-2020)',
}

# Per-region statistics: counts are summed, while latency and speeds are
# averaged as a plain (unweighted) mean over the rows of each region.
region_stats = ph_q2m2020_regions.groupby('Region').agg(
    {
        'tests': 'sum',
        'devices': 'sum',
        'avg_lat_ms': 'mean',
        'avg_d_mbps': 'mean',
        'avg_u_mbps': 'mean',
    }
)
ph_q2m2020_regions_grp = region_stats.rename(columns=d)
ph_q2m2020_regions_grp



Unnamed: 0_level_0,Tests (Q2-2020),Devices (Q2-2020),Ave Latency(ms) (Q2-2020),Ave Download(Mbps) (Q2-2020),Ave Upload(Mbps) (Q2-2020)
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARMM,5679,1540,62.764811,6.486278,5.363266
CAR,9908,3226,59.975,6.811072,4.386259
NCR,255267,76951,32.713349,19.970162,6.483845
Region I,24692,7505,43.607672,9.488924,5.105348
Region II,13420,4687,47.470912,9.69683,5.652175
Region III,106067,31067,44.986689,10.725784,5.198725
Region IV-A,174195,47330,43.329813,10.893979,5.12733
Region IV-B,8582,2515,57.086505,7.562458,5.831773
Region IX,10660,2848,50.334858,9.756499,6.580231
Region V,15860,5188,49.251701,7.492146,5.746057


In [12]:
# Write the regional summary to a CSV file in the working directory, then
# show the dtype of each summary column as the cell's output.
ph_q2m2020_regions_grp.to_csv(path_or_buf="ph_q2m2020_regions_grp.csv")
ph_q2m2020_regions_grp.dtypes

Tests (Q2-2020)                   int64
Devices (Q2-2020)                 int64
Ave Latency(ms) (Q2-2020)       float64
Ave Download(Mbps) (Q2-2020)    float64
Ave Upload(Mbps) (Q2-2020)      float64
dtype: object