
# Speedtest Data of Philippines - Datasets from Ookla, GADM Maps and NTC


### Import the libraries used in this project: datetime, geopandas, pandas, numpy and sweetviz

In [53]:
from datetime import datetime

import geopandas as gp
import pandas as pd
import numpy as np
import sweetviz as sv

### Load the Philippine boundaries from GADM.org as a GeoDataFrame (a DataFrame with a geometry column). For this project, we use the provinces layer of the shapefile. We checked the GeoDataFrame for null values and found none in the NAME_1 and geometry columns, which will be used later to merge with the Ookla data.

In [54]:
# Read the province-level layer of the GADM Philippines shapefile archive.
# gp.read_file is the reader that GeoDataFrame.from_file delegates to.
pinas = gp.read_file("gadm36_PHL_shp.zip", layer="gadm36_PHL_1")
pinas

Unnamed: 0,GID_0,NAME_0,GID_1,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,geometry
0,PHL,Philippines,PHL.1_1,Abra,,,Lalawigan|Probinsya,Province,1401,PH.AB,"POLYGON ((120.77473 17.16318, 120.76637 17.159..."
1,PHL,Philippines,PHL.2_1,Agusan del Norte,,,Lalawigan|Probinsya,Province,1602,PH.AN,"POLYGON ((125.44627 8.66631, 125.44199 8.66333..."
2,PHL,Philippines,PHL.3_1,Agusan del Sur,,,Lalawigan|Probinsya,Province,1603,PH.AS,"POLYGON ((125.90923 7.99845, 125.90176 7.99794..."
3,PHL,Philippines,PHL.4_1,Aklan,,,Lalawigan|Probinsya,Province,604,PH.AK,"MULTIPOLYGON (((122.42083 11.63194, 122.42000 ..."
4,PHL,Philippines,PHL.5_1,Albay,,,Lalawigan|Probinsya,Province,505,PH.AL,"MULTIPOLYGON (((123.28764 13.04923, 123.28686 ..."
...,...,...,...,...,...,...,...,...,...,...,...
76,PHL,Philippines,PHL.77_1,Tawi-Tawi,,,Lalawigan|Probinsya,Province,1570,PH.TT,"MULTIPOLYGON (((119.46694 4.58694, 119.46639 4..."
77,PHL,Philippines,PHL.78_1,Zambales,,,Lalawigan|Probinsya,Province,155,PH.PN,"MULTIPOLYGON (((120.08285 14.75048, 120.08222 ..."
78,PHL,Philippines,PHL.79_1,Zamboanga del Norte,,,Lalawigan|Probinsya,Province,972,PH.ZN,"MULTIPOLYGON (((122.09467 7.53152, 122.09467 7..."
79,PHL,Philippines,PHL.80_1,Zamboanga del Sur,,,Lalawigan|Probinsya,Province,973,PH.ZS,"MULTIPOLYGON (((122.29816 6.87506, 122.29816 6..."


In [3]:
# Count missing values per column of the province boundaries.
pinas.isna().sum()

GID_0         0
NAME_0        0
GID_1         0
NAME_1        0
VARNAME_1    78
NL_NAME_1    81
TYPE_1        0
ENGTYPE_1     0
CC_1          0
HASC_1        0
geometry      0
dtype: int64

### Load the speed test data from Ookla. For this project, we use a single file covering one quarter of 2019 to simulate the process; note that the file loaded below (`2019mobile_q4.zip`) is the Q4 2019 mobile tile set rather than fixed broadband. We also checked this GeoDataFrame for nulls and found none.

In [55]:
# Read the Ookla tile archive into a GeoDataFrame (one row per tile),
# then print a five-row preview followed by the (rows, columns) shape.
q4m2019 = gp.read_file("2019mobile_q4.zip")
print(q4m2019.head(), q4m2019.shape, sep="\n")

            quadkey  avg_d_kbps  avg_u_kbps  avg_lat_ms  tests  devices  \
0  1231213033210310        9168        2497          50     87       38   
1  0323002121232202       34506       18960          72     27       17   
2  3100131213200103       17265       11726          43    265       57   
3  2103111020030103       27042       23017          31      3        3   
4  1221012331221022       60573       38125          25      1        1   

                                            geometry  
0  POLYGON ((77.11304 28.43971, 77.11853 28.43971...  
1  POLYGON ((-65.65430 18.32846, -65.64880 18.328...  
2  POLYGON ((110.76965 -7.54221, 110.77515 -7.542...  
3  POLYGON ((-47.69714 -22.67992, -47.69165 -22.6...  
4  POLYGON ((30.62988 36.93672, 30.63538 36.93672...  
(3799244, 7)


In [56]:
# Count missing values per column of the Ookla tiles.
q4m2019.isna().sum()

quadkey       0
avg_d_kbps    0
avg_u_kbps    0
avg_lat_ms    0
tests         0
devices       0
geometry      0
dtype: int64

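### Spatially join the Ookla tiles with the Philippine province boundaries, keeping every tile that intersects a province. Each resulting row pairs one tile's measurements with the name and geometry of a province it intersects, so only provinces with at least 1 tile appear.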

In [57]:
# Spatial inner join: keep every (province, tile) pair whose geometries intersect.
# The left side is the province name + polygon, so the result carries the
# province geometry and the tile's attributes (the tile index lands in index_right).
# NOTE(review): a tile that intersects more than one province produces one row
# per province, so border tiles are counted in each of them - confirm this is intended.
ph_q4m2019 = pinas[['NAME_1','geometry']].sjoin(q4m2019, how="inner", predicate='intersects')
print(ph_q4m2019.head(), ph_q4m2019.shape, sep="\n")

  NAME_1                                           geometry  index_right  \
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1637824   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1737207   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1186305   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1717286   
0   Abra  POLYGON ((120.77473 17.16318, 120.76637 17.159...      1111777   

            quadkey  avg_d_kbps  avg_u_kbps  avg_lat_ms  tests  devices  
0  1323012313102022        4707        7858          53      1        1  
0  1323012313002103       11582        7178          46     22       18  
0  1323012313002100        3290        3948          48      5        3  
0  1323012313002111       13992        7877          70      8        2  
0  1323012313001332       11806        6032          19      1        1  
(28923, 9)


In [58]:
# Count missing values per column of the joined province/tile table.
ph_q4m2019.isna().sum()

NAME_1         0
geometry       0
index_right    0
quadkey        0
avg_d_kbps     0
avg_u_kbps     0
avg_lat_ms     0
tests          0
devices        0
dtype: int64

### The next step is to merge the data with the list of provinces and regions in order to match each province to its corresponding region.

In [59]:
# Province -> region lookup table (columns NAME_1 and Region), read from a local CSV.
phregions = pd.read_csv(filepath_or_buffer="provreg.csv")
phregions

Unnamed: 0,NAME_1,Region
0,Abra,CAR
1,Agusan del Norte,Region XIII
2,Agusan del Sur,Region XIII
3,Aklan,Region VI
4,Albay,Region V
...,...,...
77,Zambales,Region III
78,Zamboanga del Norte,Region IX
79,Zamboanga del Sur,Region IX
80,Zamboanga Sibugay,Region IX


In [60]:
# Attach a Region to every joined tile by matching on the province name (NAME_1).
# The inner merge silently discards tiles whose province name has no exact match
# in provreg.csv, so list those names first to make any loss visible instead of
# letting it shrink the regional totals unnoticed.
unmatched_provinces = sorted(set(ph_q4m2019["NAME_1"]) - set(phregions["NAME_1"]))
if unmatched_provinces:
    print(
        f"WARNING: no region mapping for {len(unmatched_provinces)} province name(s); "
        f"their tiles are dropped by the merge: {unmatched_provinces}"
    )

ph_q4m2019_regions = pd.merge(ph_q4m2019, phregions, on="NAME_1", how="inner")
ph_q4m2019_regions

Unnamed: 0,NAME_1,geometry,index_right,quadkey,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,Region
0,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",1637824,1323012313102022,4707,7858,53,1,1,CAR
1,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",1737207,1323012313002103,11582,7178,46,22,18,CAR
2,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",1186305,1323012313002100,3290,3948,48,5,3,CAR
3,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",1717286,1323012313002111,13992,7877,70,8,2,CAR
4,Abra,"POLYGON ((120.77473 17.16318, 120.76637 17.159...",1111777,1323012313001332,11806,6032,19,1,1,CAR
...,...,...,...,...,...,...,...,...,...,...
28240,Zamboanga Sibugay,"MULTIPOLYGON (((122.85805 7.27653, 122.85837 7...",1747798,1323213121231203,3686,578,106,2,1,Region IX
28241,Zamboanga Sibugay,"MULTIPOLYGON (((122.85805 7.27653, 122.85837 7...",3398903,1323213121323003,37347,22680,28,1,1,Region IX
28242,Zamboanga Sibugay,"MULTIPOLYGON (((122.85805 7.27653, 122.85837 7...",3493675,1323213121230122,10423,10594,31,1,1,Region IX
28243,Zamboanga Sibugay,"MULTIPOLYGON (((122.85805 7.27653, 122.85837 7...",3473477,1323213121301302,6330,27623,62,1,1,Region IX


### We added columns that convert the average upload and download speeds from kbps to Mbps to make the figures easier to read.

In [61]:
# Derive Mbps columns from the kbps averages (divide by 1000).
# .assign returns a new frame, so re-running this cell gives the same result.
ph_q4m2019_regions = ph_q4m2019_regions.assign(
    avg_d_mbps=ph_q4m2019_regions['avg_d_kbps'] / 1000,
    avg_u_mbps=ph_q4m2019_regions['avg_u_kbps'] / 1000,
)
# Only the last expression of a cell is displayed, so a bare .head() placed
# before it would be discarded; show the shape to confirm the two new columns.
ph_q4m2019_regions.shape

(28245, 12)

### We used groupby on "Region" to further simplify the data into regions, and aggregate statistics to get the mean of upload speed, download speed and latency, and the sum of the number of tests and devices per region. The aggregate function also drops the columns not named in the expression, such as the geometry and index columns, and returns only the specified columns. We then renamed the columns for proper presentation using a dictionary and the rename function.

In [62]:
# Presentation labels for the aggregated columns (source column -> display name).
d = {
    'tests': 'Tests (Q4-2019)',
    'devices': 'Devices (Q4-2019)',
    'avg_lat_ms': 'Ave Latency(ms) (Q4-2019)',
    'avg_d_mbps': 'Ave Download(Mbps) (Q4-2019)',
    'avg_u_mbps': 'Ave Upload(Mbps) (Q4-2019)',
}

# One row per Region: counts are summed, latency and speeds are averaged.
# Columns not listed in the aggregation (geometry, index_right, ...) are dropped.
# NOTE(review): the means are plain averages of per-tile averages, so every tile
# weighs the same regardless of how many tests it holds - confirm this is intended.
ph_q4m2019_regions_grp = (
    ph_q4m2019_regions
    .groupby('Region')
    .agg({
        'tests': 'sum',
        'devices': 'sum',
        'avg_lat_ms': 'mean',
        'avg_d_mbps': 'mean',
        'avg_u_mbps': 'mean',
    })
    .rename(columns=d)
)
ph_q4m2019_regions_grp



Unnamed: 0_level_0,Tests (Q4-2019),Devices (Q4-2019),Ave Latency(ms) (Q4-2019),Ave Download(Mbps) (Q4-2019),Ave Upload(Mbps) (Q4-2019)
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARMM,2013,706,68.943709,8.977278,6.076411
CAR,4779,2766,48.615101,12.919834,6.868661
NCR,129214,72121,35.025286,22.680549,7.957683
Region I,9902,5362,46.726946,15.308816,7.748805
Region II,4973,2687,57.442228,15.652032,8.374384
Region III,36976,19980,47.626112,16.241158,7.947769
Region IV-A,54371,28281,46.228031,15.788708,7.270606
Region IV-B,2929,1564,47.978528,12.908262,7.37535
Region IX,3861,1876,65.468278,13.582476,7.117621
Region V,6830,3487,50.971356,12.597311,7.242243


In [63]:
# Save the regional summary next to the notebook; the Region index is written
# as the first CSV column (to_csv writes the index by default).
ph_q4m2019_regions_grp.to_csv(path_or_buf="ph_q4m2019_regions_grp.csv")
# Show the column dtypes of the summary table.
ph_q4m2019_regions_grp.dtypes

Tests (Q4-2019)                   int64
Devices (Q4-2019)                 int64
Ave Latency(ms) (Q4-2019)       float64
Ave Download(Mbps) (Q4-2019)    float64
Ave Upload(Mbps) (Q4-2019)      float64
dtype: object