In [1]:
# Install shapely to deal with geospatial data
!pip install shapely

Collecting shapely
  Downloading Shapely-1.8.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: shapely
Successfully installed shapely-1.8.2


In [7]:
!kaggle datasets download -d dhruvildave/ookla-internet-speed-dataset -f "2020-q2/2020-04-01_performance_mobile_tiles.parquet"
!unzip 2020-04-01_performance_mobile_tiles.parquet.zip
!rm 2020-04-01_performance_mobile_tiles.parquet.zip

2020-04-01_performance_mobile_tiles.parquet.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  2020-04-01_performance_mobile_tiles.parquet.zip
  inflating: 2020-04-01_performance_mobile_tiles.parquet  


In [3]:
# Initialize PySpark: reuse the active session if there is one, otherwise create it.
# NOTE(review): the app name looks like a leftover from another notebook — confirm.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("airline-delay-analysis")
    .getOrCreate()
)

Picked up JAVA_TOOL_OPTIONS:  -Xmx3435m
Picked up JAVA_TOOL_OPTIONS:  -Xmx3435m
22/05/13 17:47:39 WARN Utils: Your hostname, fawazalesay-pysparkairl-ck4s7ekwcta resolves to a loopback address: 127.0.0.1; using 10.0.5.2 instead (on interface ceth0)
22/05/13 17:47:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/13 17:47:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Needed to make Jupyter work with Gitpod: make 'iframe_connected' the default
# plotly renderer for every figure shown in this notebook.
import plotly.io as pio
pio.renderers.default = 'iframe_connected'

In [8]:
# Load the downloaded parquet file into a Spark DataFrame
PARQUET_PATH = "2020-04-01_performance_mobile_tiles.parquet"
df = spark.read.parquet(PARQUET_PATH)

# Inspect the column types, then preview the first 5 rows
df.printSchema()
df.show(n=5)

root
 |-- quadkey: string (nullable = true)
 |-- tile: string (nullable = true)
 |-- avg_d_kbps: long (nullable = true)
 |-- avg_u_kbps: long (nullable = true)
 |-- avg_lat_ms: long (nullable = true)
 |-- tests: long (nullable = true)
 |-- devices: long (nullable = true)

+----------------+--------------------+----------+----------+----------+-----+-------+
|         quadkey|                tile|avg_d_kbps|avg_u_kbps|avg_lat_ms|tests|devices|
+----------------+--------------------+----------+----------+----------+-----+-------+
|1203022122320032|POLYGON((24.09301...|     28772|      3165|        34|    8|      1|
|0313113213321131|POLYGON((-1.49963...|     20782|     10180|        54|    2|      2|
|1221210331312333|POLYGON((30.88806...|     22690|     22416|       449|    6|      2|
|1200312211223323|POLYGON((18.00109...|     54493|      4635|        21|    2|      2|
|0302233220203221|POLYGON((-81.5130...|     90669|      6576|        21|    1|      1|
+----------------+-------------

In [9]:
from shapely import wkt
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf


@udf(returnType=DoubleType())
def longitude(polygon: str):
    """Return the x coordinate (longitude) of a WKT geometry's centroid.

    Args:
        polygon: WKT text of the geometry, e.g. a value of the ``tile`` column.

    Returns:
        The centroid's x coordinate, or None when ``polygon`` is None.
    """
    # The tile column is nullable; wkt.loads rejects non-string input, so map a
    # null tile to a null result instead of failing the whole Spark job.
    if polygon is None:
        return None
    return wkt.loads(polygon).centroid.x

@udf(returnType=DoubleType())
def latitude(polygon: str):
    """Return the y coordinate (latitude) of a WKT geometry's centroid.

    Args:
        polygon: WKT text of the geometry, e.g. a value of the ``tile`` column.

    Returns:
        The centroid's y coordinate, or None when ``polygon`` is None.
    """
    # The tile column is nullable; wkt.loads rejects non-string input, so map a
    # null tile to a null result instead of failing the whole Spark job.
    if polygon is None:
        return None
    return wkt.loads(polygon).centroid.y

In [10]:
# Attach the centroid coordinates of each tile as "longitude" and "latitude"
df = (
    df
    .withColumn("longitude", longitude(df["tile"]))
    .withColumn("latitude", latitude(df["tile"]))
)

# Preview the first 5 rows, then print the schema with the new columns
df.show(5)
df.printSchema()

[Stage 2:>                                                          (0 + 1) / 1]

+----------------+--------------------+----------+----------+----------+-----+-------+------------------+------------------+
|         quadkey|                tile|avg_d_kbps|avg_u_kbps|avg_lat_ms|tests|devices|         longitude|          latitude|
+----------------+--------------------+----------+----------+----------+-----+-------+------------------+------------------+
|1203022122320032|POLYGON((24.09301...|     28772|      3165|        34|    8|      1| 24.09576416015625| 49.88224742799456|
|0313113213321131|POLYGON((-1.49963...|     20782|     10180|        54|    2|      2| -1.49688720703125|52.953602268373295|
|1221210331312333|POLYGON((30.88806...|     22690|     22416|       449|    6|      2| 30.89080810546875|29.919232776382895|
|1200312211223323|POLYGON((18.00109...|     54493|      4635|        21|    2|      2| 18.00384521484375|59.356996008027856|
|0302233220203221|POLYGON((-81.5130...|     90669|      6576|        21|    1|      1|-81.51031494140625|41.317012752730506|


                                                                                

In [11]:
# Drop the "tile" and "quadkey" columns in a single call
df = df.drop("tile", "quadkey")
df.printSchema()


# Discard rows that contain nulls and report how many rows remain
df = df.dropna()
df.count()

root
 |-- avg_d_kbps: long (nullable = true)
 |-- avg_u_kbps: long (nullable = true)
 |-- avg_lat_ms: long (nullable = true)
 |-- tests: long (nullable = true)
 |-- devices: long (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



                                                                                

4075861

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler

# KMeans reads its input from a single vector column ("features" by default).
# `df` only has plain numeric columns, so fitting on it directly fails;
# assemble the numeric columns into that vector column first.
FEATURE_COLUMNS = [
    "avg_d_kbps",
    "avg_u_kbps",
    "avg_lat_ms",
    "tests",
    "devices",
    "longitude",
    "latitude",
]
assembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol="features")
features_df = assembler.transform(df)

# NOTE(review): the features are assembled unscaled, so columns with large
# magnitudes will weigh more in the distance computation — confirm whether
# scaling or a narrower feature set is intended.
# k=3 as before; the explicit seed keeps the fitted centroids reproducible.
kmeans = KMeans(featuresCol="features", seed=42).setK(3)

model = kmeans.fit(features_df)


