In [1]:
# Install shapely to deal with geospatial data
!pip install shapely



In [6]:
!kaggle datasets download -d dhruvildave/ookla-internet-speed-dataset -f "2021-q1/2021-01-01_performance_mobile_tiles.parquet"
!unzip 2021-01-01_performance_mobile_tiles.parquet.zip
!rm 2021-01-01_performance_mobile_tiles.parquet.zip

2021-01-01_performance_mobile_tiles.parquet.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  2021-01-01_performance_mobile_tiles.parquet.zip
  inflating: 2021-01-01_performance_mobile_tiles.parquet  


In [1]:
# Initialize PySpark: build the session used by every cell below
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("airline-delay-analysis")
    .getOrCreate()
)

Picked up JAVA_TOOL_OPTIONS:  -Xmx3435m
Picked up JAVA_TOOL_OPTIONS:  -Xmx3435m
22/05/13 16:11:53 WARN Utils: Your hostname, fawazalesay-pysparkairl-sdsd8mdblyh resolves to a loopback address: 127.0.0.1; using 10.0.5.2 instead (on interface ceth0)
22/05/13 16:11:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/13 16:11:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/13 16:11:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/13 16:11:55 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Select plotly's "iframe_connected" renderer as the default;
# needed to make Jupyter work with Gitpod
from plotly import io as pio

pio.renderers.default = "iframe_connected"

In [5]:
# Load the downloaded parquet file into a Spark DataFrame
df = spark.read.format("parquet").load("2021-01-01_performance_mobile_tiles.parquet")

# Show the column names and types, then preview the first 5 rows
df.printSchema()
df.show(n=5)

root
 |-- quadkey: string (nullable = true)
 |-- tile: string (nullable = true)
 |-- avg_d_kbps: long (nullable = true)
 |-- avg_u_kbps: long (nullable = true)
 |-- avg_lat_ms: long (nullable = true)
 |-- tests: long (nullable = true)
 |-- devices: long (nullable = true)

+----------------+--------------------+----------+----------+----------+-----+-------+
|         quadkey|                tile|avg_d_kbps|avg_u_kbps|avg_lat_ms|tests|devices|
+----------------+--------------------+----------+----------+----------+-----+-------+
|0320102333020210|POLYGON((-76.2780...|     60057|     18502|        33|    7|      4|
|1202323320003230|POLYGON((15.52368...|     32306|      1619|        22|    2|      1|
|0231320130101122|POLYGON((-98.8989...|     40500|      8097|        39|   60|      5|
|1321231220223030|POLYGON((120.9924...|     68423|     15722|        20|  143|     52|
|1220101000310231|POLYGON((14.34265...|     31295|      7302|        37|   47|     24|
+----------------+-------------

In [11]:
from shapely import wkt
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf


@udf(returnType=DoubleType())
def longitude(polygon: str):
    """Return the x coordinate of the centroid of a WKT geometry.

    Registered as a Spark UDF producing a DoubleType column.

    Args:
        polygon: Geometry in WKT text form (e.g. ``"POLYGON((...))"``), or
            ``None`` for a null cell.

    Returns:
        The centroid's x coordinate as a float, or ``None`` when the input is
        null, so Spark stores a null instead of failing the whole job.
    """
    # The source column is nullable; wkt.loads cannot parse None.
    if polygon is None:
        return None
    return wkt.loads(polygon).centroid.x

@udf(returnType=DoubleType())
def latitude(polygon: str):
    """Return the y coordinate of the centroid of a WKT geometry.

    Registered as a Spark UDF producing a DoubleType column.

    Args:
        polygon: Geometry in WKT text form (e.g. ``"POLYGON((...))"``), or
            ``None`` for a null cell.

    Returns:
        The centroid's y coordinate as a float, or ``None`` when the input is
        null, so Spark stores a null instead of failing the whole job.
    """
    # The source column is nullable; wkt.loads cannot parse None.
    if polygon is None:
        return None
    return wkt.loads(polygon).centroid.y

In [14]:
# Derive "longitude" and "latitude" columns from the centroid of each tile polygon
df = (
    df
    .withColumn("longitude", longitude(df["tile"]))
    .withColumn("latitude", latitude(df["tile"]))
)

# Preview the first 5 rows, then confirm the two new columns in the schema
df.show(n=5)
df.printSchema()

+----------------+--------------------+----------+----------+----------+-----+-------+------------------+------------------+
|         quadkey|                tile|avg_d_kbps|avg_u_kbps|avg_lat_ms|tests|devices|         longitude|          latitude|
+----------------+--------------------+----------+----------+----------+-----+-------+------------------+------------------+
|0320102333020210|POLYGON((-76.2780...|     60057|     18502|        33|    7|      4|-76.27532958984374|  36.7894910429454|
|1202323320003230|POLYGON((15.52368...|     32306|      1619|        22|    2|      1| 15.52642822265625| 41.44890271064655|
|0231320130101122|POLYGON((-98.8989...|     40500|      8097|        39|   60|      5|-98.89617919921875|  26.4140103937502|
|1321231220223030|POLYGON((120.9924...|     68423|     15722|        20|  143|     52|120.99517822265652|24.873978115094747|
|1220101000310231|POLYGON((14.34265...|     31295|      7302|        37|   47|     24| 14.34539794921875|  40.8200450541506|


In [17]:
# Remove the "tile" and "quadkey" columns in a single call
df = df.drop("tile", "quadkey")
df.printSchema()


# Discard every row that contains a null, then report how many rows remain
df = df.dropna()
df.count()

root
 |-- avg_d_kbps: long (nullable = true)
 |-- avg_u_kbps: long (nullable = true)
 |-- avg_lat_ms: long (nullable = true)
 |-- tests: long (nullable = true)
 |-- devices: long (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)





In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler

# KMeans reads its input from a single vector column (named "features" by
# default). The DataFrame only has separate numeric columns, so they are
# assembled into one vector column before fitting.
assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
features_df = assembler.transform(df)

# Cluster the rows into 3 groups
kmeans = KMeans().setK(3)

model = kmeans.fit(features_df)


