In [2]:
# Install shapely to deal with geospatial data
!pip install shapely



In [1]:
!kaggle datasets download -d dhruvildave/ookla-internet-speed-dataset -f "2020-q2/2020-04-01_performance_mobile_tiles.parquet"
!unzip 2020-04-01_performance_mobile_tiles.parquet.zip
!rm 2020-04-01_performance_mobile_tiles.parquet.zip

Downloading 2020-04-01_performance_mobile_tiles.parquet.zip to /workspace/pyspark-airline-delay-classification
 94%|█████████████████████████████████████▋  | 241M/256M [00:05<00:00, 40.2MB/s]
100%|████████████████████████████████████████| 256M/256M [00:05<00:00, 51.1MB/s]
Archive:  2020-04-01_performance_mobile_tiles.parquet.zip
replace 2020-04-01_performance_mobile_tiles.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [8]:
# Initialize PySpark: obtain the session used by every Spark call in this notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("internet-analysis-clustering")
    .getOrCreate()
)

In [9]:
# Needed to make Jupyter work with Gitpod:
# switch Plotly's default renderer to 'iframe_connected' for the whole session.
import plotly.io as pio
pio.renderers.default = 'iframe_connected'

In [11]:
# Load the mobile-tiles parquet file into a Spark DataFrame
tiles_path = "2020-04-01_performance_mobile_tiles.parquet"
df = spark.read.parquet(tiles_path)

# Inspect the result: column types first, then a five-row preview
df.printSchema()
df.show(n=5)

root
 |-- quadkey: string (nullable = true)
 |-- tile: string (nullable = true)
 |-- avg_d_kbps: long (nullable = true)
 |-- avg_u_kbps: long (nullable = true)
 |-- avg_lat_ms: long (nullable = true)
 |-- tests: long (nullable = true)
 |-- devices: long (nullable = true)

+----------------+--------------------+----------+----------+----------+-----+-------+
|         quadkey|                tile|avg_d_kbps|avg_u_kbps|avg_lat_ms|tests|devices|
+----------------+--------------------+----------+----------+----------+-----+-------+
|1203022122320032|POLYGON((24.09301...|     28772|      3165|        34|    8|      1|
|0313113213321131|POLYGON((-1.49963...|     20782|     10180|        54|    2|      2|
|1221210331312333|POLYGON((30.88806...|     22690|     22416|       449|    6|      2|
|1200312211223323|POLYGON((18.00109...|     54493|      4635|        21|    2|      2|
|0302233220203221|POLYGON((-81.5130...|     90669|      6576|        21|    1|      1|
+----------------+-------------

In [12]:
from shapely import wkt
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf


@udf(returnType=DoubleType())
def longitude(polygon: str):
    """Return the x coordinate of the centroid of a WKT geometry.

    Args:
        polygon: Geometry in WKT form (e.g. "POLYGON((...))"), or None.

    Returns:
        The centroid's x value, or None when the input is None.
    """
    # Spark hands SQL NULLs to Python UDFs as None; the centroid expression
    # below cannot be evaluated for None, so yield a null result instead of
    # failing the whole job on a single missing tile.
    if polygon is None:
        return None
    return wkt.loads(polygon).centroid.x

@udf(returnType=DoubleType())
def latitude(polygon: str):
    """Return the y coordinate of the centroid of a WKT geometry.

    Args:
        polygon: Geometry in WKT form (e.g. "POLYGON((...))"), or None.

    Returns:
        The centroid's y value, or None when the input is None.
    """
    # Spark hands SQL NULLs to Python UDFs as None; the centroid expression
    # below cannot be evaluated for None, so yield a null result instead of
    # failing the whole job on a single missing tile.
    if polygon is None:
        return None
    return wkt.loads(polygon).centroid.y

In [13]:
# Derive each tile's centroid coordinates as new "longitude" and "latitude" columns
df = (
    df.withColumn("longitude", longitude(df["tile"]))
      .withColumn("latitude", latitude(df["tile"]))
)

# Preview the rows, then list the column types including the two new ones
df.show(n=5)
df.printSchema()

+----------------+--------------------+----------+----------+----------+-----+-------+------------------+------------------+
|         quadkey|                tile|avg_d_kbps|avg_u_kbps|avg_lat_ms|tests|devices|         longitude|          latitude|
+----------------+--------------------+----------+----------+----------+-----+-------+------------------+------------------+
|1203022122320032|POLYGON((24.09301...|     28772|      3165|        34|    8|      1| 24.09576416015625| 49.88224742799456|
|0313113213321131|POLYGON((-1.49963...|     20782|     10180|        54|    2|      2| -1.49688720703125|52.953602268373295|
|1221210331312333|POLYGON((30.88806...|     22690|     22416|       449|    6|      2| 30.89080810546875|29.919232776382895|
|1200312211223323|POLYGON((18.00109...|     54493|      4635|        21|    2|      2| 18.00384521484375|59.356996008027856|
|0302233220203221|POLYGON((-81.5130...|     90669|      6576|        21|    1|      1|-81.51031494140625|41.317012752730506|


In [14]:
# Discard the "tile" and "quadkey" columns in a single call
df = df.drop("tile", "quadkey")
df.printSchema()

# Remove every row that contains a null, then report how many rows remain
df = df.dropna()
df.count()

root
 |-- avg_d_kbps: long (nullable = true)
 |-- avg_u_kbps: long (nullable = true)
 |-- avg_lat_ms: long (nullable = true)
 |-- tests: long (nullable = true)
 |-- devices: long (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



                                                                                

4075861

In [16]:
# Pipeline stage 1: pack the selected columns into a single "features" vector column
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[
        "avg_d_kbps",
        "avg_u_kbps",
        "tests",
        "devices",
        "latitude",
        "longitude",
    ],
    outputCol="features",
)

In [17]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
# Import pipeline
from pyspark.ml import Pipeline

# Cluster into k=3 groups. The seed is set explicitly so the random centroid
# initialisation does not change from one kernel session to the next.
kmeans = KMeans(k=3, seed=42)

# NOTE(review): the assembled features mix very different scales (kbps values
# in the tens of thousands next to latitude/longitude in degrees), so the
# distance is presumably dominated by the speed columns. Consider a scaling
# stage before KMeans; confirm whether unscaled clustering is intended.
pipeline = Pipeline(stages=[assembler, kmeans])

# Fit the assembler + KMeans stages on the cleaned DataFrame
model = pipeline.fit(df)

# Make predictions
predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

print("Silhouette with squared euclidean distance = ", evaluator.evaluate(predictions))


22/05/13 21:01:26 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:29 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:30 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:31 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:32 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:33 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:36 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:41 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:40 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:38 WARN TaskMemoryManager: Failed to allocate a page (4194304 bytes), try again.
22/05/13 21:01:45 WARN TaskMemoryManager