In [None]:
# Install shapely to deal with geospatial data
!pip install shapely

In [None]:
!kaggle datasets download -d dhruvildave/ookla-internet-speed-dataset -f "2020-q2/2020-04-01_performance_mobile_tiles.parquet"
!unzip 2020-04-01_performance_mobile_tiles.parquet.zip
!rm 2020-04-01_performance_mobile_tiles.parquet.zip

In [None]:
# Initialize PySpark: reuse the active session if one exists, otherwise start one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("internet-analysis-clustering")
    .getOrCreate()
)

In [None]:
# Needed to make Jupyter work with Gitpod: make 'iframe_connected' the
# default Plotly renderer for figures shown in this notebook.
import plotly.io as pio
pio.renderers.default = 'iframe_connected'

In [None]:
# Load the extracted parquet file into a Spark DataFrame.
parquet_path = "2020-04-01_performance_mobile_tiles.parquet"
df = spark.read.parquet(parquet_path)

# Inspect the column names and types, then preview the first 5 rows.
df.printSchema()
df.show(5)

In [None]:
from shapely import wkt
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf


@udf(returnType=DoubleType())
def longitude(polygon: str):
    """Return the x coordinate (longitude) of a WKT geometry's centroid.

    Args:
        polygon: Geometry encoded as a WKT string; may be null in Spark.

    Returns:
        The centroid's x coordinate as a float, or None when the input is
        null so Spark produces a null value instead of failing the task.
    """
    # Spark passes SQL NULL as None; wkt.loads(None) would raise.
    if polygon is None:
        return None
    return float(wkt.loads(polygon).centroid.x)

@udf(returnType=DoubleType())
def latitude(polygon: str):
    """Return the y coordinate (latitude) of a WKT geometry's centroid.

    Args:
        polygon: Geometry encoded as a WKT string; may be null in Spark.

    Returns:
        The centroid's y coordinate as a float, or None when the input is
        null so Spark produces a null value instead of failing the task.
    """
    # Spark passes SQL NULL as None; wkt.loads(None) would raise.
    if polygon is None:
        return None
    return float(wkt.loads(polygon).centroid.y)

In [None]:
# Derive each tile's centroid coordinates as two new columns:
# "longitude" and "latitude".
tile_column = df["tile"]
df = (
    df.withColumn("longitude", longitude(tile_column))
      .withColumn("latitude", latitude(tile_column))
)

# Preview the result and confirm the new columns appear in the schema.
df.show(5)
df.printSchema()

In [None]:
# The raw geometry ("tile") and "quadkey" columns are no longer needed
# now that the centroid coordinates have been extracted.
df = df.drop("tile", "quadkey")
df.printSchema()


# Discard every row that contains a null value, then report how many remain.
df = df.dropna()
df.count()

# Preparing the Data

As usual, we generate the feature vector using `VectorAssembler`.

Then we apply min-max normalization using `MinMaxScaler`.

In [None]:
# Pipeline stage: pack the numeric columns into a single "features" vector.
from pyspark.ml.feature import VectorAssembler

feature_columns = [
    "avg_d_kbps",
    "avg_u_kbps",
    "tests",
    "devices",
    "latitude",
    "longitude",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import MinMaxScaler
# Import pipeline
from pyspark.ml import Pipeline

# Rescale every feature to [0, 1] before clustering. Without this, columns
# with large magnitudes (e.g. the kbps speeds) dominate the Euclidean
# distance and latitude/longitude barely influence the clusters.
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")

# Fixed seed so the cluster assignment is reproducible across runs.
kmeans = KMeans(k=3, seed=42, featuresCol="scaled_features")

pipeline = Pipeline(stages=[assembler, scaler, kmeans])

model = pipeline.fit(df)

# Make predictions
predictions = model.transform(df)

# Evaluate clustering by computing Silhouette score on the same scaled
# features the model was trained on.
evaluator = ClusteringEvaluator(featuresCol="scaled_features")

print("Silhouette with squared euclidean distance = ", evaluator.evaluate(predictions))
