# Task

* Create clusters of users based on their location information
* Use the location coordinates of places stored in the local Postgres database
* Visualise the result of the cluster analysis
* Save the ML model for further use

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode

from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans

from pyspark.ml import Pipeline

import os
import pandas as pd
import geopandas
import matplotlib.pyplot as plt


In [None]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this notebook's application name.
spark = SparkSession.builder.appName('Cluster Analysis II').getOrCreate()

In [None]:
# The notebook is expected to run from a directory three levels below the
# project root; the users dataset lives under <project root>/data/users.
base_path = os.getcwd()

# Walk up three directories with os.path instead of splitting on a literal
# '/', so the result is correct with any path separator and stays an
# absolute path even when the working directory is close to the root.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

users_input_path = os.path.join(project_path, 'data/users')

In [None]:
# Connection settings for the Postgres database holding the places table.
# The URL and credentials can be overridden through environment variables so
# that real secrets do not have to be stored in the notebook; when a variable
# is unset the original local-development value is used.
driver = 'org.postgresql.Driver'
url = os.environ.get('POSTGRES_JDBC_URL', 'jdbc:postgresql://localhost/postgres')
table = 'public.locations'
user = os.environ.get('POSTGRES_USER', 'postgres')
password = os.environ.get('POSTGRES_PASSWORD', 'postgres')


# Read the `public.locations` table through Spark's JDBC data source.
locations = (
    spark
    .read
    .format('jdbc')
    .option('driver', driver)
    .option('url', url)
    .option('dbtable', table)
    .option('user', user)
    .option('password', password)
    .load()
)

In [None]:
# Load the users dataset from disk. No format is specified, so Spark falls
# back to its configured default data source.
usersDF = spark.read.load(users_input_path)

In [None]:
# Preview the first rows of the locations table (Spark's default of 20 rows,
# long values truncated).
locations.show(n=20, truncate=True)

In [None]:
# Attach coordinates to every user by matching the user's location to the
# place name. This is an inner join, so users whose location has no matching
# place are dropped. The result is cached because it is used both to fit the
# model and to compute predictions.
data_with_coord = (
    usersDF.alias('users')
    .join(
        locations.alias('locs'),
        on=col('users.location') == col('locs.name'),
        how='inner',
    )
    .select('user_id', 'location', 'latitude', 'longitude')
    .cache()
)

In [None]:
# Preview the joined user/coordinate rows (Spark's default of 20 rows, long
# values truncated).
data_with_coord.show(n=20, truncate=True)

In [None]:
# Columns that describe a user's position and feed the clustering.
features_array = ['latitude', 'longitude']

# Stage 1: pack the coordinate columns into a single vector column.
assembler = VectorAssembler(outputCol='features', inputCols=features_array)

# Stage 2: k-means with six clusters and a fixed seed for reproducible runs.
# NOTE(review): the coordinates are clustered as raw degrees, with no scaling
# or projection stage - confirm this is intended.
kmeans = KMeans(
    k=6,
    seed=1,
    featuresCol='features',
    predictionCol='predictions',
)

pipeline = Pipeline().setStages([assembler, kmeans])
model = pipeline.fit(data_with_coord)

In [None]:
# Run the fitted pipeline over the same data it was trained on; each row gets
# a `features` vector and a `predictions` cluster id.
predictions = model.transform(dataset=data_with_coord)

In [None]:
# Number of users assigned to each cluster.
cluster_sizes = predictions.groupBy('predictions').count()
cluster_sizes.show()

In [None]:
# Members of cluster 0, sorted by location. The filter runs before the
# projection because `predictions` is not among the selected columns.
(
    predictions
    .where(col('predictions') == 0)
    .select('user_id', 'users.location', *features_array)
    .orderBy('location')
    .show()
)

In [None]:
# Members of cluster 1. The filter runs before the projection because
# `predictions` is not among the selected columns.
(
    predictions
    .where(col('predictions') == 1)
    .select('user_id', 'users.location', *features_array)
    .show()
)

In [None]:
# Members of cluster 2. The filter runs before the projection because
# `predictions` is not among the selected columns.
(
    predictions
    .where(col('predictions') == 2)
    .select('user_id', 'users.location', *features_array)
    .show()
)

In [None]:
# Members of cluster 3. The filter runs before the projection because
# `predictions` is not among the selected columns.
(
    predictions
    .where(col('predictions') == 3)
    .select('user_id', 'users.location', *features_array)
    .show()
)

In [None]:
# Members of cluster 4. The filter runs before the projection because
# `predictions` is not among the selected columns.
(
    predictions
    .where(col('predictions') == 4)
    .select('user_id', 'users.location', *features_array)
    .show()
)

In [None]:
# Members of cluster 5. The filter runs before the projection because
# `predictions` is not among the selected columns.
(
    predictions
    .where(col('predictions') == 5)
    .select('user_id', 'users.location', *features_array)
    .show()
)

In [None]:
# Collect location, coordinates and cluster id to the driver as a pandas
# DataFrame so they can be plotted locally.
local_clusters = (
    predictions
    .select('users.location', *features_array, 'predictions')
    .toPandas()
)

In [None]:
# Turn each row into a point geometry; x is the longitude, y the latitude.
gdf = geopandas.GeoDataFrame(
    local_clusters,
    geometry=geopandas.points_from_xy(
        x=local_clusters['longitude'],
        y=local_clusters['latitude'],
    ),
)

In [None]:
# Draw every clustered location on a world map, one colour per cluster.
# NOTE(review): `geopandas.datasets` was deprecated in GeoPandas 0.14 and
# removed in 1.0 - confirm the installed version still provides it.
plt.rcParams["figure.figsize"] = (20, 10)
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
ax = world.plot(color='white', edgecolor='black')

# Cluster id -> marker colour, drawn in ascending cluster order.
cluster_colors = {
    0: 'green',
    1: 'blue',
    2: 'red',
    3: 'black',
    4: 'yellow',
    5: 'violet',
}
for cluster_id, cluster_color in cluster_colors.items():
    gdf[gdf['predictions'] == cluster_id].plot(ax=ax, color=cluster_color)

plt.show()

In [None]:
# Save each user's cluster assignment to the `user_clusters` table.
# The JDBC driver class is passed explicitly, matching the read of the
# locations table, so the write does not depend on the driver being inferred
# from the URL.
# NOTE(review): mode('append') adds rows on every run, so re-running this cell
# inserts the same users again - confirm that is intended.
(
    predictions
    .select(
        'user_id',
        col('predictions').alias('cluster_id')
    )
    .write
    .mode('append')
    .format('jdbc')
    .option('driver', driver)
    .option('url', url)
    .option('dbtable', 'user_clusters')
    .option('user', user)
    .option('password', password)
    .save()
)

In [None]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()