# Task

* create cluster of users based on the information about their location
* use location coordinates of places from local postgres database
* visualise the result of cluster analysis
* save the ml model for further use

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode

from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans

from pyspark.ml import Pipeline, PipelineModel

import os
import pandas as pd
import geopandas
import matplotlib.pyplot as plt


In [None]:
# Obtain the Spark entry point for this notebook; getOrCreate() hands back an
# already running session if there is one instead of starting another.
session_builder = SparkSession.builder.appName('Cluster Analysis I')
spark = session_builder.getOrCreate()

In [None]:
# Directory the notebook is executed from.
base_path = os.getcwd()

# The project root is three directory levels above the working directory.
# Walking up with os.path (instead of splitting on a hard-coded '/') keeps
# this working on platforms whose path separator is not '/'.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Directory the users dataset is read from.
users_input_path = os.path.join(project_path, 'data', 'users')

# Directory the trained pipeline model is written to and loaded back from.
model_output_path = os.path.join(project_path, 'output', 'models', 'clustering')

#### Get location coordinates

We have a table in postgresql database that contains this information. Connect to the database, read the table and create a DataFrame from it. In the next step we will join this information on users.

Hint:
* see how to connect to jdbc [here](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.jdbc)
* you will use format `jdbc`
* you will need to provide driver, url, table, user and password in `option` (the values are provided below)

In [None]:
# Connection settings for the local PostgreSQL database that stores the
# coordinates of the locations.
driver = 'org.postgresql.Driver'
url = 'jdbc:postgresql://localhost/postgres'
table = 'public.locations'
user = 'postgres'
password = 'postgres'

# Read the table through the JDBC data source; every connection setting is
# handed to the reader in one options() call.
locations = (
    spark.read
    .format('jdbc')
    .options(
        driver=driver,
        url=url,
        dbtable=table,
        user=user,
        password=password,
    )
    .load()
)

#### Create DataFrame from users:

In [None]:
# Load the users dataset. No format is specified, so the session's default
# data source is used.
usersDF = spark.read.load(users_input_path)

In [None]:
# Print the first rows of the locations table to check what was read from the database.
locations.show()

#### Join users with locations

* Create a DataFrame with following columns: `user_id`, `location`, `latitude`, `longitude`
* cache the DataFrame, since we will use it in more queries

In [None]:
# Attach coordinates to the users: a user row is matched with the location
# row whose name equals the user's location. Only the columns needed for the
# clustering are kept, and the result is cached because later cells reuse it.
users_aliased = usersDF.alias('users')
locations_aliased = locations.alias('locs')
same_place = col('users.location') == col('locs.name')

data_with_coord = (
    users_aliased
    .join(locations_aliased, on=same_place)
    .select('user_id', 'location', 'latitude', 'longitude')
    .cache()
)

In [None]:
# Print the first rows of the joined data (user_id, location, latitude, longitude).
data_with_coord.show()

#### Create the model

* define the array of features (here we have only two features: latitude and longitude)
* use [VectorAssembler](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) to create a vector from the features
* use [KMeans](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.clustering.KMeans) as the learning algorithm
* define the [Pipeline](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.Pipeline) (here it will have only two stages: assembler and kmeans)
* train the model

In [None]:
# The clustering uses the geographic position only.
features_array = ['latitude', 'longitude']

# Stage 1: pack the feature columns into a single vector column.
assembler = VectorAssembler(outputCol='features', inputCols=features_array)

# Stage 2: k-means with six clusters and a fixed seed; the cluster id is
# written to the 'predictions' column.
kmeans = KMeans(
    k=6,
    seed=1,
    featuresCol='features',
    predictionCol='predictions',
)

# Chain both stages and fit them on the users-with-coordinates data.
pipeline = Pipeline(stages=[assembler, kmeans])
model = pipeline.fit(data_with_coord)

#### Apply the model on the data

Create a new DataFrame called `predictions`

Hint:
* the model is a transformer, so you can call transform on it and pass the data
* this will add a new column `predictions` which contains the id of the cluster to which the record belongs

In [None]:
# Run the fitted pipeline on the same data it was trained on; the k-means
# stage writes the assigned cluster id to the 'predictions' column.
predictions = model.transform(data_with_coord)

#### See how big the clusters are

Hint:
* group by `predictions` and count

In [None]:
# Number of users assigned to each cluster.
cluster_sizes = predictions.groupBy('predictions').count()
cluster_sizes.show()

#### See specific cluster

Hint:
* filter for specific cluster, for example `col('predictions') == 0`

In [None]:
# Users assigned to cluster 0, sorted by location. The rows are filtered
# first, while the 'predictions' column is still part of the DataFrame, and
# projected afterwards.
cluster_0 = predictions.where(col('predictions') == 0)
(
    cluster_0
    .select('user_id', 'users.location', *features_array)
    .orderBy('location')
).show()

In [None]:
# Users assigned to cluster 1: filter on the cluster id first, then keep the
# user, location and coordinate columns.
cluster_1 = predictions.where(col('predictions') == 1)
cluster_1.select('user_id', 'users.location', *features_array).show()

In [None]:
# Users assigned to cluster 2: filter on the cluster id first, then keep the
# user, location and coordinate columns.
cluster_2 = predictions.where(col('predictions') == 2)
cluster_2.select('user_id', 'users.location', *features_array).show()

In [None]:
# Users assigned to cluster 3: filter on the cluster id first, then keep the
# user, location and coordinate columns.
cluster_3 = predictions.where(col('predictions') == 3)
cluster_3.select('user_id', 'users.location', *features_array).show()

In [None]:
# Users assigned to cluster 4: filter on the cluster id first, then keep the
# user, location and coordinate columns.
cluster_4 = predictions.where(col('predictions') == 4)
cluster_4.select('user_id', 'users.location', *features_array).show()

In [None]:
# Users assigned to cluster 5: filter on the cluster id first, then keep the
# user, location and coordinate columns.
cluster_5 = predictions.where(col('predictions') == 5)
cluster_5.select('user_id', 'users.location', *features_array).show()

#### Visualise the cluster on the world map

Hint:
* convert the data with predictions to a pandas dataframe
* use geopandas library for the plotting

In [None]:
# Collect the location, the coordinates and the cluster id of every row into
# a pandas DataFrame on the driver so it can be plotted locally.
local_clusters = (
    predictions
    .select('users.location', *features_array, 'predictions')
    .toPandas()
)

In [None]:
# Build one point geometry per row (x = longitude, y = latitude) and attach
# it to the collected data so the rows can be drawn on a map.
user_points = geopandas.points_from_xy(
    x=local_clusters['longitude'],
    y=local_clusters['latitude'],
)
gdf = geopandas.GeoDataFrame(local_clusters, geometry=user_points)

In [None]:
# Colour used for each cluster id; the clusters are drawn in this order.
cluster_colors = {
    0: 'green',
    1: 'blue',
    2: 'red',
    3: 'black',
    4: 'yellow',
    5: 'violet',
}

plt.rcParams["figure.figsize"] = (20,10)

# World outline used as the background layer of the plot.
# NOTE(review): geopandas.datasets is deprecated in recent geopandas
# releases - confirm the installed version still provides it.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
ax = world.plot(color='white', edgecolor='black')

# Draw the points of every cluster on top of the world outline.
for cluster_id, cluster_color in cluster_colors.items():
    gdf[gdf['predictions'] == cluster_id].plot(ax=ax, color=cluster_color)

plt.show()

#### Save the result

Save the DataFrame with predictions to a table in the postgresql database

Hint:
* use the `format` jdbc
* use append `mode`
* provide url, table_name, user, password in `option`
    * the url, user and password are the same as we used for reading

In [None]:
# Save the cluster assignment of every user to the database.

table_name = 'user_clusters'

(
    predictions
    .select(
        'user_id',
        col('predictions').alias('cluster_id')
    )
    .write
    # Rows are appended to the table, so running this cell again adds the
    # same assignments once more.
    .mode('append')
    .format('jdbc')
    # Name the JDBC driver explicitly, the same way the read from this
    # database does, instead of relying on it being inferred from the url.
    .option('driver', driver)
    .option('url', url)
    .option('dbtable', table_name)
    .option('user', user)
    .option('password', password)
    .save()
)

#### Save the model on disk

Persisting the model allows you to load it in your production application and use it on new data

Hint:
* use [write](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.clustering.KMeans.write) to create [MLWriter](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.util.MLWriter)
* then use [save](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.clustering.KMeans.save)

In [None]:
# Persist the fitted pipeline model; a model already stored at the target
# path is replaced.
model_writer = model.write().overwrite()
model_writer.save(model_output_path)

#### Load the model again

Load the model from the path to test that it works.

* use API of [PipelineModel](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.PipelineModel)
    * [read](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.PipelineModel.read)
* apply the loaded model on our data and group by `predictions` to see that it gives the same result as the original model

In [None]:
# Read the persisted pipeline model back from disk; PipelineModel.load(path)
# is the shortcut for PipelineModel.read().load(path).
loaded_model = PipelineModel.load(model_output_path)

In [None]:
# Apply the reloaded model to the same data and print the cluster sizes so
# they can be compared with the counts produced by the original model.
reloaded_predictions = loaded_model.transform(data_with_coord)
(
    reloaded_predictions
    .groupBy('predictions')
    .count()
    .show()
)

In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()