# Clustering with SparkML

# Setup dependencies
I will be using pandas and sklearn for managing data and machine learning.
<details>
    <summary>pip install...</summary>

```python
# Allows to install a python package
pip install package-name
# or install python package with a specific version
pip install package-name==version
```
</details>


In [1]:
# You can use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

# Importing required libraries

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

from pyspark.sql import SparkSession

# Creating Spark Session

In [3]:
spark = SparkSession.builder.appName("Clustering using SparkML").getOrCreate()

# Reading CSV Dataset

In [4]:
customer_data = spark.read.csv("data/customers.csv", header=True, inferSchema=True)

In [6]:
customer_data.printSchema()

root
 |-- Fresh_Food: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Grocery: integer (nullable = true)
 |-- Frozen_Food: integer (nullable = true)



In [7]:
customer_data.show(n=5, truncate=False)

+----------+----+-------+-----------+
|Fresh_Food|Milk|Grocery|Frozen_Food|
+----------+----+-------+-----------+
|12669     |9656|7561   |214        |
|7057      |9810|9568   |1762       |
|6353      |8808|7684   |2405       |
|13265     |1196|4221   |6404       |
|22615     |5410|7198   |3915       |
+----------+----+-------+-----------+
only showing top 5 rows



## Task 3 - Create a feature vector

In [8]:
# Assemble the features into a single vector column
feature_cols = ['Fresh_Food', 'Milk', 'Grocery', 'Frozen_Food']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
customer_transformed_data = assembler.transform(customer_data)

## Task 4 - Create a clustering model

In [9]:
NUMBER_OF_CLUSTERS = 3

In [10]:
kmeans = KMeans(k = NUMBER_OF_CLUSTERS)
model = kmeans.fit(customer_transformed_data)

## Task 5 - Print Cluster Details

In [11]:
# Make predictions on the dataset
predictions = model.transform(customer_transformed_data)

# Display the results
predictions.show(5)

predictions.groupBy('prediction').count().show()

+----------+----+-------+-----------+--------------------+----------+
|Fresh_Food|Milk|Grocery|Frozen_Food|            features|prediction|
+----------+----+-------+-----------+--------------------+----------+
|     12669|9656|   7561|        214|[12669.0,9656.0,7...|         1|
|      7057|9810|   9568|       1762|[7057.0,9810.0,95...|         1|
|      6353|8808|   7684|       2405|[6353.0,8808.0,76...|         1|
|     13265|1196|   4221|       6404|[13265.0,1196.0,4...|         1|
|     22615|5410|   7198|       3915|[22615.0,5410.0,7...|         2|
+----------+----+-------+-----------+--------------------+----------+
only showing top 5 rows

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  331|
|         2|   60|
|         0|   49|
+----------+-----+



# Stop Spark Session

In [None]:
spark.stop()