<a href="https://colab.research.google.com/github/dansarmiento/machine_learning_notebooks/blob/main/SparkML_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark==3.5 -q
!pip install findspark -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

# FindSpark simplifies the process of using Apache Spark with Python

import findspark
findspark.init()

#import functions/Classes for sparkml

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

from pyspark.sql import SparkSession

In [3]:
#Create SparkSession
#Ignore any warnings by SparkSession command

spark = SparkSession.builder.appName("Clustering using SparkML").getOrCreate()

In [4]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/customers.csv

--2025-05-24 22:46:08--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/customers.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8909 (8.7K) [text/csv]
Saving to: ‘customers.csv’


2025-05-24 22:46:09 (181 MB/s) - ‘customers.csv’ saved [8909/8909]



In [5]:
# using the spark.read.csv function we load the data into a dataframe.
# the header = True mentions that there is a header row in out csv file
# the inferSchema = True, tells spark to automatically find out the data types of the columns.

# Load customers dataset
customer_data = spark.read.csv("customers.csv", header=True, inferSchema=True)

In [6]:
customer_data.printSchema()

root
 |-- Fresh_Food: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Grocery: integer (nullable = true)
 |-- Frozen_Food: integer (nullable = true)



In [7]:
# Assemble the features into a single vector column
feature_cols = ['Fresh_Food', 'Milk', 'Grocery', 'Frozen_Food']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
customer_transformed_data = assembler.transform(customer_data)

In [8]:
# train the model
number_of_clusters = 3
kmeans = KMeans(k = number_of_clusters)
model = kmeans.fit(customer_transformed_data)

In [9]:
# Make predictions on the dataset
predictions = model.transform(customer_transformed_data)

In [10]:
#stop spark session
spark.stop()

In [11]:
spark = SparkSession.builder.appName("Seed Clustering").getOrCreate()

In [12]:
#download seed dataset
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/seeds.csv

--2025-05-24 22:46:37--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/seeds.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8973 (8.8K) [text/csv]
Saving to: ‘seeds.csv’


2025-05-24 22:46:38 (112 MB/s) - ‘seeds.csv’ saved [8973/8973]



In [13]:
seed_data = spark.read.csv("seeds.csv", header=True, inferSchema=True)

In [14]:
seed_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length of kernel: double (nullable = true)
 |-- width of kernel: double (nullable = true)
 |-- asymmetry coefficient: double (nullable = true)
 |-- length of kernel groove: double (nullable = true)



In [15]:
seed_data.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------
 area                    | 15.26  
 perimeter               | 14.84  
 compactness             | 0.871  
 length of kernel        | 5.763  
 width of kernel         | 3.312  
 asymmetry coefficient   | 2.221  
 length of kernel groove | 5.22   
-RECORD 1-------------------------
 area                    | 14.88  
 perimeter               | 14.57  
 compactness             | 0.8811 
 length of kernel        | 5.554  
 width of kernel         | 3.333  
 asymmetry coefficient   | 1.018  
 length of kernel groove | 4.956  
-RECORD 2-------------------------
 area                    | 14.29  
 perimeter               | 14.09  
 compactness             | 0.905  
 length of kernel        | 5.291  
 width of kernel         | 3.337  
 asymmetry coefficient   | 2.699  
 length of kernel groove | 4.825  
-RECORD 3-------------------------
 area                    | 13.84  
 perimeter               | 13.94  
 compactness             | 0.8955 
 length of kernel   

In [16]:
feature_cols = ['area',
 'perimeter',
 'compactness',
 'length of kernel',
 'width of kernel',
 'asymmetry coefficient',
 'length of kernel groove']

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
seed_transformed_data = assembler.transform(seed_data)

In [17]:
number_of_clusters = 3
kmeans = KMeans(k = number_of_clusters)
model = kmeans.fit(seed_transformed_data)

In [18]:
predictions = model.transform(seed_transformed_data)

In [19]:
predictions.show(n=5, truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------
 area                    | 15.26                                        
 perimeter               | 14.84                                        
 compactness             | 0.871                                        
 length of kernel        | 5.763                                        
 width of kernel         | 3.312                                        
 asymmetry coefficient   | 2.221                                        
 length of kernel groove | 5.22                                         
 features                | [15.26,14.84,0.871,5.763,3.312,2.221,5.22]   
 prediction              | 0                                            
-RECORD 1---------------------------------------------------------------
 area                    | 14.88                                        
 perimeter               | 14.57                                        
 compactness             | 0.8811                  

In [20]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   61|
|         2|   77|
|         0|   72|
+----------+-----+



In [21]:
#stop spark session
spark.stop()