In [0]:
from pyspark.sql import functions as F
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import ClusteringEvaluator

# Load the `bank_gold` Delta table; the cells below all derive from `df`.
df = spark.read.format("delta").table("bank_gold")

# Quick size check (count() runs a Spark job; len(df.columns) is metadata only).
row_count = df.count()
column_count = len(df.columns)
print("Rows:", row_count)
print("Columns:", column_count)

In [0]:
# Clustering inputs, per the mentor's requirement:
# age group, balance tier, job, marital, housing, loan, prior contacts.
cluster_features_cat = [
    "job",
    "marital",
    "housing",
    "loan",
    "age_group",
    "balance_tier",
    "campaign_intensity",
]
cluster_features_num = ["was_previously_contacted", "previous", "campaign"]

# Each categorical column <name> is label-indexed into <name>_idx.
indexed_cols = [f"{name}_idx" for name in cluster_features_cat]

# handleInvalid="keep" sends null/unseen labels to an extra index instead of
# raising, so no row is rejected at the indexing stage.
indexers = [
    StringIndexer(inputCol=src, outputCol=dst, handleInvalid="keep")
    for src, dst in zip(cluster_features_cat, indexed_cols)
]

# handleInvalid="skip" drops rows that still hold null/NaN in any input
# column, so df_cluster can have fewer rows than df.
# NOTE(review): the label indices enter the vector as plain numbers and nothing
# is scaled, so KMeans distances treat index order/magnitude as meaningful.
# One-hot encoding + scaling may be more appropriate, but it would change the
# cluster assignments that later cells rely on - confirm before changing.
assembler = VectorAssembler(
    inputCols=indexed_cols + cluster_features_num,
    outputCol="cluster_features",
    handleInvalid="skip",
)

# Fit the indexers on the full table, then append the feature vector column.
pipeline = Pipeline(stages=indexers + [assembler])
cluster_model = pipeline.fit(df)
df_cluster = cluster_model.transform(df)

print("Features ready for clustering")
df_cluster.select("cluster_features").show(3, truncate=True)

In [0]:
# Elbow search: fit KMeans for k = 2..7 and record the training cost
# (sum of squared distances from each point to its nearest centroid).
# Lower cost = tighter clusters, but the cost generally keeps shrinking as k
# grows, so look for the k where the improvement flattens, not the minimum.

costs = []  # (k, cost rounded to 2 dp), in ascending k

for k in range(2, 8):
    kmeans = KMeans(
        featuresCol="cluster_features",
        predictionCol="cluster",
        k=k,
        seed=42  # fixed seed so repeated runs initialise identically
    )
    model = kmeans.fit(df_cluster)
    cost = model.summary.trainingCost
    rounded_cost = round(cost, 2)
    costs.append((k, rounded_cost))
    # The arrow is a real U+2192; the previous literal was a mis-decoded
    # UTF-8 sequence that printed as garbage.
    print(f"k={k} → cost={rounded_cost}")



In [0]:
# Final model: k=6 with a fixed seed, fitted on the assembled feature vector.
kmeans_params = {
    "featuresCol": "cluster_features",
    "predictionCol": "cluster",
    "k": 6,
    "seed": 42,
}
kmeans = KMeans(**kmeans_params)

print("Training KMeans with k=6...")
kmeans_model = kmeans.fit(df_cluster)
# transform() appends the predicted cluster id as the `cluster` column.
df_clustered = kmeans_model.transform(df_cluster)
print("Done!")

# Check cluster sizes
cluster_sizes = (
    df_clustered
    .groupBy("cluster")
    .count()
    .orderBy("cluster")
)
cluster_sizes.show()

In [0]:
# Per-cluster profile: contact volume and conversion, best-converting first.

# 1 when the outcome column y equals "yes", else 0 (null y also maps to 0).
converted_flag = F.when(F.col("y") == "yes", 1).otherwise(0)

cluster_profile = (
    df_clustered
    .groupBy("cluster")
    .agg(
        F.round(F.avg("campaign"), 2).alias("avg_calls"),
        F.round(F.avg("previous"), 2).alias("avg_previous"),
        # Mean of the flag column; reads as a share only if the flag is 0/1
        # - TODO confirm against the bank_gold schema.
        F.round(F.avg("was_previously_contacted"), 2).alias("pct_prev_contacted"),
        F.count("y").alias("total"),  # count() skips rows where y is null
        F.sum(converted_flag).alias("converted"),
    )
    .withColumn(
        "conversion_rate_%",
        F.round(F.col("converted") / F.col("total") * 100, 2),
    )
    .orderBy(F.desc("conversion_rate_%"))
)
cluster_profile.show()
    

In [0]:
# Add descriptive cluster names.
# NOTE(review): these ids belong to the specific k=6 / seed=42 fit above;
# presumably the labels were chosen from the conversion profile. Re-check the
# mapping whenever the data, features, or model settings change.
named_clusters = [
    (3, "Loyal Engaged"),
    (0, "Warm Prospects"),
    (2, "Average Customers"),
    (4, "Passive Customers"),
    (1, "Over-contacted"),
]

# Build the CASE WHEN chain in the listed order; any other id (including the
# remaining sixth cluster) falls through to the default label.
first_id, first_label = named_clusters[0]
cluster_name_expr = F.when(F.col("cluster") == first_id, first_label)
for cluster_id, label in named_clusters[1:]:
    cluster_name_expr = cluster_name_expr.when(F.col("cluster") == cluster_id, label)
cluster_name_expr = cluster_name_expr.otherwise("Wasted Spend")

df_clustered = df_clustered.withColumn("cluster_name", cluster_name_expr)

# Save cluster labels to dim_customer
dim_customer_columns = [
    "age", "age_group", "job", "marital", "education",
    "default", "balance", "balance_log", "balance_tier",
    "housing", "loan", "high_value_segment",
    "cluster", "cluster_name",
]
# NOTE(review): monotonically_increasing_id() is unique within a run but
# depends on partitioning, so customer_id values are not stable across runs.
# Confirm nothing joins on this key before relying on it.
dim_customer_updated = (
    df_clustered
    .select(*dim_customer_columns)
    .withColumn("customer_id", F.monotonically_increasing_id())
)

# Full overwrite of the table; mergeSchema lets the write add new columns.
(
    dim_customer_updated.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("bank_dim_customer")
)

print("dim_customer updated with cluster labels")

# Read the table back to verify what was actually persisted.
dim_customer_saved = spark.read.format("delta").table("bank_dim_customer")
print("Rows:", dim_customer_saved.count())

(
    dim_customer_saved
    .groupBy("cluster", "cluster_name")
    .count()
    .orderBy("cluster")
    .show()
)