In [0]:
from pyspark.sql import functions as F

# Load the silver-layer Delta table that every later cell analyses.
df = spark.read.format("delta").table("bank_silver")

# Report the size of the dataset; count() triggers a Spark job, len(columns) does not.
row_count = df.count()
column_count = len(df.columns)
print(f"Rows are {row_count}")
print(f"Columns are {column_count}")

In [0]:
# Print the schema (column names, data types, nullability) of the loaded DataFrame.
df.printSchema()


In [0]:
# Number of rows for each value of the was_previously_contacted column.
(
    df.groupBy("was_previously_contacted")
    .count()
    .show()
)

In [0]:
# Conversion rate per job, highest rate first.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("job")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.col("conversion_rate_%").desc())
    .show()
)
  

In [0]:
# Conversion rate per age bracket, highest rate first.
#
# Every bracket has an explicit condition and there is deliberately no
# .otherwise(): a null age matches no branch and therefore yields a null
# age_group, instead of being silently counted as "Senior (55+)" by a
# catch-all branch.
age_group = (
    F.when(F.col("age") <= 35, "Young (18-35)")
    .when(F.col("age") <= 55, "Mid (36-55)")
    .when(F.col("age") > 55, "Senior (55+)")
)

# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.withColumn("age_group", age_group)
    .groupBy("age_group")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.desc("conversion_rate_%"))
    .show()
)

In [0]:
# Conversion rate per marital status, highest rate first.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("marital")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.col("conversion_rate_%").desc())
    .show()
)

In [0]:
# Conversion rate per education level, highest rate first.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("education")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.col("conversion_rate_%").desc())
    .show()
)

In [0]:
# Conversion rate per account-balance tier, highest rate first.
#
# Every tier has an explicit condition and there is deliberately no
# .otherwise(): a null balance matches no branch and therefore yields a null
# balance_tier, instead of being silently counted as "High" by a catch-all
# branch.
balance_tier = (
    F.when(F.col("balance") < 0, "Negative")
    .when(F.col("balance") == 0, "Zero")
    .when(F.col("balance") <= 5000, "Medium")
    .when(F.col("balance") > 5000, "High")
)

# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.withColumn("balance_tier", balance_tier)
    .groupBy("balance_tier")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.desc("conversion_rate_%"))
    .show()
)

In [0]:
# Conversion rate per contact month, highest rate first.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("month")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.col("conversion_rate_%").desc())
    .show()
)
  

In [0]:
# Conversion rate per contact channel, highest rate first.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("contact")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.col("conversion_rate_%").desc())
    .show()
)

In [0]:
# Conversion rate per value of the campaign column, listed in ascending
# campaign order (not by rate) so the trend across values is readable.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("campaign")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .sort("campaign")
    .show()
)

In [0]:
# How many calls are being wasted beyond 3 attempts?
#
# NOTE: each matching row is counted once, so the figures below are the number
# of rows whose campaign value exceeds the limit, not a sum of the individual
# extra calls.
CALL_LIMIT = 3


def _pct(part, whole):
    """Return part / whole as a percentage rounded to 2 decimals, or 0.0 when whole is 0."""
    # Guard the division so an empty table, or a table with no rows above the
    # limit, prints 0.0% instead of raising ZeroDivisionError.
    return round(part / whole * 100, 2) if whole else 0.0


wasted = df.filter(F.col("campaign") > CALL_LIMIT)
total_rows = df.count()
total_wasted = wasted.count()
conversions_from_wasted = wasted.filter(F.col("y") == "yes").count()
pct_wasted = _pct(total_wasted, total_rows)

print(f"Calls beyond 3 attempts : {total_wasted} ({pct_wasted}%)")
print(f"Conversions from those  : {conversions_from_wasted}")
print(f"Conversion rate         : {_pct(conversions_from_wasted, total_wasted)}%")


In [0]:
# Conversion rate per value of the poutcome column, highest rate first.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("poutcome")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.col("conversion_rate_%").desc())
    .show()
)

In [0]:
# Conversion rate per value of the was_previously_contacted flag, highest rate first.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("was_previously_contacted")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.col("conversion_rate_%").desc())
    .show()
)

In [0]:
# Conversion rate per value of the previous column, listed in ascending
# order of previous; only the first 15 rows are displayed.
# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.groupBy("previous")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .sort("previous")
    .show(15)
)
  

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Encode the target as 0/1 so it can take part in the correlation matrix.
df_corr = df.withColumn("y_numeric", F.when(F.col("y") == "yes", 1).otherwise(0))

# Numeric inputs, with the encoded target deliberately placed last.
num_cols = ["age", "balance", "duration", "campaign", "pdays", "previous", "y_numeric"]

# Correlation.corr works on a single vector column, so pack the numeric columns into one.
assembler = VectorAssembler(inputCols=num_cols, outputCol="features")
df_vector = assembler.transform(df_corr).select("features")

# The result is a one-row DataFrame whose only cell holds the correlation matrix.
corr_matrix = Correlation.corr(df_vector, "features").first()[0]

# Report each feature's correlation with y (the last row/column of the matrix).
target_idx = len(num_cols) - 1
print("Correlation with target variable (y):")
print("-" * 40)
for feature_idx, feature_name in enumerate(num_cols[:target_idx]):
    corr_value = round(corr_matrix[feature_idx, target_idx], 4)
    print(f"  {feature_name:12} : {corr_value}")

In [0]:
# Conversion rate per job among high-balance customers only, highest rate first.
#
# "High" is the tier above 5000 in the balance tiering used in this notebook,
# so filter on the balance directly rather than deriving a tier column just to
# filter on it. A direct comparison also drops rows with a null balance, which
# a catch-all .otherwise("High") branch would have let through.
HIGH_BALANCE_THRESHOLD = 5000

# total_customers counts rows with a non-null y; converted counts rows where y == "yes".
customers_per_group = F.count("y").alias("total_customers")
converted_per_group = F.sum(F.when(F.col("y") == "yes", 1).otherwise(0)).alias("converted")
rate_pct = F.round(F.col("converted") / F.col("total_customers") * 100, 2)

(
    df.filter(F.col("balance") > HIGH_BALANCE_THRESHOLD)
    .groupBy("job")
    .agg(customers_per_group, converted_per_group)
    .withColumn("conversion_rate_%", rate_pct)
    .orderBy(F.desc("conversion_rate_%"))
    .show()
)

In [0]:

# Headline findings as (finding, insight) pairs.
# NOTE(review): the figures are hard-coded literals, not computed in this cell —
# confirm they still match the outputs above whenever the data changes.
findings = [
    ("Who to target", "Students 28.7%, Retired 22.8% conversion"),
    ("Best months", "Mar 52%, Sep 46%, Oct 44%, Dec 47%"),
    ("Worst month", "May 6.7% gets most calls — biggest waste"),
    ("Channel", "Cellular 14.9%, drop unknown contacts 4%"),
    ("Call limit", "Stop after 3 calls — saves ~252K annually"),
    ("Best leads", "Previous success 64.7% conversion rate"),
    ("Prior contact", "Previously contacted 23% vs 9% never contacted"),
    ("Correlation", "Duration strongest predictor — leakage risk"),
]

findings_df = spark.createDataFrame(findings, schema=["finding", "insight"])

# Replace the contents of the bank_eda_findings table with this run's findings.
(
    findings_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bank_eda_findings")
)
print("EDA findings saved")

In [0]:
# Add the derived age_group and balance_tier columns and write the result back
# over the bank_silver table (the same table df was loaded from).
#
# Every bucket has an explicit condition and there is deliberately no
# .otherwise(): a null age or balance matches no branch and is stored as null,
# instead of being persisted as "Senior" / "High" by a catch-all branch.
age_group = (
    F.when(F.col("age") <= 35, "Young")
    .when(F.col("age") <= 55, "Mid")
    .when(F.col("age") > 55, "Senior")
)
balance_tier = (
    F.when(F.col("balance") < 0, "Negative")
    .when(F.col("balance") == 0, "Zero")
    .when(F.col("balance") <= 5000, "Medium")
    .when(F.col("balance") > 5000, "High")
)

(
    df.withColumn("age_group", age_group)
    .withColumn("balance_tier", balance_tier)
    .write.format("delta")
    .mode("overwrite")
    # mergeSchema lets the write add the two new columns to the existing table schema.
    .option("mergeSchema", "true")
    .saveAsTable("bank_silver")
)

print("Silver table updated with EDA columns")