In [0]:
# Read the semicolon-delimited bank CSV with a header row, letting Spark infer column types.
source_path = "/Volumes/workspace/default/bankingdata/bank-full.csv"
df = spark.read.csv(
    source_path,
    header=True,
    inferSchema=True,
    sep=";",
)

# Orientation: schema first, then the dimensions of the frame.
df.printSchema()
row_total = df.count()
print(f"no of rows is {row_total}")
column_total = len(df.columns)
print(f"No of columns is {column_total}")

# Rendered preview followed by the plain-text preview.
display(df)
df.show()

In [0]:
# Collect the whole Spark DataFrame onto the driver as a pandas DataFrame.
# NOTE(review): pandas_df is not referenced by any other cell visible in this notebook — confirm it is still needed.
pandas_df = df.toPandas()

In [0]:

# Print describe() summary statistics for the DataFrame as plain text.
df.describe().show()

In [0]:
# Same describe() summary, converted to pandas so it is returned as the cell's output value.
df.describe().toPandas()

In [0]:
# Plain-text preview of the DataFrame's leading rows.
df.show()


In [0]:
# Column names paired with their Spark type strings.
df.dtypes

In [0]:
# Row count for each distinct value of the y column.
df.groupBy("y").count().show()

In [0]:
from pyspark.sql import functions as F

# Build one aggregate per column. when() yields a value only for NULL cells,
# and count() tallies non-null results, so each aggregate is that column's NULL count.
null_tallies = []
for name in df.columns:
    is_missing = F.col(name).isNull()
    null_tallies.append(F.count(F.when(is_missing, name)).alias(name))

df.select(null_tallies).show()

In [0]:
# Register (or replace) df as the temp view "bank_data" so it can be queried through spark.sql.
df.createOrReplaceTempView("bank_data")

In [0]:
# Count the rows of bank_data in which at least one of the listed
# categorical columns holds the literal value 'unknown'.
rows_with_unknown_sql = """
    SELECT count(*)
    FROM bank_data
    WHERE job = 'unknown'
       OR marital = 'unknown'
       OR education = 'unknown'
       OR default = 'unknown'
       OR housing = 'unknown'
       OR loan = 'unknown'
       OR contact = 'unknown'
       OR poutcome = 'unknown'
"""
rows_with_unknown = spark.sql(rows_with_unknown_sql)
rows_with_unknown.show()

In [0]:
# Columns inspected for the 'unknown' placeholder value.
unknown_cols = ["job", "education", "contact", "poutcome"]

# Count the rows once. Each df.count() is its own Spark action, so calling it
# inside the loop repeated the same full scan for every column.
total_rows = df.count()

for column_name in unknown_cols:
    unknown_rows = df.filter(F.col(column_name) == "unknown").count()
    # Guard the division so an empty DataFrame reports 0% instead of raising.
    share = round(unknown_rows / total_rows * 100, 2) if total_rows else 0.0
    print(f"{column_name}: {unknown_rows} unknowns ({share}%)")

In [0]:
# For each column, print its most frequent value, ignoring 'unknown' entries.
for column_name in ["job", "education"]:
    known_rows = df.filter(F.col(column_name) != "unknown")
    frequencies = known_rows.groupBy(column_name).count()
    # The first row after sorting by descending count holds the mode in position 0.
    top_row = frequencies.orderBy(F.desc("count")).first()
    most_common = top_row[0]
    print(f"{column_name} mode: {most_common}")

In [0]:
# Replacement value for 'unknown' in each imputed column.
unknown_fill = {"job": "blue-collar", "education": "secondary"}

for column_name, fill_value in unknown_fill.items():
    current = F.col(column_name)
    # Only 'unknown' cells are rewritten; every other value passes through as-is.
    imputed = F.when(current == "unknown", fill_value).otherwise(current)
    df = df.withColumn(column_name, imputed)

# Verify it worked
for column_name in ["job", "education"]:
    remaining = df.filter(F.col(column_name) == "unknown").count()
    print(f"{column_name} unknowns remaining: {remaining}")

In [0]:
# describe() summary restricted to five numeric columns, converted to pandas for the cell output.
df.select("balance", "campaign", "pdays", "duration", "previous").describe().toPandas()


In [0]:
# Flag rows whose pdays differs from -1 (1 = flagged, 0 = pdays is -1).
# The flag must be derived while pdays still contains -1. A later cell rewrites
# -1 to 0, so re-running this cell afterwards would mark every row as 1.
# Skipping the derivation when the column already exists keeps the cell safe to re-run.
if "was_previously_contacted" not in df.columns:
    df = df.withColumn(
        "was_previously_contacted",
        F.when(F.col("pdays") != -1, 1).otherwise(0),
    )
df.groupBy("was_previously_contacted").count().show()

In [0]:
# Swap the -1 placeholder in pdays for 0; all other values are kept unchanged.
pdays_value = F.col("pdays")
pdays_without_sentinel = F.when(pdays_value == -1, 0).otherwise(pdays_value)
df = df.withColumn("pdays", pdays_without_sentinel)

# Show the smallest pdays value after the replacement.
df.agg(F.min("pdays")).show()

In [0]:
# Row count per campaign value, ordered by campaign, printing up to 100 rows.
campaign_frequency = df.groupBy("campaign").count().orderBy("campaign")
campaign_frequency.show(100)

# Min / quartiles / max for balance, then for campaign.
five_number_stats = ("min", "25%", "50%", "75%", "max")
for numeric_column in ("balance", "campaign"):
    df.select(numeric_column).summary(*five_number_stats).show()


In [0]:
from pyspark.sql import functions as F

# Find minimum balance
min_balance = df.agg(F.min("balance")).collect()[0][0]

# Shift so the smallest balance becomes 0, then log-transform.
# log1p(x) computes log(1 + x), so it already supplies the "+ 1" that keeps the
# argument positive. The previous extra "+ 1" applied the offset twice, giving
# log(balance - min + 2) and mapping the minimum to log(2) instead of 0.
df = df.withColumn("balance_log", F.log1p(F.col("balance") - min_balance))

# Compare the raw and transformed distributions side by side.
df.select("balance", "balance_log").describe().show()

In [0]:
# Data quality report: prints row count, duplicates, nulls, 'unknown' placeholders,
# percentile-based outliers, a schema check and the distribution of y.
print("=" * 50)
print("DATA QUALITY REPORT")
print("=" * 50)

# 1. ROW COUNT
total = df.count()
print(f"\n1. TOTAL ROWS: {total}")

# 2. DUPLICATES
# Reuse the count from step 1 instead of triggering another full count action.
duplicates = total - df.dropDuplicates().count()
print(f"\n2. DUPLICATE ROWS: {duplicates}")

# 3. NULLS PER COLUMN
# A single aggregation over all columns replaces one filter+count job per column.
print("\n3. NULL VALUES:")
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in df.columns
]).first()
for column_name in df.columns:
    nulls = null_counts[column_name]
    if nulls > 0:
        print(f"   {column_name}: {nulls} nulls")
    else:
        print(f"   {column_name}:  clean")

# 4. UNKNOWNS PER COLUMN
# Same single-pass approach: count the cells equal to 'unknown' for each column.
print("\n4. UNKNOWN VALUES:")
unknown_check_cols = ["job", "education", "contact", "poutcome"]
unknown_counts = df.select([
    F.count(F.when(F.col(c) == "unknown", 1)).alias(c) for c in unknown_check_cols
]).first()
for column_name in unknown_check_cols:
    unknowns = unknown_counts[column_name]
    pct = round(unknowns / total * 100, 2)
    print(f"   {column_name}: {unknowns} ({pct}%)")

# 5. OUTLIER CHECK
# approxQuantile accepts a list of columns and returns one [p1, p99] pair per
# column, so all bounds come from one call; the out-of-range counts are then
# gathered in a single aggregation.
print("\n5. OUTLIER CHECK:")
outlier_cols = ["balance", "campaign", "duration", "age"]
quantile_bounds = df.approxQuantile(outlier_cols, [0.01, 0.99], 0.001)
outlier_counts = df.select([
    F.count(F.when((F.col(c) < p1) | (F.col(c) > p99), 1)).alias(c)
    for c, (p1, p99) in zip(outlier_cols, quantile_bounds)
]).first()
for column_name in outlier_cols:
    outliers = outlier_counts[column_name]
    pct = round(outliers / total * 100, 2)
    print(f"   {column_name}: {outliers} outliers ({pct}%) outside 1st-99th percentile")

# 6. SCHEMA VALIDATION
print("\n6. SCHEMA VALIDATION:")
expected = {
    "age": "int", "balance": "int", "day": "int",
    "duration": "int", "campaign": "int", "pdays": "int",
    "previous": "int", "job": "string", "marital": "string",
    "education": "string", "contact": "string", "poutcome": "string",
    "y": "string"
}
# Build the name -> type lookup once rather than on every iteration.
actual_types = dict(df.dtypes)
for column_name, expected_type in expected.items():
    # A column absent from df is reported as a mismatch instead of raising KeyError.
    actual_type = actual_types.get(column_name, "missing")
    status = "Correct" if actual_type == expected_type else "MISMATCH"
    print(f"   {column_name}: expected {expected_type} → got {actual_type} {status}")

# 7. CLASS BALANCE
print("\n7. TARGET VARIABLE BALANCE:")
df.groupBy("y").count().withColumn(
    "pct", F.round(F.col("count") / total * 100, 2)
).orderBy("y").show()

print("=" * 50)
print("QUALITY CHECK COMPLETE")
print("=" * 50)

In [0]:
# Save df as the table "bank_silver" in Delta format, using overwrite mode.
# NOTE(review): if an existing bank_silver table has a different schema (e.g. it was written before
# was_previously_contacted / balance_log were added), this overwrite may be rejected unless schema
# overwrite is enabled — confirm against the Delta Lake write options.
df.write.format("delta").mode("overwrite").saveAsTable("bank_silver")