In [0]:
# Load the bronze-layer inputs used by every cleaning step below.

bronze_schema = "workspace.stock_project"
silver_schema = "workspace.stock_project"

# Fully qualified bronze table names, listed in the order they are unpacked.
bronze_table_names = (
    f"{bronze_schema}.bronze_stock_prices",
    f"{bronze_schema}.bronze_portfolio_transactions",
    f"{bronze_schema}.bronze_company_sector",
    f"{bronze_schema}.bronze_benchmark_index",
)

df_stock, df_port, df_company, df_bench = (
    spark.table(table_name) for table_name in bronze_table_names
)


**stock prices cleaning**

In [0]:
from pyspark.sql.functions import when, col

# Standardize missing-value markers and enforce column types.

# Target Spark type for each column; applied in this order.
stock_column_types = {
    "open": "double",
    "high": "double",
    "low": "double",
    "close": "double",
    "volume": "long",
    "date": "date",
}

# Turn the literal string "NaN" into a real NULL before casting.
stock_typed = df_stock.replace("NaN", None)
for column_name, spark_type in stock_column_types.items():
    stock_typed = stock_typed.withColumn(column_name, col(column_name).cast(spark_type))

df_stock1 = stock_typed


In [0]:
# Validation rules: each entry becomes a boolean flag column on the stock data.
stock_rules = {
    "rule_null_price": col("open").isNull() | col("close").isNull(),
    "rule_null_volume": col("volume").isNull(),
    "rule_null_high_low": col("high").isNull() | col("low").isNull(),
    "rule_negative_volume": col("volume") < 0,
    "rule_zero_close": col("close") == 0,
    # The close must lie inside the [low, high] range of the same row.
    "rule_close_outside": (col("close") < col("low")) | (col("close") > col("high")),
}

stock_flagged = df_stock1
for rule_name, rule_expression in stock_rules.items():
    stock_flagged = stock_flagged.withColumn(rule_name, rule_expression)

df_stock2 = stock_flagged


In [0]:
# Quarantine every row that trips at least one validation rule.
invalid_stock_predicate = (
    "rule_null_price OR rule_null_volume OR rule_negative_volume OR rule_zero_close OR rule_close_outside OR rule_null_high_low"
)
df_stock_invalid = df_stock2.where(invalid_stock_predicate)

# Persist the rejected rows (rule flag columns included) for later inspection.
(
    df_stock_invalid.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{silver_schema}.silver_invalid_stock_prices")
)


In [0]:
# Clean stock table: keep only rows that pass every validation rule.

stock_rule_columns = [
    "rule_null_price",
    "rule_null_volume",
    "rule_negative_volume",
    "rule_zero_close",
    "rule_close_outside",
    "rule_null_high_low",
]

# OR all rule flags together; a row is rejected if any single flag is set.
any_rule_failed = col(stock_rule_columns[0])
for rule_column in stock_rule_columns[1:]:
    any_rule_failed = any_rule_failed | col(rule_column)

# Safeguard that swaps high/low when they are inverted.
# NOTE(review): rows surviving the filter already satisfy low <= close <= high
# (rule_close_outside is false), so this swap looks like it can never trigger.
high_below_low = col("high") < col("low")

df_stock_clean = (
    df_stock2
    .filter(~any_rule_failed)
    .withColumn("fixed_high", when(high_below_low, col("low")).otherwise(col("high")))
    .withColumn("fixed_low", when(high_below_low, col("high")).otherwise(col("low")))
    .drop("high", "low")
    .withColumnRenamed("fixed_high", "high")
    .withColumnRenamed("fixed_low", "low")
    .dropDuplicates()
    # The rule flags are only needed for filtering; drop them from the output.
    .drop(*stock_rule_columns)
)

# Persist the clean prices, partitioned by ticker.
(
    df_stock_clean.write
    .mode("overwrite")
    .partitionBy("Ticker")
    .format("delta")
    .saveAsTable(f"{silver_schema}.silver_stock_prices")
)


**company sector cleaning**

In [0]:
# Fix known sector misspellings and give missing sectors a default label.
df_company1 = (
    df_company
    # Restrict the correction to `sector`: without `subset`, DataFrame.replace
    # rewrites the matching value in every string column of the table.
    .replace({"Technlogy": "Technology"}, subset=["sector"])
    # Companies with no sector are grouped under "Other" instead of staying NULL.
    .withColumn("sector", when(col("sector").isNull(), "Other").otherwise(col("sector")))
)


In [0]:
# Split companies by whether their ticker appears in the clean stock prices.
valid_tickers = df_stock_clean.select("ticker").distinct()

# left_anti keeps companies with no matching ticker; inner keeps the matches.
df_company_invalid = df_company1.join(valid_tickers, on="ticker", how="left_anti")
df_company_valid = df_company1.join(valid_tickers, on="ticker", how="inner")


In [0]:
# Persist the rejected and accepted company rows to their silver tables.
company_outputs = (
    (df_company_invalid, f"{silver_schema}.silver_invalid_company_sector"),
    (df_company_valid, f"{silver_schema}.silver_company_sector"),
)
for company_frame, target_table in company_outputs:
    company_frame.write.format("delta").mode("overwrite").saveAsTable(target_table)


**portfolio transactions cleaning**

In [0]:
# Standardize missing-value markers and enforce column types.

# Target Spark type for each portfolio column; applied in this order.
portfolio_column_types = {
    "price": "double",
    "quantity": "int",
    "date": "date",
}

# Turn the literal string "NaN" into a real NULL before casting.
portfolio_typed = df_port.replace("NaN", None)
for column_name, spark_type in portfolio_column_types.items():
    portfolio_typed = portfolio_typed.withColumn(column_name, col(column_name).cast(spark_type))

df_port1 = portfolio_typed


In [0]:
from pyspark.sql.functions import when, col

# Correct the known "BUYY" typo in the action column; other values pass through.
is_misspelled_buy = col("action") == "BUYY"
corrected_action = when(is_misspelled_buy, "BUY").otherwise(col("action"))

df_port2 = df_port1.withColumn("action", corrected_action)


In [0]:
#identify invalid rows
from pyspark.sql.functions import current_date

# A transaction is invalid when a required field is missing or out of range.
#
# Every column compared below also gets an explicit isNull() check. A comparison
# against NULL evaluates to NULL, and filter() discards rows whose predicate is
# NULL. Without these checks a row with a missing action or date would match
# neither `invalid_condition` nor `~invalid_condition`, and would silently
# disappear from both the invalid and the clean table.
invalid_condition = (
    col("price").isNull()
    | col("quantity").isNull()
    | col("action").isNull()
    | col("date").isNull()
    | (col("quantity") <= 0)
    | (col("price") <= 0)
    | (~col("action").isin("BUY", "SELL"))
    # Transactions dated in the future cannot have happened yet.
    | (col("date") > current_date())
)

df_port_invalid = df_port2.filter(invalid_condition)

# Persist the rejected transactions for later inspection.
df_port_invalid.write.mode("overwrite").format("delta").saveAsTable(f"{silver_schema}.silver_invalid_portfolio")


In [0]:
# Keep the transactions for which the invalid condition evaluates to false.
is_valid_transaction = ~invalid_condition
df_port_clean = df_port2.where(is_valid_transaction)

# Persist the accepted transactions.
(
    df_port_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{silver_schema}.silver_portfolio")
)


**benchmark index cleaning**

In [0]:
# Standardize missing-value markers and enforce column types.
# NOTE(review): only `date` and `close` are cast here, although `open` is later
# compared against 0 -- confirm its bronze type is already numeric.
benchmark_column_types = {
    "date": "date",
    "close": "double",
}

# Turn the literal string "NaN" into a real NULL before casting.
benchmark_typed = df_bench.replace("NaN", None)
for column_name, spark_type in benchmark_column_types.items():
    benchmark_typed = benchmark_typed.withColumn(column_name, col(column_name).cast(spark_type))

df_bench1 = benchmark_typed


In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when

# Give every row the same constant key so one window covers the whole series.
df_bench1_keyed = df_bench1.withColumn("key", F.lit(1))

w = Window.partitionBy("key").orderBy("date")


def _missing_or_non_positive(column_name):
    """Return a boolean column: True when `column_name` is NULL or <= 0, else False."""
    value = F.col(column_name)
    return F.when(value.isNull() | (value <= 0), True).otherwise(False)


# Close of the preceding row by date, and the relative change against it.
previous_close = F.lag("close").over(w)
relative_change = (F.col("close") - F.col("prev_close")) / F.col("prev_close")

df_bench2 = (
    df_bench1_keyed
    .withColumn("prev_close", previous_close)
    .withColumn("daily_return", relative_change)
    # Flag single-day drops of more than 20%; a NULL return yields False.
    .withColumn(
        "extreme_return_flag",
        F.when(F.col("daily_return") < -0.20, True).otherwise(False),
    )
    .withColumn("invalid_open_flag", _missing_or_non_positive("open"))
    .withColumn("invalid_close_flag", _missing_or_non_positive("close"))
)

# Rows logged as invalid: bad open, bad close, or no computable daily return.
# NOTE(review): rows with a NULL daily_return are written here, but the clean
# filter in the next cell only checks the open/close flags, so those rows end
# up in both tables. `extreme_return_flag` is not part of this filter either.
# Confirm both are intended.
df_bench_invalid = df_bench2.filter(
    F.col("invalid_open_flag")
    | F.col("invalid_close_flag")
    | F.col("daily_return").isNull()
)

(
    df_bench_invalid.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(f"{silver_schema}.silver_invalid_benchmark")
)

In [0]:

# Valid rows: neither the open flag nor the close flag is set.
passes_open_check = ~col("invalid_open_flag")
passes_close_check = ~col("invalid_close_flag")

df_bench_clean = (
    df_bench2
    .filter(passes_open_check & passes_close_check)
    # Helper columns used only to compute daily_return.
    .drop("prev_close", "key")
)

benchmark_table = f"{silver_schema}.silver_benchmark_index"
df_bench_clean.write.mode("overwrite").format("delta").saveAsTable(benchmark_table)

**Silver Data Quality Summary Log**

In [0]:
def dq(df, name):
    """Build a one-row data-quality summary for a DataFrame.

    Args:
        df: Spark DataFrame to profile.
        name: Label stored in the ``table`` column of the summary row.

    Returns:
        A single-row Spark DataFrame with columns ``table``, ``row_count``,
        ``unique_rows`` (row count after ``dropDuplicates()``) and
        ``null_count`` (number of NULL cells summed over all columns).
    """
    from pyspark.sql import functions as F

    row_count = df.count()
    unique_rows = df.dropDuplicates().count()

    if df.columns:
        # Count the NULLs of every column in one aggregation instead of running
        # a separate filter().count() job per column. count() skips NULLs, so
        # counting a value emitted only for NULL cells gives the per-column total.
        per_column_nulls = df.agg(*[
            F.count(F.when(F.col(c).isNull(), 1)).alias(f"nulls_{i}")
            for i, c in enumerate(df.columns)
        ]).first()
        null_count = sum(per_column_nulls)
    else:
        null_count = 0

    return spark.createDataFrame(
        [(name, row_count, unique_rows, null_count)],
        ["table", "row_count", "unique_rows", "null_count"],
    )

# Profile each clean silver DataFrame and stack the summary rows in this order.
dq_targets = [
    (df_stock_clean, "silver_stock_prices"),
    (df_port_clean, "silver_portfolio"),
    (df_company_valid, "silver_company_sector"),
    (df_bench_clean, "silver_benchmark_index"),
]

dq_log = None
for silver_frame, silver_name in dq_targets:
    summary_row = dq(silver_frame, silver_name)
    dq_log = summary_row if dq_log is None else dq_log.union(summary_row)

display(dq_log)

# Persist the summary so each run leaves a data-quality record.
(
    dq_log.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{silver_schema}.silver_dq_log")
)


table,row_count,unique_rows,null_count
silver_stock_prices,62640,62640,0
silver_portfolio,29796,29796,0
silver_company_sector,30,30,0
silver_benchmark_index,2025,2025,22


In [0]:
# Finish the notebook run, passing "SUCCESS" to dbutils.notebook.exit as the exit value.
dbutils.notebook.exit("SUCCESS")
