In [0]:
from pyspark.sql import functions as F

# Base directory of the volume that holds the raw CSV files
volume_path = "/Volumes/workspace/stock_project/raw_files/"

# Full path of each source file
stock_file = volume_path + "stock_prices__.csv"
portfolio_file = volume_path + "portfolio_transactions__.csv"
company_file = volume_path + "company_sector_.csv"
benchmark_file = volume_path + "benchmark_index__.csv"

# Load each CSV into a DataFrame: first row is the header, column types are inferred
df_stock_raw = spark.read.csv(stock_file, header=True, inferSchema=True)
df_portfolio_raw = spark.read.csv(portfolio_file, header=True, inferSchema=True)
df_company_raw = spark.read.csv(company_file, header=True, inferSchema=True)
df_benchmark_raw = spark.read.csv(benchmark_file, header=True, inferSchema=True)

# Preview the first ten rows of every raw DataFrame, in load order
for _raw_preview in (df_stock_raw, df_portfolio_raw, df_company_raw, df_benchmark_raw):
    display(_raw_preview.limit(10))



date,ticker,open,high,low,close,volume
2021-01-01,AAPL,141.77,144.59,141.34,141.91,4852219.0
2021-01-02,AAPL,146.01,148.04,144.67,146.34,1232267.0
2021-01-03,AAPL,143.79,143.88,140.75,142.75,3742555.0
2021-01-04,AAPL,141.6,142.98,140.16,142.35,450325.0
2021-01-05,AAPL,146.26,146.95,142.91,144.26,2330027.0
2021-01-06,AAPL,146.2,149.05,142.72,143.06,4916393.0
2021-01-07,AAPL,146.45,150.61,145.35,149.37,4811653.0
2021-01-08,AAPL,142.14,143.8,142.01,142.09,3847122.0
2021-01-09,AAPL,142.13,142.87,140.0,141.36,4887975.0
2021-01-10,AAPL,145.18,,142.3,143.93,4495580.0


transaction_id,date,ticker,action,quantity,price
1,2022-09-17,PFE,SELL,294.0,223.2
2,2022-02-25,AMZN,SELL,168.0,284.82
3,2021-04-23,TGT,SELL,463.0,459.46
4,2021-02-16,PEP,BUY,482.0,70.69
5,2021-07-01,BAC,SELL,344.0,310.09
6,2022-09-21,INTC,SELL,365.0,259.81
7,2021-09-21,META,BUYY,179.0,265.47
8,2022-03-16,AVGO,SELL,259.0,373.96
9,2021-05-10,AVGO,SELL,486.0,157.84
10,2022-08-31,PYPL,BUY,-89.0,172.67


ticker,sector
AAPL,Automotive
MSFT,Automotive
AMZN,Finance
GOOGL,Healthcare
META,Technlogy
NFLX,Retail
NVDA,Semiconductors
TSLA,
JPM,Technology
BAC,Healthcare


date,open,close
2021-01-01,3512.09,3510.49
2021-01-02,3506.96,3517.56
2021-01-03,3534.74,3538.83
2021-01-04,3549.02,3549.73
2021-01-05,3565.07,3571.81
2021-01-06,3576.28,3590.71
2021-01-07,3590.38,3585.18
2021-01-08,3583.97,3574.68
2021-01-09,3585.25,3570.52
2021-01-10,3589.56,3580.25


**STOCK PRICES**

In [0]:
# Count nulls in key columns of stock prices
from pyspark.sql.functions import col

print("STOCK PRICES — NULL COUNTS ")

# Columns to profile, listed in the order they appear in the summary
stock_null_check_columns = ["open", "high", "low", "close", "volume", "date"]

# "null_<column>" -> number of stock price rows where that column is null
null_counts = {
    f"null_{column_name}": df_stock_raw.where(col(column_name).isNull()).count()
    for column_name in stock_null_check_columns
}

# Two-column (null_type, count) DataFrame built from the dictionary entries
df_null_summary = spark.createDataFrame(
    list(null_counts.items()),
    ["null_type", "count"]
)

display(df_null_summary)

STOCK PRICES — NULL COUNTS 


null_type,count
null_open,24
null_high,45
null_low,39
null_close,30
null_volume,120
null_date,0


In [0]:
print("STOCK PRICES — INVALID VALUE COUNTS")

# Every check is a (label, row predicate) pair; list order is the summary order
stock_invalid_checks = [
    # textual invalids: value renders as the string "NaN"
    ("invalid_open_text", col("open").cast("string") == "NaN"),
    ("invalid_close_text", col("close").cast("string") == "NaN"),
    ("invalid_high_text", col("high").cast("string") == "NaN"),
    ("invalid_low_text", col("low").cast("string") == "NaN"),
    ("invalid_volume_text", col("volume").cast("string") == "NaN"),

    # business rule violations
    ("negative_volume", col("volume") < 0),
    ("zero_close", col("close") == 0),
    ("negative_close", col("close") < 0),
    ("zero_volume", col("volume") == 0),
    ("zero_open", col("open") == 0),
    ("negative_open", col("open") < 0),
    ("high_low_inconsistent", col("high") < col("low")),
    ("close_outside_range", (col("close") < col("low")) | (col("close") > col("high"))),
]

# label -> number of stock price rows matching the predicate
invalid_counts = {
    check_label: df_stock_raw.where(row_predicate).count()
    for check_label, row_predicate in stock_invalid_checks
}

# Two-column (invalid_type, count) DataFrame for display
df_invalid_summary = spark.createDataFrame(
    list(invalid_counts.items()),
    ["invalid_type", "count"]
)

display(df_invalid_summary)

STOCK PRICES — INVALID VALUE COUNTS


invalid_type,count
invalid_open_text,0
invalid_close_text,0
invalid_high_text,0
invalid_low_text,0
invalid_volume_text,0
negative_volume,27
zero_close,21
negative_close,0
zero_volume,0
zero_open,0


**COMPANY SECTOR**

In [0]:
# Report how many rows of the company sector table have a null sector
print("COMPANY SECTOR - NULL COUNT")

sector_is_missing = col("sector").isNull()
df_null_sector_count = df_company_raw.where(sector_is_missing).count()

# Wrap the single metric in a one-row DataFrame so it renders as a table
df_null_sector_summary = spark.createDataFrame(
    [("null_sector", df_null_sector_count)],
    ["issue", "count"]
)
display(df_null_sector_summary)

COMPANY SECTOR - NULL COUNT


issue,count
null_sector,3


In [0]:
# Surface company-table rows whose ticker never appears in the stock price data
print("INVALID TICKERS — Present in Company Table but Not in Stock Prices")

# Distinct tickers found in stock prices
valid_tickers = df_stock_raw.select("ticker").distinct()

# The anti-join keeps only the company rows with no matching ticker in valid_tickers
df_invalid_tickers = df_company_raw.join(valid_tickers, on="ticker", how="left_anti")

display(df_invalid_tickers)

INVALID TICKERS — Present in Company Table but Not in Stock Prices


ticker,sector
FAKE1,Technlogy


In [0]:
# Number of company-table tickers that have no match in stock prices
df_invalid_ticker_count = df_invalid_tickers.count()

# One-row (issue, count) DataFrame so the metric is displayed as a table
df_invalid_ticker_summary = spark.createDataFrame(
    [("invalid_ticker_count", df_invalid_ticker_count)],
    ["issue", "count"]
)
display(df_invalid_ticker_summary)

issue,count
invalid_ticker_count,1


**PORTFOLIO TRANSACTIONS**

In [0]:
from pyspark.sql.functions import col

# Null profile of the key columns in the portfolio transactions table
print("PORTFOLIO TRANSACTIONS — NULL COUNTS")

# Columns to profile, listed in the order they appear in the summary
portfolio_null_check_columns = ["transaction_id", "action", "quantity", "price", "date", "ticker"]

# "null_<column>" -> number of transaction rows where that column is null
null_counts_portfolio = {
    f"null_{column_name}": df_portfolio_raw.where(col(column_name).isNull()).count()
    for column_name in portfolio_null_check_columns
}

# Two-column (null_type, count) DataFrame for reporting
df_portfolio_null_summary = spark.createDataFrame(
    list(null_counts_portfolio.items()),
    ["null_type", "count"]
)

display(df_portfolio_null_summary)

PORTFOLIO TRANSACTIONS — NULL COUNTS


null_type,count
null_transaction_id,0
null_action,0
null_quantity,93
null_price,9
null_date,0
null_ticker,0


In [0]:
from pyspark.sql.functions import current_date

# Count invalid values in portfolio transactions table.
# The header line was missing from this cell even though its recorded output
# starts with it; printing it keeps a fresh run consistent with the other
# data-quality cells.
print("PORTFOLIO TRANSACTIONS — INVALID VALUE COUNTS")

invalid_counts_portfolio = {
    # Actions not matching expected values. Rows with a null action are not
    # counted here (the comparison evaluates to NULL and the row is filtered
    # out); they are reported by the null-count check on the same table.
    "invalid_action": df_portfolio_raw.filter(~col("action").isin("BUY", "SELL")).count(),
    # Negative quantity values
    "negative_quantity": df_portfolio_raw.filter(col("quantity") < 0).count(),
    # Zero quantity values
    "zero_quantity": df_portfolio_raw.filter(col("quantity") == 0).count(),
    # Negative price values
    "negative_price": df_portfolio_raw.filter(col("price") < 0).count(),
    # Zero price values
    "zero_price": df_portfolio_raw.filter(col("price") == 0).count(),
    # Dates later than the day the notebook is run
    "future_date": df_portfolio_raw.filter(col("date") > current_date()).count()
}

# Create summary DataFrame of invalid value counts
df_portfolio_invalid_summary = spark.createDataFrame(
    [(k, v) for k, v in invalid_counts_portfolio.items()],
    ["invalid_type", "count"]
)

display(df_portfolio_invalid_summary)

PORTFOLIO TRANSACTIONS — INVALID VALUE COUNTS


invalid_type,count
invalid_action,1278
negative_quantity,57
zero_quantity,15
negative_price,15
zero_price,15
future_date,0


In [0]:
# List the distinct unexpected values found in the 'action' column
print("DISTINCT MISSPELLINGS PRESENT IN ACTION COLUMN")

# Rows whose action is neither of the expected values
expected_actions = ("BUY", "SELL")
df_invalid_actions = df_portfolio_raw.where(~col("action").isin(*expected_actions))

# Unique values among those invalid actions
df_distinct_misspellings = df_invalid_actions.select("action").distinct()

display(df_distinct_misspellings)

DISTINCT MISSPELLINGS PRESENT IN ACTION COLUMN


action
BUYY


**BENCHMARK INDEX**

In [0]:
from pyspark.sql.functions import col

# Null profile of the key columns in the benchmark index table
print("BENCHMARK INDEX — NULL COUNT")

# Columns to profile, listed in the order they appear in the summary
benchmark_null_check_columns = ["close", "date", "open"]

# "null_<column>" -> number of benchmark rows where that column is null
null_counts_benchmark = {
    f"null_{column_name}": df_benchmark_raw.where(col(column_name).isNull()).count()
    for column_name in benchmark_null_check_columns
}

# Two-column (null_type, count) DataFrame for reporting
df_bench_null_summary = spark.createDataFrame(
    list(null_counts_benchmark.items()),
    ["null_type", "count"]
)

display(df_bench_null_summary)

BENCHMARK INDEX — NULL COUNT


null_type,count
null_close,21
null_date,0
null_open,27


In [0]:
# Invalid-value profile of the benchmark index price columns
print("BENCHMARK INDEX — INVALID VALUE COUNTS")

# Every check is a (label, row predicate) pair; list order is the summary order
benchmark_invalid_checks = [
    # value renders as the string 'NaN'
    ("invalid_close_text", col("close").cast("string") == "NaN"),
    ("invalid_open_text", col("open").cast("string") == "NaN"),
    # negative prices
    ("negative_close", col("close") < 0),
    ("negative_open", col("open") < 0),
    # zero prices
    ("zero_close", col("close") == 0),
    ("zero_open", col("open") == 0),
]

# label -> number of benchmark rows matching the predicate
invalid_counts_benchmark = {
    check_label: df_benchmark_raw.where(row_predicate).count()
    for check_label, row_predicate in benchmark_invalid_checks
}

# Two-column (invalid_type, count) DataFrame for the benchmark index
df_bench_invalid_summary = spark.createDataFrame(
    list(invalid_counts_benchmark.items()),
    ["invalid_type", "count"]
)

display(df_bench_invalid_summary)

BENCHMARK INDEX — INVALID VALUE COUNTS


invalid_type,count
invalid_close_text,0
invalid_open_text,0
negative_close,15
negative_open,12
zero_close,0
zero_open,0


**BRONZE TABLES**

In [0]:
# Catalog.schema prefix under which the bronze tables are saved
bronze_schema = "workspace.stock_project"

# (bronze table name, raw DataFrame) pairs, written in this order
bronze_sources = [
    ("bronze_stock_prices", df_stock_raw),
    ("bronze_portfolio_transactions", df_portfolio_raw),
    ("bronze_company_sector", df_company_raw),
    ("bronze_benchmark_index", df_benchmark_raw),
]

# Save each raw DataFrame as a Delta table, overwriting existing data and schema
for _bronze_table, _raw_df in bronze_sources:
    (
        _raw_df.write
        .mode("overwrite")
        .format("delta")
        .option("overwriteSchema", True)
        .saveAsTable(f"{bronze_schema}.{_bronze_table}")
    )

**BRONZE VALIDATION LOG**

In [0]:
from pyspark.sql.functions import col

# Function to perform basic validation checks on a DataFrame
def basic_validation(df, table_name):
    """Build a one-row validation summary for a DataFrame.

    Args:
        df: Spark DataFrame to profile.
        table_name: Label stored in the ``table_name`` column of the result.

    Returns:
        A one-row Spark DataFrame with the columns ``table_name``,
        ``total_rows``, ``distinct_rows``, ``null_values`` (nulls summed over
        all columns) and ``schema`` (comma-separated ``name:type`` pairs).

    Note:
        The first column used to be named after the *value* of ``table_name``
        (e.g. ``bronze_stock_prices``) because the variable, not the string,
        was passed as the header. It is now always ``table_name``. A Delta
        table previously written with the old header needs ``overwriteSchema``
        (or to be dropped) before the next overwrite.
    """
    if df.columns:
        # Count the nulls of every column in a single aggregation pass instead
        # of launching one Spark job per column.
        null_counts_row = df.select(
            [
                F.count(F.when(F.col(c).isNull(), F.lit(1))).alias(c)
                for c in df.columns
            ]
        ).first()
        null_values = sum(null_counts_row)
    else:
        # A DataFrame without columns has nothing to count.
        null_values = 0

    schema_summary = ", ".join(
        f"{field.name}:{field.dataType.simpleString()}" for field in df.schema.fields
    )

    return spark.createDataFrame(
        [
            (
                table_name,                    # Name of the table
                df.count(),                    # Total number of rows
                df.dropDuplicates().count(),   # Number of distinct rows
                null_values,                   # Total null values across all columns
                schema_summary,                # Schema summary
            )
        ],
        ["table_name", "total_rows", "distinct_rows", "null_values", "schema"],
    )

# Build one validation row per bronze table, in a fixed order
bronze_validation_frames = [
    basic_validation(df_stock_raw, "bronze_stock_prices"),
    basic_validation(df_portfolio_raw, "bronze_portfolio_transactions"),
    basic_validation(df_company_raw, "bronze_company_sector"),
    basic_validation(df_benchmark_raw, "bronze_benchmark_index"),
]

# Stack the rows into a single log; column names come from the first frame
log_df = bronze_validation_frames[0]
for _validation_frame in bronze_validation_frames[1:]:
    log_df = log_df.union(_validation_frame)

display(log_df)

bronze_stock_prices,total_rows,distinct_rows,null_values,schema
bronze_stock_prices,63000,63000,258,"date:date, ticker:string, open:double, high:double, low:double, close:double, volume:double"
bronze_portfolio_transactions,30000,30000,102,"transaction_id:int, date:date, ticker:string, action:string, quantity:double, price:double"
bronze_company_sector,31,31,3,"ticker:string, sector:string"
bronze_benchmark_index,2100,2100,48,"date:date, open:double, close:double"


In [0]:
# Save the validation log as a Delta table.
# overwriteSchema matches the bronze table writes in this notebook and lets the
# overwrite succeed when the log's columns differ from a previously saved version.
log_df.write.mode("overwrite").format("delta").option("overwriteSchema", True).saveAsTable("workspace.stock_project.bronze_validation_log")

In [0]:
# End the notebook run and hand this status string back to the calling workflow
notebook_exit_status = "SUCCESS"
dbutils.notebook.exit(notebook_exit_status)