In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# silver_schema prefixes the source tables read below; gold_schema prefixes the
# tables this notebook writes (and later re-reads). Both currently resolve to
# the same schema name.
silver_schema = gold_schema = "workspace.stock_project"


**DAILY STOCK METRICS**

In [0]:
# Build the per-ticker daily metrics table from the silver price history and
# persist it as gold_daily_stock_metrics.
df_stock = spark.table(f"{silver_schema}.silver_stock_prices")

# Every rolling metric is evaluated per ticker in date order. The frames are
# row-based, so "7" and "30" mean the last 7/30 available rows for the ticker
# (current row included), not calendar days.
w = Window.partitionBy("ticker").orderBy("date")
w_last_7 = w.rowsBetween(-6, 0)
w_last_30 = w.rowsBetween(-29, 0)

df_gold_stock_metrics = (
    df_stock
    .withColumn("prev_close", F.lag("close").over(w))
    # Close-to-close simple return. It is NULL on each ticker's first row
    # (no previous close); try_divide also returns NULL instead of failing
    # when the previous close is 0.
    .withColumn(
        "daily_return",
        F.try_divide(F.col("close") - F.col("prev_close"), F.col("prev_close"))
    )
    .withColumn("ma_7", F.avg("close").over(w_last_7))
    .withColumn("ma_30", F.avg("close").over(w_last_30))
    # Standard deviation of the returns inside the 7-row frame; NULL until the
    # frame holds at least two non-NULL returns.
    .withColumn("volatility_7", F.stddev("daily_return").over(w_last_7))
    .withColumn("daily_range", F.col("high") - F.col("low"))
    # prev_close is only a helper for daily_return and is not persisted.
    .drop("prev_close")
)

# Write with an explicit Delta format, matching every other gold table written
# by this notebook instead of relying on the session's default source format.
(
    df_gold_stock_metrics.write
    .format("delta")
    .partitionBy("ticker")
    .mode("overwrite")
    .saveAsTable(f"{gold_schema}.gold_daily_stock_metrics")
)

display(df_gold_stock_metrics.limit(10))

date,ticker,open,close,volume,high,low,daily_return,ma_7,ma_30,volatility_7,daily_range
2021-01-01,AAPL,141.77,141.91,4852219,144.59,141.34,,141.91,141.91,,3.25
2021-01-02,AAPL,146.01,146.34,1232267,148.04,144.67,0.0312169685011627,144.125,144.125,,3.3700000000000045
2021-01-03,AAPL,143.79,142.75,3742555,143.88,140.75,-0.0245319119857865,143.66666666666666,143.66666666666666,0.0394204114358802,3.1299999999999955
2021-01-04,AAPL,141.6,142.35,450325,142.98,140.16,-0.0028021015761821,143.3375,143.3375,0.0280992866603193,2.819999999999993
2021-01-05,AAPL,146.26,144.26,2330027,146.95,142.91,0.0134176325957147,143.522,143.522,0.0237302260386378,4.039999999999992
2021-01-06,AAPL,146.2,143.06,4916393,149.05,142.72,-0.0083183141549978,143.44500000000002,143.44500000000002,0.0213146461920245,6.3300000000000125
2021-01-07,AAPL,146.45,149.37,4811653,150.61,145.35,0.0441073675380959,144.29142857142858,144.29142857142858,0.0257258664584339,5.260000000000019
2021-01-08,AAPL,142.14,142.09,3847122,143.8,142.01,-0.0487380330722367,144.31714285714287,144.01625,0.0320196380941772,1.7900000000000205
2021-01-09,AAPL,142.13,141.36,4887975,142.87,140.0,-0.0051375888521358,143.6057142857143,143.72111111111113,0.0290397186896001,2.8700000000000045
2021-01-12,AAPL,139.48,138.14,3231031,142.31,137.22,-0.0227787209960386,142.94714285714286,143.163,0.0288457927296755,5.090000000000003


**PORTFOLIO HOLDINGS**

In [0]:
# Inputs: the trade log and the daily price history.
df_port = spark.table(f"{silver_schema}.silver_portfolio")
df_stock = spark.table(f"{silver_schema}.silver_stock_prices")

# Signed trade size: BUY adds the quantity, SELL subtracts it, and any other
# action contributes 0.
signed_quantity = (
    F.when(F.col("action") == "BUY", F.col("quantity"))
     .when(F.col("action") == "SELL", -F.col("quantity"))
     .otherwise(0)
)

# One row per (ticker, date) holding that day's net traded quantity.
df_port_net = df_port.groupBy("ticker", "date").agg(
    F.sum(signed_quantity).alias("net_quantity")
)

# Running position per ticker in date order. A negative running total is
# reported as 0, but the clamp is applied after the cumulative sum, so the
# unclamped negative balance still carries forward into later rows.
# NOTE(review): confirm that this is the intended treatment of oversells.
w_ticker_dates = Window.partitionBy("ticker").orderBy("date")
running_quantity = F.sum("net_quantity").over(w_ticker_dates)
df_holdings = df_port_net.withColumn(
    "quantity_held",
    F.when(running_quantity < 0, 0).otherwise(running_quantity),
)

# Value the position at that day's close. The join is a left join, so dates
# with no price row keep their holdings row with NULL close and market_value.
df_close = df_stock.select("ticker", "date", "close")
df_gold_holdings = (
    df_holdings
    .join(df_close, on=["ticker", "date"], how="left")
    .withColumn("market_value", F.col("quantity_held") * F.col("close"))
)

holdings_writer = (
    df_gold_holdings.write
    .partitionBy("date")
    .mode("overwrite")
    .format("delta")
)
holdings_writer.saveAsTable(f"{gold_schema}.gold_portfolio_holdings")

display(df_gold_holdings.limit(10))

ticker,date,net_quantity,quantity_held,close,market_value
AAPL,2021-01-01,-273,0,141.91,0.0
AAPL,2021-01-02,446,173,146.34,25316.82
AAPL,2021-01-05,-295,0,144.26,0.0
AAPL,2021-01-10,84,0,,
AAPL,2021-01-11,-334,0,,
AAPL,2021-01-12,-191,0,138.14,0.0
AAPL,2021-01-13,52,0,137.62,0.0
AAPL,2021-01-15,529,18,135.06,2431.08
AAPL,2021-01-16,-232,0,143.82,0.0
AAPL,2021-01-17,175,0,150.56,0.0


**SECTOR PERFORMANCE**

In [0]:

# Inputs: the ticker -> sector mapping and the gold daily metrics table.
df_company = spark.table(f"{silver_schema}.silver_company_sector")
df_stock_metrics = spark.table(f"{gold_schema}.gold_daily_stock_metrics")

# Left join keeps every metrics row; tickers missing from the company table
# end up grouped under a NULL sector.
df_metrics_with_sector = df_stock_metrics.join(df_company, on="ticker", how="left")

# Per (sector, date): mean and standard deviation of the member tickers'
# daily returns, plus their total traded volume.
sector_aggregates = [
    F.avg("daily_return").alias("avg_sector_return"),
    F.stddev("daily_return").alias("sector_volatility"),
    F.sum("volume").alias("sector_volume"),
]
df_gold_sector = (
    df_metrics_with_sector
    .groupBy("sector", "date")
    .agg(*sector_aggregates)
)

sector_writer = df_gold_sector.write.mode("overwrite").format("delta")
sector_writer.saveAsTable(f"{gold_schema}.gold_sector_performance")

display(df_gold_sector.limit(10))

sector,date,avg_sector_return,sector_volatility,sector_volume
Finance,2024-04-29,0.0059593438719007,0.0327950416562556,14721876
Automotive,2022-05-01,-0.005637685089384,0.0162058083207161,22209586
Automotive,2024-06-08,-0.0206022822071953,0.0144019894223718,8585233
Finance,2023-06-13,0.0031706008609195,0.0150347493394178,25938058
Finance,2024-06-27,0.0167538909912239,0.0203766592275115,12507730
Retail,2021-01-27,0.0166135488409056,0.0207119141204913,5415559
Retail,2022-09-20,0.0041783176096581,0.011286537719412,17670222
Retail,2024-08-23,-0.0047703875055806,0.0302767679688968,9689814
Automotive,2021-04-21,0.0144366912573152,0.0309183270774271,7466546
Retail,2023-05-24,-0.0052538494879341,0.0195291041033877,20862510


**PORTFOLIO TRADES FACT**

In [0]:
# Trade-level fact table: every silver_portfolio row plus its notional value.
df_port = spark.table(f"{silver_schema}.silver_portfolio")

# trade_value is quantity * price for the row; it is not signed by action.
trade_value_expr = F.col("quantity") * F.col("price")
df_gold_trades = df_port.withColumn("trade_value", trade_value_expr)

trades_writer = df_gold_trades.write.mode("overwrite").format("delta")
trades_writer.saveAsTable(f"{gold_schema}.gold_portfolio_trades_fact")

display(df_gold_trades.limit(10))

transaction_id,date,ticker,action,quantity,price,trade_value
1,2022-09-17,PFE,SELL,294,223.2,65620.8
2,2022-02-25,AMZN,SELL,168,284.82,47849.76
3,2021-04-23,TGT,SELL,463,459.46,212729.98
4,2021-02-16,PEP,BUY,482,70.69,34072.58
5,2021-07-01,BAC,SELL,344,310.09,106670.96
6,2022-09-21,INTC,SELL,365,259.81,94830.65
7,2021-09-21,META,BUY,179,265.47,47519.130000000005
8,2022-03-16,AVGO,SELL,259,373.96,96855.64
9,2021-05-10,AVGO,SELL,486,157.84,76710.24
11,2021-11-09,JPM,BUY,416,499.7,207875.2


**BENCHMARK COMPARISON**

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

df_bench = spark.table(f"{silver_schema}.silver_benchmark_index")
df_holdings = spark.table(f"{gold_schema}.gold_portfolio_holdings")


def _simple_return(current, previous):
    """Return the column expression (current - previous) / previous.

    Both arguments are column names. F.try_divide yields NULL rather than an
    error when the previous value is 0, and the result is NULL when the
    previous value is NULL.
    """
    return F.try_divide(F.col(current) - F.col(previous), F.col(previous))


# Step 1 - benchmark daily return.
# A constant "key" column gives the window a partitionBy clause while still
# ordering the whole table by date (presumably to avoid Spark's warning about
# windows without a partition - TODO confirm).
df_bench_keyed = df_bench.withColumn("key", F.lit(1))
w_bench = Window.partitionBy("key").orderBy("date")
df_bench2 = (
    df_bench_keyed
    .withColumn("prev_close", F.lag("close").over(w_bench))
    .withColumn("benchmark_return", _simple_return("close", "prev_close"))
    .drop("key")
)

# Step 2 - portfolio daily return.
# portfolio_value is the sum of market_value over the holdings rows that exist
# for a date.
# NOTE(review): gold_portfolio_holdings is built above with one row per
# (ticker, trade date), so this total only covers tickers that have a row on
# that date - confirm this is the intended portfolio value.
df_hold_keyed = df_holdings.withColumn("key", F.lit(1))
w_port = Window.partitionBy("key").orderBy("date")
df_port_value = df_hold_keyed.groupBy("date", "key").agg(
    F.sum("market_value").alias("portfolio_value")
)
df_port_ret = (
    df_port_value
    .withColumn("prev_value", F.lag("portfolio_value").over(w_port))
    .withColumn("portfolio_return", _simple_return("portfolio_value", "prev_value"))
    .drop("key")
)

# Step 3 - line the two return series up by date.
# The left join keeps every portfolio date; dates missing from the benchmark
# get NULL benchmark_return and therefore NULL excess_return.
df_bench_returns = df_bench2.select("date", "benchmark_return")
df_gold_compare = (
    df_port_ret
    .join(df_bench_returns, on="date", how="left")
    .withColumn(
        "excess_return",
        F.col("portfolio_return") - F.col("benchmark_return"),
    )
)

compare_writer = df_gold_compare.write.mode("overwrite").format("delta")
compare_writer.saveAsTable(f"{gold_schema}.gold_benchmark_comparison")

display(df_gold_compare.limit(10))

date,portfolio_value,prev_value,portfolio_return,benchmark_return,excess_return
2021-01-01,543812.15,,,,
2021-01-02,619630.7999999999,543812.15,0.1394206620797271,0.0020139638625947,0.1374066982171324
2021-01-03,753405.1000000001,619630.7999999999,0.215893561133501,0.0060468051717667,0.2098467559617343
2021-01-04,456384.41,753405.1000000001,-0.3942376949664928,0.0030801140489936,-0.3973178090154864
2021-01-05,336673.79,456384.41,-0.2623021675959528,0.0062201913948384,-0.2685223589907913
2021-01-06,1001499.65,336673.79,1.9746884959473685,0.0052914348747554,1.9693970610726128
2021-01-07,412829.31,1001499.65,-0.5877888624324532,-0.0015400853870126,-0.5862487770454405
2021-01-08,962427.22,412829.31,1.3312957599837083,-0.0029287232440212,1.3342244832277297
2021-01-09,972387.81,962427.22,0.010349447514587,-0.0011637405306208,0.0115131880452079
2021-01-10,567763.15,972387.81,-0.4161144924266378,0.0027250932637262,-0.4188395856903641


**RISK INDICATORS**

In [0]:
# Extend the gold daily metrics with 30-row risk indicators per ticker and
# persist the result as gold_risk_indicators.
df_stock = spark.table(f"{gold_schema}.gold_daily_stock_metrics")

# Row-based frame: the current row plus the 29 preceding rows of the ticker.
w = Window.partitionBy("ticker").orderBy("date")
w_last_30 = w.rowsBetween(-29, 0)

df_gold_risk = (
    df_stock
    # Standard deviation of daily_return inside the frame; NULL until the
    # frame holds at least two non-NULL returns.
    .withColumn(
        "volatility_30d",
        F.stddev("daily_return").over(w_last_30)
    )
    # Root mean square of the negative returns in the frame. Rows whose return
    # is NULL or non-negative contribute 0 and still count in the average.
    .withColumn(
        "downside_deviation",
        F.sqrt(
            F.avg(
                F.when(F.col("daily_return") < 0, F.col("daily_return") ** 2)
                 .otherwise(0)
            ).over(w_last_30)
        )
    )
    # try_divide (used for every other ratio in this notebook) returns NULL
    # when volatility_30d is 0 instead of raising a divide-by-zero error under
    # ANSI SQL mode.
    # NOTE(review): the numerator is the single-day return, not a rolling mean
    # of returns, and no risk-free rate is subtracted - confirm this is the
    # intended definition of "rolling_sharpe".
    .withColumn(
        "rolling_sharpe",
        F.try_divide(F.col("daily_return"), F.col("volatility_30d"))
    )
)

df_gold_risk.write.mode("overwrite").format("delta").saveAsTable(
    f"{gold_schema}.gold_risk_indicators"
)

display(df_gold_risk.limit(10))

date,ticker,open,close,volume,high,low,daily_return,ma_7,ma_30,volatility_7,daily_range,volatility_30d,downside_deviation,rolling_sharpe
2021-01-01,GOOGL,261.07,259.3,2844268,262.91,258.4,,259.3,259.3,,4.510000000000048,,0.0,
2021-01-02,GOOGL,267.15,264.7,4009376,267.31,262.24,0.0208252988816042,262.0,262.0,,5.069999999999993,,0.0,
2021-01-03,GOOGL,260.8,256.88,2749803,262.01,255.24,-0.0295428787306384,260.2933333333333,260.2933333333333,0.0356156799456252,6.769999999999982,0.0356156799456252,0.0170565889877705,-0.8294907966306359
2021-01-04,GOOGL,265.82,263.05,3634873,266.46,262.32,0.0240189971971349,260.9825,260.9825,0.0300444880630131,4.139999999999986,0.0300444880630131,0.0147714393653192,0.7994477105670501
2021-01-05,GOOGL,260.62,262.82,3910730,265.07,257.91,-0.0008743584869797307,261.35,261.35,0.0247124562348324,7.159999999999968,0.0247124562348324,0.0132177621892433,-0.0353812862093131
2021-01-06,GOOGL,258.74,255.41,874561,260.05,253.05,-0.0281942013545392,260.36,260.36,0.025696097144579,7.0,0.025696097144579,0.0166756268071961,-1.097217262057527
2021-01-07,GOOGL,260.3,256.8,4368067,262.57,255.0,0.0054422301397753,259.8514285714286,259.8514285714286,0.0232255533882533,7.569999999999993,0.0232255533882533,0.0154386304744081,0.2343207952378786
2021-01-08,GOOGL,263.79,261.4,739792,264.28,260.9,0.0179127725856696,260.15142857142854,260.045,0.0224217800707695,3.3799999999999955,0.0224217800707695,0.0144415164390606,0.7989005569197382
2021-01-09,GOOGL,265.93,264.35,655828,266.07,262.15,0.0112853863810254,260.1014285714286,260.5233333333333,0.0213042105023711,3.920000000000016,0.0210524791878684,0.0136155922729023,0.5360597334079605
2021-01-10,GOOGL,267.27,265.05,2657665,269.92,262.92,0.0026480045394363,261.2685714285714,260.976,0.0168768550729959,7.0,0.0196927953234823,0.012916884982368,0.1344656508098034


In [0]:
# Final cell: pass the literal status string "SUCCESS" to dbutils.notebook.exit.
# With top-to-bottom execution this is only reached after every cell above has
# run without raising. Presumably a calling notebook or job reads this value as
# the run result - verify against the caller.
dbutils.notebook.exit("SUCCESS")
