## Null value check for table [bronze_daily]

In [0]:
from pyspark.sql.functions import col, sum

# Read the bronze daily table.
df = spark.read.table("kenworkspace.tw_stocks_db.bronze_daily")

# Per-column null count: turn each column's "is null" flag into 0/1 and add it up,
# giving a single row with one count per column.
# NOTE: `sum` is the pyspark.sql.functions aggregate imported above, not Python's built-in.
null_counts = df.select(
    *[
        sum(col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    ]
)
null_counts.show()


## Null value check for table [bronze_monthly]

In [0]:
# Read the bronze monthly table.
df = spark.read.table("kenworkspace.tw_stocks_db.bronze_monthly")

# Per-column null count: turn each column's "is null" flag into 0/1 and add it up,
# giving a single row with one count per column.
# NOTE: `col` and `sum` come from the pyspark.sql.functions import in the earlier cell;
# `sum` is the Spark aggregate, not Python's built-in.
null_counts = df.select(
    *[
        sum(col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    ]
)
null_counts.show()

## Feature engineering for table [bronze_monthly]
### 1. Monthly volume record high [MVRH]
### 2. Distance (%) from last MVRH [from_mvrh_percent]

In [0]:
%sql
-- Build silver_mvrh_monthly from bronze_monthly, adding four derived columns:
--   max_volume_so_far : running maximum of volume, ordered by date
--   mvrh              : TRUE when the row's volume equals that running maximum
--                       (a tie with an earlier maximum is flagged as well)
--   last_mvrh_volume  : volume of the most recent earlier MVRH row; NULL until one exists
--   from_mvrh_percent : % difference between volume and last_mvrh_volume, 2 decimals
-- NOTE(review): no window below has a PARTITION BY, so every row is treated as part of
-- one series — presumably bronze_monthly holds a single instrument; confirm.
CREATE OR REPLACE TABLE kenworkspace.tw_stocks_db.silver_mvrh_monthly AS
  WITH base AS (
    SELECT
      `date`,
      `open`,
      `high`,
      `low`,
      `close`,
      `volume`,
      -- Running maximum of volume up to and including the current row's date.
      MAX(`volume`) OVER (ORDER BY `date`) AS max_volume_so_far
    FROM kenworkspace.tw_stocks_db.bronze_monthly
  ),
  mvrh_flagged AS (
    SELECT *,
      -- A row is a monthly volume record high when it matches the running maximum.
      volume = max_volume_so_far AS mvrh
    FROM base
  ),
  add_previous_mvrh AS (
    SELECT *,
      -- Volume of the nearest preceding MVRH row (the current row itself is excluded):
      -- non-MVRH rows are mapped to NULL and skipped by IGNORE NULLS.
      LAG(
        CASE WHEN mvrh THEN volume ELSE NULL END
      ) IGNORE NULLS OVER (ORDER BY `date`) AS last_mvrh_volume
    FROM mvrh_flagged
  ),
  final AS (
    SELECT *,
      -- Percentage distance from the previous MVRH.
      -- NULLIF guards the denominator: a previous MVRH volume of 0 would otherwise be a
      -- division by zero, which fails the whole statement when ANSI mode is enabled.
      -- A NULL last_mvrh_volume (no earlier MVRH) still yields NULL, as before.
      ROUND((volume - last_mvrh_volume) / NULLIF(last_mvrh_volume, 0) * 100, 2) AS from_mvrh_percent
    FROM add_previous_mvrh
  )
  SELECT * FROM final;

In [0]:
%sql
-- Switch the session's current catalog, then preview up to 100 rows of the silver table.
USE CATALOG `kenworkspace`;

SELECT *
FROM `tw_stocks_db`.`silver_mvrh_monthly`
LIMIT 100;