<h3>Amendments Log</h3>
<table style="width:100%">
  <thead>
    <tr>
      <th style="text-align:left">Version</th>
      <th style="text-align:left">Amended By</th>
      <th style="text-align:left">Date</th>
      <th style="text-align:left">Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>1.6</td>
      <td>Gary Manley</td>
      <td>2025-12-07</td>
      <td>Applied CI/CD fixes: Cleaned JSON IDs, added 'import sys'.</td>
    </tr>
    <tr>
      <td>1.5</td>
      <td>Gary Manley</td>
      <td>2025-12-02</td>
      <td>Added extended metadata (Poster, Genres, Cast) to the dimension.</td>
    </tr>
    <tr>
      <td>1.4</td>
      <td>Gary Manley</td>
      <td>2025-11-30</td>
      <td>Removed source_type column as it does not exist in Bronze.</td>
    </tr>
    <tr>
      <td>1.3</td>
      <td>Gary Manley</td>
      <td>2025-11-30</td>
      <td>Fixed Source Table path (pointed to 'bronze' schema) and removed resiliency checks.</td>
    </tr>
    <tr>
      <td>1.2</td>
      <td>Gary Manley</td>
      <td>2025-11-30</td>
      <td>Refactored to use Pandas for deduplication logic instead of SQL.</td>
    </tr>
    <tr>
      <td>1.1</td>
      <td>Gary Manley</td>
      <td>2025-11-30</td>
      <td>Updated deduplication logic to prioritize the latest release date.</td>
    </tr>
    <tr>
      <td>1.0</td>
      <td>Gary Manley</td>
      <td>2025-11-30</td>
      <td>Initial Version</td>
    </tr>
  </tbody>
</table>

In [None]:
# 1. SETUP & IMPORTS
import duckdb
import pandas as pd
import os
import sys
from dotenv import load_dotenv

# Load Utils
# Make the current working directory importable so the project-local `utils`
# package can be resolved. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
vCwd = os.getcwd()
if vCwd not in sys.path:
    sys.path.append(vCwd)

# Import DQ Utils & DB Utils
try:
    from utils.db_utils import f_add_surrogate_key
    from utils.dq_utils import (
        f_check_duplicate_rows,
        f_check_duplicate_keys
    )
except ImportError:
    # Fail fast: the processing cell calls these helpers, so continuing without
    # them would only defer the failure to a less obvious NameError later on.
    print("Error: Could not import utils")
    raise

# Load Env
# Prefer the explicit local .env file when it exists; otherwise fall back to
# python-dotenv's default lookup.
# NOTE(review): this is an absolute, machine-specific path - on any other
# machine the else-branch is the one that runs.
vLocalEnvPath = r"C:/Users/garym/Documents/GitHub/MovieReleases/.env"
if os.path.exists(vLocalEnvPath):
    load_dotenv(dotenv_path=vLocalEnvPath)
else:
    load_dotenv()

# The MotherDuck token must be present in the environment (directly or via .env).
vMdToken = os.getenv("MOTHERDUCK_TOKEN")
if not vMdToken:
    raise RuntimeError("MOTHERDUCK_TOKEN missing")

# Connect
# The token is embedded in the connection string, so never print or log that string.
print("Connecting to MotherDuck...")
vCon = duckdb.connect(f"md:?motherduck_token={vMdToken}")

In [None]:
# PARAMETERS / CONSTANTS
# Name of this notebook; passed to the DQ check helpers in the processing cell.
cNotebookName = "process_dim_film.ipynb"
# Fully qualified name of the Silver dimension table that the processing cell
# looks up surrogate keys against and replaces on load.
vTargetTable = "MovieReleases.silver.film_release_dim"

## 2. Extract & Deduplicate (Pandas)
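We read the Bronze history into a Pandas DataFrame, keep only the active rows (`is_current_uda` is true), and deduplicate in Python.
For each `imdb_id_ref` we keep the row with the most recent `valid_from_uda` (System Entry Date) by sorting on that column in descending order and dropping duplicate IDs.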

In [None]:
# Build the film dimension: read Bronze, keep one active row per imdb_id_ref,
# assign surrogate keys, run the DQ checks and replace the Silver table.
# Everything runs inside try/finally so the MotherDuck connection is closed
# even when one of the steps raises.
try:
    # 1. Fetch Active Bronze Data
    print("Fetching active records from Bronze...")
    try:
        # Corrected Path: Read from 'bronze' schema which has the SCD2 columns
        dfBronze = vCon.table("MovieReleases.bronze.uk_releases").df()
    except Exception as e:
        # Best-effort read: the error is reported and the run is then treated
        # like an empty source, which leaves the Silver table untouched.
        print(f"Error reading source table: {e}")
        dfBronze = pd.DataFrame()

    if not dfBronze.empty:
        # 2. Filter Active
        dfActive = dfBronze[dfBronze['is_current_uda'] == True].copy()

        # 3. Deduplicate (Pandas Logic)
        # Sort by 'valid_from_uda' descending to put the latest system entry at the top
        dfSorted = dfActive.sort_values(by='valid_from_uda', ascending=False)

        # Drop duplicates on Business Key (imdb_id_ref), keeping the first (latest)
        dfDedup = dfSorted.drop_duplicates(subset=['imdb_id_ref'], keep='first').copy()

        # 4. Prepare Source Dataframe
        # Updated to include richer metadata from Bronze
        vRequiredCols = ['imdb_id_ref', 'movie_title', 'poster_url', 'genres', 'cast_members']

        # Resiliency: Ensure cols exist (in case Bronze hasn't been reloaded yet).
        # Missing columns are tolerated but reported, because the replaced
        # Silver table will not contain them either.
        vAvailableCols = [c for c in vRequiredCols if c in dfDedup.columns]
        vMissingCols = [c for c in vRequiredCols if c not in dfDedup.columns]
        if vMissingCols:
            print(f"Warning: Bronze is missing expected columns: {vMissingCols}")
        dfSource = dfDedup[vAvailableCols].copy()

        # Ensure we don't have blank IDs
        dfSource = dfSource.dropna(subset=['imdb_id_ref'])

        print(f"Found {len(dfSource)} unique movies to process.")

        if dfSource.empty:
            # Bronze has rows, but none is active with a usable business key.
            # Loading now would replace the Silver dimension with an empty table.
            print("No active records with an imdb_id_ref found. Skipping Silver load.")
        else:
            # 5. Generate/Maintain Surrogate Keys
            dfDimFilm = f_add_surrogate_key(
                vCon=vCon,
                dfNewData=dfSource,
                vTargetTableName=vTargetTable,
                vBusinessKeyCol="imdb_id_ref",
                vSkColName="sk_film_release"
            )

            # --- DATA QUALITY CHECKS ---
            # NOTE(review): the return values of the DQ helpers are not inspected
            # here; this assumes they raise on failure - confirm in utils.dq_utils.
            print("Running DQ Checks...")

            # Check 1: No Duplicate Rows
            f_check_duplicate_rows(vCon, dfDimFilm, cNotebookName, "Silver", vTargetTable)

            # Check 2: Unique Surrogate Keys
            f_check_duplicate_keys(vCon, dfDimFilm, ['sk_film_release'], cNotebookName, "Silver", vTargetTable)

            # Check 3: Unique Business Keys (1 row per movie)
            f_check_duplicate_keys(vCon, dfDimFilm, ['imdb_id_ref'], cNotebookName, "Silver", vTargetTable)

            # 6. Load to Silver (Replace Table)
            print(f"Checks passed. Loading to {vTargetTable}...")
            # Derive "<database>.<schema>" from the target table name so the
            # schema created here always matches the table being written.
            vTargetSchema = vTargetTable.rsplit(".", 1)[0]
            vCon.sql(f"CREATE SCHEMA IF NOT EXISTS {vTargetSchema}")
            vCon.register('v_stage_dim_film', dfDimFilm)
            vCon.sql(f"CREATE OR REPLACE TABLE {vTargetTable} AS SELECT * FROM v_stage_dim_film")

            print("Success.")
            # Validation
            vCon.sql(f"SELECT * FROM {vTargetTable} LIMIT 5").show()

    else:
        print("No data found in Bronze. Skipping Silver load.")
finally:
    # Always release the MotherDuck connection, including on failure.
    vCon.close()