<h3>Amendments Log</h3>
<table style="width:100%">
  <thead>
    <tr>
      <th style="text-align:left">Version</th>
      <th style="text-align:left">Amended By</th>
      <th style="text-align:left">Date</th>
      <th style="text-align:left">Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>1.1</td>
      <td>Gary Manley</td>
      <td>2025-12-02</td>
      <td>Added Genre Dim and Bridge processing.</td>
    </tr>
    <tr>
      <td>1.0</td>
      <td>Gary Manley</td>
      <td>2025-12-02</td>
      <td>Initial Version: Actor Dim and Bridge Table</td>
    </tr>
  </tbody>
</table>

In [None]:
# 1. SETUP & IMPORTS
import duckdb
import pandas as pd
import os
import sys
from dotenv import load_dotenv

# Load Utils
# The project-local `utils` package is resolved relative to the current working
# directory, so that directory must be on the import path. The membership check
# keeps re-running this cell from appending the same entry again.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())
try:
    from utils.db_utils import f_add_surrogate_key
    from utils.dq_utils import (
        f_check_duplicate_rows,
        f_check_duplicate_keys
    )
except ImportError:
    # Later cells call these helpers unconditionally. Stop here with the real
    # cause instead of continuing and failing later with an unrelated NameError.
    print("Error: Could not import utils")
    raise

# Load Env
# Prefer the developer's local .env file when it exists; otherwise fall back to
# python-dotenv's default lookup.
vLocalEnvPath = r"C:/Users/garym/Documents/GitHub/MovieReleases/.env"
if os.path.exists(vLocalEnvPath):
    load_dotenv(dotenv_path=vLocalEnvPath)
else:
    load_dotenv()

# The token is mandatory: without it the connection below cannot authenticate.
vMdToken = os.getenv("MOTHERDUCK_TOKEN")
if not vMdToken:
    raise RuntimeError("MOTHERDUCK_TOKEN missing")

# Connect
# NOTE: the token is embedded in the connection string; never print or log it.
print("Connecting to MotherDuck...")
vCon = duckdb.connect(f"md:?motherduck_token={vMdToken}")

In [None]:
# PARAMETERS / CONSTANTS
# Notebook identifier handed to the data-quality helpers.
cNotebookName = "process_bridge_actor.ipynb"

# Every target table lives under the same database.schema prefix.
cSilverSchema = "MovieReleases.silver"

# Actor Targets
vTargetActorDim = f"{cSilverSchema}.actor_dim"
vTargetActorBridge = f"{cSilverSchema}.film_actor_bridge"

# Genre Targets
vTargetGenreDim = f"{cSilverSchema}.genre_dim"
vTargetGenreBridge = f"{cSilverSchema}.film_genre_bridge"

## 2. Explode Cast & Build Actor Dim

**Logic:**
1. Read `silver.film_release_dim`.
2. Split `cast_members` (comma separated) into individual rows.
3. Extract unique Actor Names to build `actor_dim`.
4. Generate `sk_actor`.
5. Build `film_actor_bridge`.

In [None]:
print("Fetching Film Dimension...")
try:
    dfFilmDim = vCon.table("MovieReleases.silver.film_release_dim").df()
except Exception as e:
    # Best-effort read: report the failure and fall back to an empty frame, so
    # the processing below (and any later `dfFilmDim.empty` check) is skipped.
    print(f"Error reading film dim: {e}")
    dfFilmDim = pd.DataFrame()

if not dfFilmDim.empty:
    # --- ACTOR PROCESSING ---
    # 1. Prepare Exploded List (one row per film / actor pair)
    dfCast = dfFilmDim[['sk_film_release', 'cast_members']].dropna(subset=['cast_members']).copy()

    # Split on the bare comma and strip afterwards, so "A,B" and "A ,  B" are
    # separated exactly like "A, B". Splitting on ', ' left "A,B" as one actor.
    dfCast['actor_name'] = dfCast['cast_members'].str.split(',')
    dfExploded = dfCast.explode('actor_name')
    dfExploded['actor_name'] = dfExploded['actor_name'].str.strip()

    # Discard missing and blank names (e.g. from a trailing comma) once, here,
    # so that neither the dimension nor the bridge can pick them up.
    dfExploded = dfExploded.dropna(subset=['actor_name'])
    dfExploded = dfExploded[dfExploded['actor_name'] != '']

    # 2. Build Unique Actor List for Dimension
    dfUniqueActors = dfExploded[['actor_name']].drop_duplicates().sort_values('actor_name')

    print(f"Found {len(dfUniqueActors)} unique actors.")

    # 3. Generate SKs
    # The result is read below as having `sk_actor` and `actor_name` columns;
    # how keys are assigned is up to utils.db_utils (not visible here).
    dfActorDim = f_add_surrogate_key(
        vCon=vCon,
        dfNewData=dfUniqueActors,
        vTargetTableName=vTargetActorDim,
        vBusinessKeyCol="actor_name",
        vSkColName="sk_actor"
    )

    # 4. Load Actor Dim (full replace of the target table)
    print(f"Loading {vTargetActorDim}...")
    vCon.register('v_stage_actor', dfActorDim)
    vCon.sql(f"CREATE OR REPLACE TABLE {vTargetActorDim} AS SELECT * FROM v_stage_actor")

    # 5. Create Bridge Table
    print("Building Actor Bridge Table...")

    # Inner join: only film/actor pairs whose name exists in the dimension
    # receive a bridge row.
    dfBridge = pd.merge(
        dfExploded[['sk_film_release', 'actor_name']],
        dfActorDim[['sk_actor', 'actor_name']],
        on='actor_name',
        how='inner'
    )

    # The same actor listed twice on one film must yield a single bridge row.
    dfBridgeFinal = dfBridge[['sk_film_release', 'sk_actor']].drop_duplicates()
    print(f"Actor Bridge Rows: {len(dfBridgeFinal)}")

    # --- DQ CHECKS (Bridge) ---
    # NOTE(review): the return value is not inspected here; presumably the helper
    # records or raises on its own. Confirm against utils.dq_utils.
    f_check_duplicate_rows(vCon, dfBridgeFinal, cNotebookName, "Silver", vTargetActorBridge)

    # 6. Load Bridge (full replace of the target table)
    print(f"Loading {vTargetActorBridge}...")
    vCon.register('v_stage_bridge', dfBridgeFinal)
    vCon.sql(f"CREATE OR REPLACE TABLE {vTargetActorBridge} AS SELECT * FROM v_stage_bridge")

    print("Actor processing success.")

else:
    print("Film Dimension empty or missing. Skipping.")

## 3. Explode Genres & Build Genre Dim

**Logic:**
1. Reuse `silver.film_release_dim` (already loaded into memory by the previous section).
2. Split `genres` (comma separated) into individual rows.
3. Extract unique Genre Names to build `genre_dim`.
4. Generate `sk_genre`.
5. Build `film_genre_bridge`.

In [None]:
try:
    if not dfFilmDim.empty:
        # --- GENRE PROCESSING ---
        # 1. Prepare Exploded List (one row per film / genre pair)
        dfGenres = dfFilmDim[['sk_film_release', 'genres']].dropna(subset=['genres']).copy()

        # Split on the bare comma and strip afterwards, so "A,B" and "A ,  B" are
        # separated exactly like "A, B". Splitting on ', ' left "A,B" as one genre.
        dfGenres['genre_name'] = dfGenres['genres'].str.split(',')
        dfExplodedGenres = dfGenres.explode('genre_name')
        dfExplodedGenres['genre_name'] = dfExplodedGenres['genre_name'].str.strip()

        # Discard missing and blank names (e.g. from a trailing comma) once, here,
        # so that neither the dimension nor the bridge can pick them up.
        dfExplodedGenres = dfExplodedGenres.dropna(subset=['genre_name'])
        dfExplodedGenres = dfExplodedGenres[dfExplodedGenres['genre_name'] != '']

        # 2. Build Unique Genre List for Dimension
        dfUniqueGenres = dfExplodedGenres[['genre_name']].drop_duplicates().sort_values('genre_name')

        print(f"Found {len(dfUniqueGenres)} unique genres.")

        # 3. Generate SKs
        # The result is read below as having `sk_genre` and `genre_name` columns;
        # how keys are assigned is up to utils.db_utils (not visible here).
        dfGenreDim = f_add_surrogate_key(
            vCon=vCon,
            dfNewData=dfUniqueGenres,
            vTargetTableName=vTargetGenreDim,
            vBusinessKeyCol="genre_name",
            vSkColName="sk_genre"
        )

        # 4. Load Genre Dim (full replace of the target table)
        print(f"Loading {vTargetGenreDim}...")
        vCon.register('v_stage_genre', dfGenreDim)
        vCon.sql(f"CREATE OR REPLACE TABLE {vTargetGenreDim} AS SELECT * FROM v_stage_genre")

        # 5. Create Bridge Table
        print("Building Genre Bridge Table...")

        # Inner join: only film/genre pairs whose name exists in the dimension
        # receive a bridge row.
        dfGenreBridge = pd.merge(
            dfExplodedGenres[['sk_film_release', 'genre_name']],
            dfGenreDim[['sk_genre', 'genre_name']],
            on='genre_name',
            how='inner'
        )

        # The same genre listed twice on one film must yield a single bridge row.
        dfGenreBridgeFinal = dfGenreBridge[['sk_film_release', 'sk_genre']].drop_duplicates()
        print(f"Genre Bridge Rows: {len(dfGenreBridgeFinal)}")

        # --- DQ CHECKS (Bridge) ---
        # NOTE(review): the return value is not inspected here; presumably the
        # helper records or raises on its own. Confirm against utils.dq_utils.
        f_check_duplicate_rows(vCon, dfGenreBridgeFinal, cNotebookName, "Silver", vTargetGenreBridge)

        # 6. Load Bridge (full replace of the target table)
        print(f"Loading {vTargetGenreBridge}...")
        vCon.register('v_stage_genre_bridge', dfGenreBridgeFinal)
        vCon.sql(f"CREATE OR REPLACE TABLE {vTargetGenreBridge} AS SELECT * FROM v_stage_genre_bridge")

        print("Genre processing success.")
finally:
    # This is the last step of the notebook: release the MotherDuck connection
    # even when genre processing raises, so a failed run does not leave it open.
    vCon.close()