<h3>Amendments Log</h3>
<table style="width:100%">
  <thead>
    <tr>
      <th style="text-align:left">Version</th>
      <th style="text-align:left">Amended By</th>
      <th style="text-align:left">Date</th>
      <th style="text-align:left">Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>1.0</td>
      <td>Gary Manley</td>
      <td>2025-11-30</td>
      <td>Initial Version</td>
    </tr>
  </tbody>
</table>

# Ingest: UK Movie Releases

**Objective:** Ingest UK movie release dates from the `__NEXT_DATA__` JSON embedded in the IMDb release calendar page.

**Scope:** Extract data and load it into `MovieReleases.landing.uk_releases`.

In [None]:
# 1. SETUP & IMPORTS
import duckdb
import pandas as pd
import requests
import os
import sys
import json
import re
from datetime import datetime
from dotenv import load_dotenv

# Load Utils
# Put the current working directory on sys.path so the local `utils` package
# resolves. Guarded so that re-running this cell does not keep appending
# duplicate entries.
vWorkingDir = os.getcwd()
if vWorkingDir not in sys.path:
    sys.path.append(vWorkingDir)
try:
    from utils.db_utils import f_load_to_landing
except ImportError:
    # Fail fast: without this helper the load cell would only fail later with
    # a NameError that hides the real cause (the missing/broken utils import).
    print("Error: Could not import utils")
    raise

# Load Env (Smart Load)
# NOTE(review): machine-specific absolute path. When it does not exist we fall
# back to python-dotenv's default lookup, so other machines still work.
vLocalEnvPath = r"C:\Users\garym\Documents\GitHub\MovieReleases\.env"
if os.path.exists(vLocalEnvPath):
    load_dotenv(dotenv_path=vLocalEnvPath)
else:
    load_dotenv()

# The MotherDuck token is mandatory; stop here rather than attempting an
# unauthenticated connection.
vMdToken = os.getenv("MOTHERDUCK_TOKEN")
if not vMdToken:
    raise RuntimeError("MOTHERDUCK_TOKEN missing")

# Connect
print("Connecting to MotherDuck...")
vCon = duckdb.connect(f"md:?motherduck_token={vMdToken}")
vCon.sql("CREATE DATABASE IF NOT EXISTS MovieReleases")

In [None]:
# PARAMETERS / CONSTANTS
# Presumably this notebook's own file name (e.g. for logging/lineage) — it is
# not referenced by any other cell visible here; confirm before removing.
cNotebookName = "ingest_releases.ipynb"

## 2. Extract Logic (IMDb)
We use regex to find the `__NEXT_DATA__` JSON blob embedded in the IMDb page source.

In [None]:
def _parse_imdb_entry(vEntry):
    """Convert one IMDb calendar entry into a landing row dict, or None if unusable."""
    if not isinstance(vEntry, dict):
        return None

    # Title may arrive either as {"text": "..."} or as a plain string.
    vTitleNode = vEntry.get('titleText')
    if isinstance(vTitleNode, dict):
        vTitle = vTitleNode.get('text')
    elif isinstance(vTitleNode, str):
        vTitle = vTitleNode
    else:
        vTitle = None

    vRawDate = vEntry.get('releaseDate')
    # A missing date must be skipped here: str(None) would yield the truthy
    # string "None" and poison the date conversion for the whole batch.
    if not vTitle or vRawDate is None:
        return None

    if isinstance(vRawDate, dict):
        try:
            vDateStr = f"{vRawDate.get('year')}-{vRawDate.get('month'):02d}-{vRawDate.get('day'):02d}"
        except (TypeError, ValueError):
            # Month/day missing or not an integer: the date cannot be built.
            return None
    else:
        vDateStr = str(vRawDate)

    # Parse per entry so one malformed date only drops that entry.
    vParsed = pd.to_datetime(vDateStr, errors="coerce")
    if pd.isna(vParsed):
        return None

    vImdbId = vEntry.get('id')
    return {
        "uk_release_id": f"{vImdbId}_{vDateStr}",
        "movie_title": vTitle,
        "release_date": vParsed.date(),
        "imdb_id_ref": vImdbId
    }


def fetch_uk_releases_imdb(vTimeoutSecs=30):
    """Scrape upcoming UK movie releases from the IMDb release calendar.

    Downloads the calendar page, extracts the ``__NEXT_DATA__`` JSON blob with
    a regex, and flattens ``props.pageProps.groups[].entries[]`` into rows.

    Args:
        vTimeoutSecs: Timeout in seconds passed to ``requests.get`` so a
            stalled connection cannot hang the notebook. Defaults to 30.

    Returns:
        pandas.DataFrame with columns ``uk_release_id``, ``movie_title``,
        ``release_date`` (``datetime.date``) and ``imdb_id_ref``, restricted to
        releases dated today or later. An empty DataFrame is returned when the
        JSON blob is not found, no usable entries exist, or any error occurs
        (the error is printed, not raised).
    """
    vUrl = "https://www.imdb.com/calendar/?region=GB&type=MOVIE"
    vHeaders = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/123.0.0.0 Safari/537.36", "Accept-Language": "en-GB"}

    try:
        vResponse = requests.get(vUrl, headers=vHeaders, timeout=vTimeoutSecs)
        # Surface HTTP errors (403/5xx) instead of silently reporting "no data".
        vResponse.raise_for_status()

        vPattern = r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>'
        vMatch = re.search(vPattern, vResponse.text, flags=re.S)
        if not vMatch:
            return pd.DataFrame()

        vRawJson = json.loads(vMatch.group(1))
        # `or` guards cover keys that are present but null in the payload.
        vPageProps = (vRawJson.get('props') or {}).get('pageProps') or {}
        vGroups = vPageProps.get('groups') or []

        vAllMovies = []
        for vGroup in vGroups:
            if not isinstance(vGroup, dict):
                continue
            for vEntry in vGroup.get('entries') or []:
                vRow = _parse_imdb_entry(vEntry)
                if vRow is not None:
                    vAllMovies.append(vRow)

        dfMovies = pd.DataFrame(vAllMovies)
        if dfMovies.empty:
            return dfMovies

        # Keep only releases that are today or in the future.
        dfMovies = dfMovies[dfMovies['release_date'] >= datetime.now().date()]
        return dfMovies

    except Exception as e:
        # Best-effort scrape: report the problem and let the caller see "no data".
        print(f"Error: {e}")
        return pd.DataFrame()

## 3. Execution: Load to Landing
Ingest raw data into `MovieReleases.landing.uk_releases`. This overwrites the previous landing data.

In [None]:
# Fetch the releases and push them to the landing table. The connection is
# closed in `finally` so it is released even when the fetch or load raises.
try:
    dfReleases = fetch_uk_releases_imdb()

    if not dfReleases.empty:
        f_load_to_landing(vCon, dfReleases, "MovieReleases.landing", "uk_releases")
        print("Landing load complete.")
    else:
        print("No data found.")
finally:
    vCon.close()