# Ingest: UK Movie Releases

**Objective:** Ingest UK movie release dates from the JSON data embedded in IMDb's release-calendar page.

**Scope:** Extract data and load it into `MovieReleases.landing.uk_releases`. 
This notebook also registers the configuration for the downstream processor.

In [1]:
# 1. SETUP & IMPORTS
import duckdb
import pandas as pd
import requests
import os
import sys
import json
import re
from datetime import datetime
from dotenv import load_dotenv

# Load Utils
# The current working directory is added to sys.path so the project-local
# `utils` package can be resolved; the guard avoids duplicate entries on re-run.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())
try:
    from utils.db_utils import f_load_to_landing
except ImportError as exc:
    # Fail fast: the load cell calls f_load_to_landing, so carrying on here
    # would only defer the failure to a NameError after the network fetch.
    raise ImportError(
        "Could not import utils.db_utils.f_load_to_landing; make sure the "
        "notebook's working directory contains the `utils` package."
    ) from exc

# Load Env (Smart Load)
# NOTE: this path is machine-specific. When it does not exist, load_dotenv()
# is called without arguments and uses its default .env lookup instead.
vLocalEnvPath = r"C:\Users\garym\Documents\GitHub\MovieReleases\.env"
if os.path.exists(vLocalEnvPath):
    load_dotenv(dotenv_path=vLocalEnvPath)
else:
    load_dotenv()

# The token must come from the environment (or the .env file loaded above).
vMdToken = os.getenv("MOTHERDUCK_TOKEN")
if not vMdToken:
    raise RuntimeError("MOTHERDUCK_TOKEN missing")

# Connect
print("Connecting to MotherDuck...")
con = duckdb.connect(f"md:?motherduck_token={vMdToken}")
con.sql("CREATE DATABASE IF NOT EXISTS MovieReleases")

Connecting to MotherDuck...


## 2. Extract Logic (IMDb)
We use a regex to find the `__NEXT_DATA__` JSON blob embedded in the IMDb page source, then keep only releases dated today or later.

In [2]:
def _format_release_date(raw_date):
    """Turn an entry's `releaseDate` value into a date string, or None if unusable.

    A dict is expected to carry `year`/`month`/`day` and is rendered as
    `YYYY-MM-DD`; any other non-None value is passed through `str()`.
    """
    if raw_date is None:
        # str(None) would yield the truthy string "None" and poison date parsing later.
        return None
    if isinstance(raw_date, dict):
        try:
            return f"{int(raw_date['year'])}-{int(raw_date['month']):02d}-{int(raw_date['day']):02d}"
        except (KeyError, TypeError, ValueError):
            # Missing or non-numeric component: treat the entry as undated.
            return None
    return str(raw_date)


def fetch_uk_releases_imdb():
    """Fetch upcoming UK movie releases from the IMDb calendar page.

    Downloads the calendar HTML, extracts the `__NEXT_DATA__` JSON blob with a
    regex, and flattens `props.pageProps.groups[*].entries[*]` into rows.

    Returns:
        pandas.DataFrame with columns `uk_release_id`, `movie_title`,
        `release_date` (datetime.date) and `imdb_id_ref`, restricted to
        releases dated today or later. An empty DataFrame is returned when the
        blob is not found, no usable entries exist, or any error occurs (the
        error is printed, not raised).
    """
    url = "https://www.imdb.com/calendar/?region=GB&type=MOVIE"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/123.0.0.0 Safari/537.36", "Accept-Language": "en-GB"}

    try:
        # A timeout stops the notebook from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        # Surface HTTP errors (reported by the handler below) instead of
        # silently regex-searching an error page.
        response.raise_for_status()

        pattern = r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>'
        match = re.search(pattern, response.text, flags=re.S)
        if not match:
            return pd.DataFrame()

        raw_json = json.loads(match.group(1))
        # `or []` also covers an explicit JSON null for "groups"/"entries".
        groups = raw_json.get('props', {}).get('pageProps', {}).get('groups', []) or []

        all_movies = []
        for group in groups:
            for entry in group.get('entries', []) or []:
                # titleText may be either a {"text": ...} dict or a plain value.
                title_field = entry.get('titleText')
                title = title_field.get('text') if isinstance(title_field, dict) else title_field
                imdb_id = entry.get('id')
                date_str = _format_release_date(entry.get('releaseDate'))

                if title and date_str:
                    all_movies.append({
                        "uk_release_id": f"{imdb_id}_{date_str}",
                        "movie_title": title,
                        "release_date": date_str,
                        "imdb_id_ref": imdb_id
                    })

        df = pd.DataFrame(all_movies)
        if df.empty:
            return df

        # Clean: coerce unparseable dates to NaT and drop only those rows, so a
        # single malformed entry cannot discard the whole result set.
        parsed = pd.to_datetime(df['release_date'], errors='coerce')
        valid = parsed.notna()
        dropped = int((~valid).sum())
        if dropped:
            print(f"Warning: dropped {dropped} row(s) with unparseable release dates")

        # .copy() so the column assignment below does not write into a slice view.
        df = df.loc[valid].copy()
        df['release_date'] = parsed[valid].dt.date
        df = df[df['release_date'] >= datetime.now().date()]
        return df

    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()

## 3. Execution: Load to Landing
Ingest raw data into `MovieReleases.landing.uk_releases`. This overwrites the previous landing data.

In [None]:
# Fetch the releases and load them to landing. The try/finally guarantees the
# MotherDuck connection is closed even when the fetch or the load raises.
try:
    dfReleases = fetch_uk_releases_imdb()

    if not dfReleases.empty:
        f_load_to_landing(con, dfReleases, "MovieReleases.landing", "uk_releases")
        print("Landing load complete.")
    else:
        # fetch_uk_releases_imdb returns an empty frame both when nothing
        # matched and when it hit an error (which it prints itself).
        print("No data found.")
finally:
    con.close()

Loading 77 rows to Landing: MovieReleases.landing.uk_releases
Landing load complete.
