## Create .csv 'database' from movie title input

In [None]:
# Author Greg Campbell - see MIT License in repository
# Written with assistance from ChatGPT (GPT-5.2)
# Using OMDb API - see https://www.omdbapi.com/apikey.aspx for details and to get your free API key
# Note only 1000 requests per day for free key

In [1]:
# !pip install XlsxWriter
# !pip install requests
# !pip install streamlit

In [2]:
import requests
import csv
import time
import re

In [None]:
# ======================================
# CONFIGURATION
# ======================================

API_KEY = "YOUR_OMDB_API_KEY"       # TODO - Replace with your actual OMDb API key
INPUT_FILE = "Example_movies.txt"   # Replace with your list of movies (one title per line)
OUTPUT_FILE = "movie_database.csv"
REQUEST_DELAY = 0.1                 # seconds


In [17]:
# API Test
def test_api():
    url = f"http://www.omdbapi.com/?t=Inception&apikey={API_KEY}"
    response = requests.get(url)
    data = response.json()
    if data.get("Response") == "True":
        print("API Key is valid.")
    else:
        raise ValueError("API Key is invalid or there was an error. Update your API key and try again.")

In [18]:
# Testing API - make sure you set up your API key...
test_api()

ValueError: API Key is invalid or there was an error. Update your API key and try again.

In [4]:
# ======================================
# HELPERS
# ======================================

def clean_title(title):
    return re.sub(r'[^a-z0-9]', '', title.lower())


def parse_title_and_year(raw_title):
    match = re.match(r"^(.*?)(?:\((\d{4})\))?$", raw_title.strip())
    title = match.group(1).strip()
    year = match.group(2) if match.group(2) else None
    return title, year


def is_imdb_id(value):
    return re.fullmatch(r"tt\d{7,8}", value.strip()) is not None


def expand_actors(movie_dict):
    actors_string = movie_dict.get("Actors", "")
    actors_list = [a.strip() for a in actors_string.split(",")]

    for i in range(10):
        key = f"Actor{i+1}"
        if i < len(actors_list):
            movie_dict[key] = actors_list[i]
        else:
            movie_dict[key] = ""

    return movie_dict


def validate_movie(detail_data, raw_title):
    # Runtime >= 30
    runtime_str = detail_data.get("Runtime", "").replace(" min", "")
    try:
        runtime = int(runtime_str)
        if runtime < 30:
            print(f"Skipping (too short): {raw_title}")
            return False
    except:
        print(f"Skipping (invalid runtime): {raw_title}")
        return False

    # Must have valid plot
    plot = detail_data.get("Plot", "").strip()
    if not plot or plot == "N/A":
        print(f"Skipping (no plot): {raw_title}")
        return False

    return True


# ======================================
# ROBUST FETCH
# ======================================

def fetch_movie_data(raw_input):
    base_url = "http://www.omdbapi.com/"
    raw_input = raw_input.strip()

    # ----------------------------------
    # CASE 1️⃣: IMDb ID Provided
    # ----------------------------------
    if is_imdb_id(raw_input):
        detail_params = {
            "apikey": API_KEY,
            "i": raw_input,
            "plot": "full"
        }

        response = requests.get(base_url, params=detail_params)
        detail_data = response.json()

        if detail_data.get("Response") == "False":
            print(f"Invalid IMDb ID: {raw_input}")
            return None

        if not validate_movie(detail_data, raw_input):
            return None

        return expand_actors(detail_data)

    # ----------------------------------
    # CASE 2️⃣: Title (with optional year)
    # ----------------------------------
    title, year = parse_title_and_year(raw_input)

    search_params = {
        "apikey": API_KEY,
        "s": title,
        "type": "movie"
    }

    search_response = requests.get(base_url, params=search_params)
    search_data = search_response.json()

    if search_data.get("Response") == "False":
        print(f"No search results: {raw_input}")
        return None

    results = search_data.get("Search", [])
    cleaned_input = clean_title(title)
    best_match = None

    for result in results:
        candidate_title = result.get("Title", "")
        cleaned_candidate = clean_title(candidate_title)

        if cleaned_candidate == cleaned_input:
            if year:
                if result.get("Year") == year:
                    best_match = result
                    break
            else:
                best_match = result
                break

    if not best_match:
        print(f"Using best guess for: {raw_input}")
        best_match = results[0]

    imdb_id = best_match["imdbID"]

    # Fetch full details
    detail_params = {
        "apikey": API_KEY,
        "i": imdb_id,
        "plot": "full"
    }

    detail_response = requests.get(base_url, params=detail_params)
    detail_data = detail_response.json()

    if detail_data.get("Response") == "False":
        print(f"Failed full fetch: {raw_input}")
        return None

    if not validate_movie(detail_data, raw_input):
        return None

    return expand_actors(detail_data)

## Main - output will notify of any titles that failed to load
#### Recommendation - import problem titles as IMDB ID's (search them on IMDB)

In [5]:
# ======================================
# MAIN
# ======================================

def main():
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        movie_inputs = [line.strip() for line in f if line.strip()]

    movie_data_list = []

    for entry in movie_inputs:
        # print(f"Processing: {entry}")
        data = fetch_movie_data(entry)

        if data:
            movie_data_list.append(data)

        time.sleep(REQUEST_DELAY)

    if not movie_data_list:
        print("No valid movie data collected.")
        return

    # Collect all keys
    all_keys = set()
    for movie in movie_data_list:
        all_keys.update(movie.keys())

    all_keys = sorted(all_keys)

    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=all_keys)
        writer.writeheader()
        writer.writerows(movie_data_list)

    print(f"\n✅ Database saved to '{OUTPUT_FILE}'")


if __name__ == "__main__":
    main()


No search results: batman begins
No search results: The Dark Knight
No search results: the dark knight rises
No search results: casino royale
No search results: skyfall
No search results: quantum of solace
No search results: die another day
No search results: octopussy
No search results: moonraker
No search results: Dr. No
No search results: Tomorrow Never Dies
No search results: You Only Live Twice
No search results: Live and let die
No search results: for your eyes Only
No search results: goldeneye
No search results: on her majesty's secret service
No search results: from russia with love
No search results: the man with the golden gun
No search results: the world is not enough
No search results: the living daylights
No search results: goldfinger
No search results: diamonds are forever
No search results: Gamer
No search results: Bedazzled
No search results: Shrek
No search results: Shrek 2
No search results: The godfather
No search results: the godfather part ii
No search results: the

In [6]:
import pandas as pd

In [19]:
df = pd.read_csv(OUTPUT_FILE)
# sort by title
df = df.sort_values(by=["Title"])
df

Unnamed: 0,Actor1,Actor10,Actor2,Actor3,Actor4,Actor5,Actor6,Actor7,Actor8,Actor9,...,Response,Runtime,Title,Type,Website,Writer,Year,imdbID,imdbRating,imdbVotes
0,Jonah Hill,,Channing Tatum,Ice Cube,,,,,,,...,True,109 min,21 Jump Street,movie,,"Michael Bacall, Jonah Hill, Patrick Hasburgh",2012,tt1232829,7.2,633925
1,Joseph Gordon-Levitt,,Seth Rogen,Anna Kendrick,,,,,,,...,True,100 min,50/50,movie,,Will Reiser,2011,tt1306980,7.6,349615
2,Zooey Deschanel,,Joseph Gordon-Levitt,Geoffrey Arend,,,,,,,...,True,95 min,500 Days of Summer,movie,,"Scott Neustadter, Michael H. Weber",2009,tt1022603,7.6,608366
3,Justin Long,,Jonah Hill,Blake Lively,,,,,,,...,True,93 min,Accepted,movie,,"Mark Perez, Adam Cooper, Bill Collage",2006,tt0384793,6.4,143007
4,Evan Rachel Wood,,Jim Sturgess,Joe Anderson,,,,,,,...,True,133 min,Across the Universe,movie,,"Dick Clement, Ian La Frenais, Julie Taymor",2007,tt0445922,7.3,116796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,John C. Reilly,,Jack McBrayer,Jane Lynch,,,,,,,...,True,101 min,Wreck-It Ralph,movie,,"Rich Moore, Phil Johnston, Jim Reardon",2012,tt1772341,7.7,491256
177,Sean Connery,,Akiko Wakabayashi,Mie Hama,,,,,,,...,True,117 min,You Only Live Twice,movie,,"Harold Jack Bloom, Roald Dahl, Ian Fleming",1967,tt0062512,6.8,124311
178,Gene Wilder,,Madeline Kahn,Marty Feldman,,,,,,,...,True,106 min,Young Frankenstein,movie,,"Gene Wilder, Mel Brooks, Mary Shelley",1974,tt0072431,8.0,179473
179,Jessica Chastain,,Joel Edgerton,Chris Pratt,,,,,,,...,True,157 min,Zero Dark Thirty,movie,,Mark Boal,2012,tt1790885,7.4,337638
