# IMDB Agent Prompt Testing Notebook

Use this notebook to test and refine prompts for the agent system.

In [12]:
import pandas as pd

df=pd.read_csv("../data/imdb_top_1000.csv")

In [16]:

def clean_gross(gross_str):
    """Convert gross string like '28,341,469' to float"""
    if pd.isna(gross_str) or gross_str == '':
        return None
    try:
        # Remove commas and convert to float
        return float(str(gross_str).replace(',', ''))
    except:
        return None


def clean_runtime(runtime_str):
    """Convert runtime string like '142 min' to integer"""
    if pd.isna(runtime_str) or runtime_str == '':
        return None
    try:
        # Extract number from string like '142 min'
        return int(str(runtime_str).replace(' min', '').strip())
    except:
        return None


def clean_year(year_str):
    """Convert year string to integer"""
    if pd.isna(year_str) or year_str == '':
        return None
    try:
        # Handle cases like '(2019)' or '2019'
        year_clean = str(year_str).replace('(', '').replace(')', '').strip()
        return int(year_clean)
    except:
        return None


def clean_meta_score(score):
    """Convert meta score to float"""
    if pd.isna(score) or score == '':
        return None
    try:
        return float(score)
    except:
        return None


def clean_votes(votes_str):
    """Convert votes string to integer"""
    if pd.isna(votes_str) or votes_str == '':
        return None
    try:
        # Remove commas
        return int(str(votes_str).replace(',', ''))
    except:
        return None


def build_metadata_dict(row):
    """
    Build metadata dictionary excluding None values.
    ChromaDB doesn't accept None values, so we only include fields with actual data.
    """
    metadata = {}

    # Always include these core fields
    metadata['Series_Title'] = str(row['Series_Title'])
    metadata['Genre'] = str(row['Genre'])
    metadata['Director'] = str(row['Director'])
    metadata['Overview'] = str(row['Overview'])

    # Only include optional fields if they have values
    if pd.notna(row['Released_Year']):
        metadata['Released_Year'] = int(row['Released_Year'])

    if pd.notna(row['IMDB_Rating']):
        metadata['IMDB_Rating'] = float(row['IMDB_Rating'])

    if pd.notna(row['Meta_score']):
        metadata['Meta_score'] = float(row['Meta_score'])

    if pd.notna(row['Runtime']):
        metadata['Runtime'] = int(row['Runtime'])

    if pd.notna(row['Gross']):
        metadata['Gross'] = float(row['Gross'])

    if pd.notna(row['Star1']):
        metadata['Star1'] = str(row['Star1'])

    if pd.notna(row['Star2']):
        metadata['Star2'] = str(row['Star2'])

    if pd.notna(row['Star3']):
        metadata['Star3'] = str(row['Star3'])

    if pd.notna(row['Star4']):
        metadata['Star4'] = str(row['Star4'])

    if pd.notna(row['No_of_Votes']):
        metadata['No_of_Votes'] = int(row['No_of_Votes'])

    if pd.notna(row['Certificate']):
        metadata['Certificate'] = str(row['Certificate'])

    return metadata


def load_and_clean_data(CSV_PATH):
    """Load CSV and clean data"""
    print(f"Loading data from {CSV_PATH}...")


    # Load CSV
    df = pd.read_csv(CSV_PATH)
    print(f"Loaded {len(df)} movies")

    # Clean data
    print("Cleaning data...")

    # Strip whitespace from string columns
    string_cols = df.select_dtypes(include=['object']).columns
    for col in string_cols:
        df[col] = df[col].str.strip() if df[col].dtype == 'object' else df[col]

    # Convert Gross to float
    df['Gross'] = df['Gross'].apply(clean_gross)

    # Convert Runtime to int
    df['Runtime'] = df['Runtime'].apply(clean_runtime)

    # Convert Released_Year to int
    df['Released_Year'] = df['Released_Year'].apply(clean_year)

    # Convert Meta_score to float
    df['Meta_score'] = df['Meta_score'].apply(clean_meta_score)

    # Convert No_of_Votes to int
    df['No_of_Votes'] = df['No_of_Votes'].apply(clean_votes)

    # Fill missing overviews with empty string
    df['Overview'] = df['Overview'].fillna('')

    # Convert IMDB_Rating to float (should already be, but ensure it)
    df['IMDB_Rating'] = pd.to_numeric(df['IMDB_Rating'], errors='coerce')

    print("Data cleaning complete!")
    print(f"Data types:\n{df.dtypes}")

    return df

In [17]:
cleaned_df=load_and_clean_data("../data/imdb_top_1000.csv")

Loading data from ../data/imdb_top_1000.csv...
Loaded 1000 movies
Cleaning data...
Data cleaning complete!
Data types:
Poster_Link       object
Series_Title      object
Released_Year    float64
Certificate       object
Runtime            int64
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross            float64
dtype: object


In [18]:
df=cleaned_df.copy()

In [19]:
# Question 1: When did The Matrix release?
result = df[df['Series_Title'].str.contains('Matrix', case=False, na=False)]
if not result.empty:
    year = result.iloc[0]['Released_Year']
    title = result.iloc[0]['Series_Title']
    print(f"Answer: {title} was released in {year}")
    result[['Series_Title', 'Released_Year']]
else:
    print("Movie not found")

Answer: The Matrix was released in 1999.0


## Question 2: Top 5 movies of 2019 by meta score

In [20]:
# Question 2: Top 5 movies of 2019 by meta score
result = df[
    (df['Released_Year'] == 2019) & 
    (df['Meta_score'].notna())
].nlargest(5, 'Meta_score')[
    ['Series_Title', 'Released_Year', 'Meta_score', 'IMDB_Rating', 'Director']
].reset_index(drop=True)

print("Top 5 movies of 2019 by Meta Score:")
result

Top 5 movies of 2019 by Meta Score:


Unnamed: 0,Series_Title,Released_Year,Meta_score,IMDB_Rating,Director
0,Gisaengchung,2019.0,96.0,8.6,Bong Joon Ho
1,Portrait de la jeune fille en feu,2019.0,95.0,8.1,Céline Sciamma
2,Marriage Story,2019.0,94.0,7.9,Noah Baumbach
3,The Irishman,2019.0,94.0,7.9,Martin Scorsese
4,Little Women,2019.0,91.0,7.8,Greta Gerwig


## Question 3: Top 7 comedy movies between 2010-2020 by IMDB rating

In [21]:
# Question 3: Top 7 comedy movies between 2010-2020 by IMDB rating
result = df[
    (df['Released_Year'] >= 2010) & 
    (df['Released_Year'] <= 2020) &
    (df['Genre'].str.contains('Comedy', case=False, na=False)) &
    (df['IMDB_Rating'].notna())
].nlargest(7, 'IMDB_Rating')[
    ['Series_Title', 'Released_Year', 'Genre', 'IMDB_Rating', 'Meta_score', 'Director']
].reset_index(drop=True)

print("Top 7 comedy movies (2010-2020) by IMDB Rating:")
result

Top 7 comedy movies (2010-2020) by IMDB Rating:


Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,Meta_score,Director
0,Gisaengchung,2019.0,"Comedy, Drama, Thriller",8.6,96.0,Bong Joon Ho
1,The Intouchables,2011.0,"Biography, Comedy, Drama",8.5,57.0,Olivier Nakache
2,Chhichhore,2019.0,"Comedy, Drama",8.2,,Nitesh Tiwari
3,Green Book,2018.0,"Biography, Comedy, Drama",8.2,69.0,Peter Farrelly
4,"Three Billboards Outside Ebbing, Missouri",2017.0,"Comedy, Crime, Drama",8.2,88.0,Martin McDonagh
5,Klaus,2019.0,"Animation, Adventure, Comedy",8.2,65.0,Sergio Pablos
6,Queen,2013.0,"Adventure, Comedy, Drama",8.2,,Vikas Bahl


## Question 4: Top horror movies with meta score > 85 and IMDB rating > 8

In [22]:
# Question 4: Top horror movies with meta score > 85 and IMDB rating > 8
result = df[
    (df['Genre'].str.contains('Horror', case=False, na=False)) &
    (df['Meta_score'] > 85) &
    (df['IMDB_Rating'] > 8)
].sort_values('IMDB_Rating', ascending=False)[
    ['Series_Title', 'Released_Year', 'Genre', 'IMDB_Rating', 'Meta_score', 'Director']
].reset_index(drop=True)

print(f"Found {len(result)} horror movies with Meta Score > 85 and IMDB Rating > 8:")
result

Found 2 horror movies with Meta Score > 85 and IMDB Rating > 8:


Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,Meta_score,Director
0,Psycho,1960.0,"Horror, Mystery, Thriller",8.5,97.0,Alfred Hitchcock
1,Alien,1979.0,"Horror, Sci-Fi",8.4,89.0,Ridley Scott


## Question 5: Top directors with movies grossing over $500M at least twice

In [23]:
# Question 5: Top directors with movies grossing over $500M at least twice
# Filter movies with gross > 500M
high_grossing = df[
    (df['Gross'].notna()) & 
    (df['Gross'] > 500_000_000)
].copy()

# Group by director and count
director_counts = high_grossing.groupby('Director').agg({
    'Series_Title': 'count',
    'Gross': 'max',
    'IMDB_Rating': 'max'
}).rename(columns={'Series_Title': 'count_500m', 'Gross': 'highest_gross', 'IMDB_Rating': 'max_rating'})

# Filter directors with at least 2 movies > 500M
directors_with_2plus = director_counts[director_counts['count_500m'] >= 2].sort_values('highest_gross', ascending=False)

# Get the top movie for each director
result_list = []
for director in directors_with_2plus.index:
    director_movies = high_grossing[high_grossing['Director'] == director].sort_values('Gross', ascending=False)
    top_movie = director_movies.iloc[0]
    result_list.append({
        'Director': director,
        'Movies_Over_500M': int(director_counts.loc[director, 'count_500m']),
        'Highest_Grossing_Movie': top_movie['Series_Title'],
        'Highest_Gross': top_movie['Gross'],
        'Year': top_movie['Released_Year']
    })

result = pd.DataFrame(result_list)
print(f"Directors with at least 2 movies grossing over $500M:")
result

Directors with at least 2 movies grossing over $500M:


Unnamed: 0,Director,Movies_Over_500M,Highest_Grossing_Movie,Highest_Gross,Year
0,Anthony Russo,2,Avengers: Endgame,858373000.0,2019.0
1,James Cameron,2,Avatar,760507625.0,2009.0


## Question 6: Top 10 movies with over 1M votes but lower gross earnings

In [24]:
# Question 6: Top 10 movies with over 1M votes but lower gross earnings
# Filter movies with > 1M votes
high_votes = df[
    (df['No_of_Votes'].notna()) & 
    (df['No_of_Votes'] > 1_000_000) &
    (df['Gross'].notna())
].copy()

# Sort by gross (ascending = lower gross first), then by votes (descending)
result = high_votes.sort_values(['Gross', 'No_of_Votes'], ascending=[True, False]).head(10)[
    ['Series_Title', 'Released_Year', 'No_of_Votes', 'Gross', 'IMDB_Rating', 'Director']
].reset_index(drop=True)

print("Top 10 movies with >1M votes but lower gross earnings:")
result

Top 10 movies with >1M votes but lower gross earnings:


Unnamed: 0,Series_Title,Released_Year,No_of_Votes,Gross,IMDB_Rating,Director
0,American History X,1998.0,1034705,6719864.0,8.5,Tony Kaye
1,Léon,1994.0,1035236,19501238.0,8.5,Luc Besson
2,Memento,2000.0,1125712,25544867.0,8.4,Christopher Nolan
3,The Shawshank Redemption,1994.0,2343110,28341469.0,9.3,Frank Darabont
4,Fight Club,1999.0,1854740,37030102.0,8.8,David Fincher
5,Goodfellas,1990.0,1020727,46836394.0,8.7,Martin Scorsese
6,The Prestige,2006.0,1190259,53089891.0,8.5,Christopher Nolan
7,The Godfather: Part II,1974.0,1129952,57300000.0,9.0,Francis Ford Coppola
8,Kill Bill: Vol. 1,2003.0,1000639,70099045.0,8.1,Quentin Tarantino
9,V for Vendetta,2005.0,1032749,70511035.0,8.2,James McTeigue


## Question 7: Comedy movies with death or dead people in the plot (using Overview column)

In [25]:
# Question 7: Comedy movies with death or dead people in the plot
# Filter comedy movies
comedy_movies = df[
    (df['Genre'].str.contains('Comedy', case=False, na=False)) &
    (df['Overview'].notna())
].copy()

# Search for death-related keywords in Overview
death_keywords = ['death', 'dead', 'dying', 'deceased', 'dies', 'died', 'killed', 'murder', 'suicide']
death_pattern = '|'.join(death_keywords)

result = comedy_movies[
    comedy_movies['Overview'].str.contains(death_pattern, case=False, na=False)
][['Series_Title', 'Released_Year', 'Genre', 'Overview', 'IMDB_Rating', 'Director']].reset_index(drop=True)

print(f"Found {len(result)} comedy movies with death themes:")
result

Found 23 comedy movies with death themes:


Unnamed: 0,Series_Title,Released_Year,Genre,Overview,IMDB_Rating,Director
0,"Three Billboards Outside Ebbing, Missouri",2017.0,"Comedy, Crime, Drama",A mother personally challenges the local autho...,8.2,Martin McDonagh
1,Gangs of Wasseypur,2012.0,"Action, Comedy, Crime",A clash between Sultan and Shahid Khan leads t...,8.2,Anurag Kashyap
2,Sholay,1975.0,"Action, Adventure, Comedy",After his family is murdered by a notorious an...,8.2,Ramesh Sippy
3,The Big Lebowski,1998.0,"Comedy, Crime, Sport","Jeff ""The Dude"" Lebowski, mistaken for a milli...",8.1,Joel Coen
4,Underground,1995.0,"Comedy, Drama, War",A group of Serbian socialists prepares for the...,8.1,Emir Kusturica
5,Secrets & Lies,1996.0,"Comedy, Drama","Following the death of her adoptive parents, a...",8.0,Mike Leigh
6,Young Frankenstein,1974.0,Comedy,An American grandson of the infamous scientist...,8.0,Mel Brooks
7,Stalag 17,1953.0,"Comedy, Drama, War",When two escaping American World War II prison...,8.0,Billy Wilder
8,Kind Hearts and Coronets,1949.0,"Comedy, Crime",A distant poor relative of the Duke D'Ascoyne ...,8.0,Robert Hamer
9,Arsenic and Old Lace,1942.0,"Comedy, Crime, Thriller",A writer of books on the futility of marriage ...,8.0,Frank Capra


## Question 8: Summarize movie plots of Steven Spielberg's top-rated sci-fi movies

In [26]:
# Question 8: Summarize movie plots of Steven Spielberg's top-rated sci-fi movies
# Filter Spielberg sci-fi movies
spielberg_scifi = df[
    (df['Director'] == 'Steven Spielberg') &
    (df['Genre'].str.contains('Sci-Fi', case=False, na=False)) &
    (df['IMDB_Rating'].notna())
].sort_values('IMDB_Rating', ascending=False)[
    ['Series_Title', 'Released_Year', 'IMDB_Rating', 'Meta_score', 'Overview']
].reset_index(drop=True)

print(f"Steven Spielberg's top-rated sci-fi movies ({len(spielberg_scifi)} found):")
print("\n" + "="*80)

for idx, row in spielberg_scifi.iterrows():
    print(f"\n{idx+1}. {row['Series_Title']} ({row['Released_Year']})")
    print(f"   IMDB Rating: {row['IMDB_Rating']}, Meta Score: {row['Meta_score']}")
    print(f"   Plot: {row['Overview']}")
    print("-"*80)

spielberg_scifi

Steven Spielberg's top-rated sci-fi movies (3 found):


1. Jurassic Park (1993.0)
   IMDB Rating: 8.1, Meta Score: 68.0
   Plot: A pragmatic paleontologist visiting an almost complete theme park is tasked with protecting a couple of kids after a power failure causes the park's cloned dinosaurs to run loose.
--------------------------------------------------------------------------------

2. E.T. the Extra-Terrestrial (1982.0)
   IMDB Rating: 7.8, Meta Score: 91.0
   Plot: A troubled child summons the courage to help a friendly alien escape Earth and return to his home world.
--------------------------------------------------------------------------------

3. Close Encounters of the Third Kind (1977.0)
   IMDB Rating: 7.6, Meta Score: 90.0
   Plot: Roy Neary, an electric lineman, watches how his quiet and ordinary daily life turns upside down after a close encounter with a UFO.
--------------------------------------------------------------------------------


Unnamed: 0,Series_Title,Released_Year,IMDB_Rating,Meta_score,Overview
0,Jurassic Park,1993.0,8.1,68.0,A pragmatic paleontologist visiting an almost ...
1,E.T. the Extra-Terrestrial,1982.0,7.8,91.0,A troubled child summons the courage to help a...
2,Close Encounters of the Third Kind,1977.0,7.6,90.0,"Roy Neary, an electric lineman, watches how hi..."


## Question 9: Movies before 1990 with police involvement in the plot

Note: This requires semantic search (similarity search), not just keyword matching. 
For pandas, we'll do a keyword-based search, but note that the actual implementation 
should use vector similarity search (ChromaDB) for better results.

In [27]:
# Question 9: Movies before 1990 with police involvement in the plot
# Filter movies before 1990
pre_1990 = df[
    (df['Released_Year'] < 1990) &
    (df['Released_Year'].notna()) &
    (df['Overview'].notna())
].copy()

# Search for police-related keywords (note: semantic search would be better)
police_keywords = ['police', 'cop', 'detective', 'officer', 'sheriff', 'law enforcement', 
                   'investigation', 'investigator', 'fbi', 'cia', 'interrogation', 'arrest']

police_pattern = '|'.join(police_keywords)

result = pre_1990[
    pre_1990['Overview'].str.contains(police_pattern, case=False, na=False)
][['Series_Title', 'Released_Year', 'Overview', 'IMDB_Rating', 'Genre', 'Director']].reset_index(drop=True)

print(f"Found {len(result)} movies before 1990 with police involvement (keyword-based search):")
print("Note: For better results, use semantic similarity search (ChromaDB) instead of keyword matching")
result

Found 44 movies before 1990 with police involvement (keyword-based search):
Note: For better results, use semantic similarity search (ChromaDB) instead of keyword matching


Unnamed: 0,Series_Title,Released_Year,Overview,IMDB_Rating,Genre,Director
0,Apocalypse Now,1979.0,A U.S. Army officer serving in Vietnam is task...,8.4,"Drama, Mystery, War",Francis Ford Coppola
1,Dr. Strangelove or: How I Learned to Stop Worr...,1964.0,An insane general triggers a path to nuclear h...,8.4,Comedy,Stanley Kubrick
2,Paths of Glory,1957.0,"After refusing to attack an enemy position, a ...",8.4,"Drama, War",Stanley Kubrick
3,Aliens,1986.0,Fifty-seven years after surviving an apocalypt...,8.3,"Action, Adventure, Sci-Fi",James Cameron
4,Lawrence of Arabia,1962.0,"The story of T.E. Lawrence, the English office...",8.3,"Adventure, Biography, Drama",David Lean
5,Vertigo,1958.0,A former police detective juggles wrestling wi...,8.3,"Mystery, Romance, Thriller",Alfred Hitchcock
6,Double Indemnity,1944.0,An insurance representative lets himself be ta...,8.3,"Crime, Drama, Film-Noir",Billy Wilder
7,M - Eine Stadt sucht einen Mörder,1931.0,When the police in a German city are unable to...,8.3,"Crime, Mystery, Thriller",Fritz Lang
8,Die Hard,1988.0,An NYPD officer tries to save his wife and sev...,8.2,"Action, Thriller",John McTiernan
9,Sholay,1975.0,After his family is murdered by a notorious an...,8.2,"Action, Adventure, Comedy",Ramesh Sippy


## Nice-to-Have Question 1: Al Pacino movies with >$50M gross and IMDB rating >= 8

This question requires checking if Al Pacino is in Star1, Star2, Star3, or Star4 columns.

## Nice-to-Have Question 2: Find similar movies based on Meta score and IMDB rating

This function finds movies with similar ratings to a given movie.

In [28]:
# Nice-to-Have 2: Find similar movies based on Meta score and IMDB rating
def find_similar_movies(movie_title, n_similar=5, rating_tolerance=0.5, meta_tolerance=5):
    """
    Find movies with similar Meta score and IMDB rating to a given movie.
    
    Parameters:
    - movie_title: Name of the movie to find similarities for
    - n_similar: Number of similar movies to return
    - rating_tolerance: Tolerance for IMDB rating difference (default 0.5)
    - meta_tolerance: Tolerance for Meta score difference (default 5)
    """
    # Find the target movie
    target = df[df['Series_Title'].str.contains(movie_title, case=False, na=False)]
    
    if target.empty:
        return f"Movie '{movie_title}' not found"
    
    target_movie = target.iloc[0]
    target_rating = target_movie['IMDB_Rating']
    target_meta = target_movie['Meta_score']
    
    # Filter movies with similar ratings
    similar = df[
        (df['Series_Title'] != target_movie['Series_Title']) &  # Exclude the movie itself
        (df['IMDB_Rating'].notna()) &
        (df['Meta_score'].notna()) &
        (abs(df['IMDB_Rating'] - target_rating) <= rating_tolerance) &
        (abs(df['Meta_score'] - target_meta) <= meta_tolerance)
    ].copy()
    
    # Calculate similarity score (lower is better)
    similar['Rating_Diff'] = abs(similar['IMDB_Rating'] - target_rating)
    similar['Meta_Diff'] = abs(similar['Meta_score'] - target_meta)
    similar['Similarity_Score'] = similar['Rating_Diff'] + (similar['Meta_Diff'] / 10)
    
    result = similar.nsmallest(n_similar, 'Similarity_Score')[
        ['Series_Title', 'Released_Year', 'IMDB_Rating', 'Meta_score', 
         'Rating_Diff', 'Meta_Diff', 'Genre', 'Director']
    ].reset_index(drop=True)
    
    print(f"Movies similar to '{target_movie['Series_Title']}' ({target_movie['Released_Year']}):")
    print(f"Target: IMDB {target_rating}, Meta {target_meta}")
    print(f"\nFound {len(result)} similar movies:\n")
    
    return result

# Example: Find movies similar to "The Dark Knight"
similar_to_dark_knight = find_similar_movies("The Dark Knight", n_similar=5)
similar_to_dark_knight

Movies similar to 'The Dark Knight' (2008.0):
Target: IMDB 9.0, Meta 84.0

Found 5 similar movies:



Unnamed: 0,Series_Title,Released_Year,IMDB_Rating,Meta_score,Rating_Diff,Meta_Diff,Genre,Director
0,Forrest Gump,1994.0,8.8,82.0,0.2,2.0,"Drama, Romance",Robert Zemeckis
1,One Flew Over the Cuckoo's Nest,1975.0,8.7,83.0,0.3,1.0,Drama,Milos Forman
2,The Silence of the Lambs,1991.0,8.6,85.0,0.4,1.0,"Crime, Drama, Thriller",Jonathan Demme
3,Seppuku,1962.0,8.6,85.0,0.4,1.0,"Action, Drama, Mystery",Masaki Kobayashi
4,Star Wars: Episode V - The Empire Strikes Back,1980.0,8.7,82.0,0.3,2.0,"Action, Adventure, Fantasy",Irvin Kershner
