In [2]:
import pandas as pd
import networkx as nx
def _split_pipe_list(cell):
    """Turn a '|'-delimited string into a list of stripped, non-empty tokens."""
    return [token.strip() for token in cell.split('|') if token.strip()]


# Load the movie table from the CSV expected in the working directory.
df = pd.read_csv('IMDB-Movie-Data.csv')
# Missing revenue becomes 0, so any mean taken over this column afterwards
# counts those movies as having earned nothing.
df['Revenue (Millions)'] = df['Revenue (Millions)'].fillna(0)
# Actors and Genre are read as '|'-joined strings; convert every cell to a list
# of names (a missing cell becomes an empty list).
for list_column in ('Actors', 'Genre'):
    df[list_column] = df[list_column].fillna('').apply(_split_pipe_list)


In [4]:
# Question 1: Top-3 movies with the highest ratings in 2016
def top_3_movies_2016(df):
    """Return the three best-rated movies released in 2016.

    Rows of `df` whose 'Year' equals 2016 are ordered by 'Rating' from
    highest to lowest and the first three are kept.

    Returns:
        DataFrame holding the 'Title' and 'Rating' columns of those rows
        (fewer than three rows when fewer 2016 movies exist).
    """
    released_in_2016 = df['Year'] == 2016
    ranked = df.loc[released_in_2016].sort_values(by='Rating', ascending=False)
    return ranked[['Title', 'Rating']].head(3)
top_3_movies_2016(df)

Unnamed: 0,Title,Rating
2,Dangal,8.8
4,Kimi no na wa,8.6
15,Koe no katachi,8.4


In [5]:
# Question 2: The actor generating the highest average revenue
def actor_highest_avg_revenue(df):
    """Find the actor whose movies have the highest mean revenue.

    Every movie row is repeated once per entry of its 'Actors' list, then
    'Revenue (Millions)' is averaged per actor.

    Returns:
        Tuple `(actor, mean_revenue)` for the actor with the largest mean.
    """
    mean_by_actor = (
        df.explode('Actors')                      # one (movie, actor) pair per row
        .groupby('Actors')['Revenue (Millions)']
        .mean()
    )
    return mean_by_actor.idxmax(), mean_by_actor.max()
actor_highest_avg_revenue(df)

('Daisy Ridley', 936.63)

In [11]:
# Question 3: The average rating of Emma Watson's movies
def average_rating_emma_watson(df, actor='Emma Watson'):
    """Average 'Rating' of the movies whose cast list contains `actor`.

    Args:
        df: DataFrame with an 'Actors' column holding a list of names per
            row and a numeric 'Rating' column.
        actor: Name to look for in each cast list. Defaults to
            'Emma Watson', which reproduces the original behaviour.

    Returns:
        The mean rating formatted as a string with four decimal places
        ('nan' when no row lists `actor`).
    """
    # Membership is tested against each row's list, so only exact names match.
    # The cast to bool keeps the mask boolean even when the column is empty,
    # where apply() may hand back a non-boolean Series.
    in_cast = df['Actors'].apply(lambda cast: actor in cast).astype(bool)
    avg_rating = df.loc[in_cast, 'Rating'].mean()
    return f'{avg_rating:.4f}'
average_rating_emma_watson(df)

'7.1750'

In [12]:
# Question 4: Top-3 directors who collaborate with the most actors
def top_3_directors_most_actors(df):
    """Rank directors by how many different actors appear in their movies.

    Returns:
        Series indexed by 'Director' whose values are the number of
        distinct names across that director's 'Actors' lists, limited to
        the three largest counts in descending order.
    """
    distinct_cast_sizes = (
        df.explode('Actors')              # one (movie, actor) pair per row
        .groupby('Director')['Actors']
        .nunique()                        # distinct actors per director
    )
    return distinct_cast_sizes.sort_values(ascending=False).head(3)
top_3_directors_most_actors(df)

Director
Ridley Scott          28
M. Night Shyamalan    24
Paul W.S. Anderson    20
Name: Actors, dtype: int64

In [13]:
# Question 5: Top-2 actors playing in the most genres of movies
def top_2_actors_most_genres(df):
    """Return the two actors who appear in the widest variety of genres.

    Both list columns are expanded so each row is one (actor, genre) pair;
    pairs with an empty-string actor or genre are discarded before counting.

    Returns:
        Series indexed by 'Actors' giving the number of distinct 'Genre'
        values per actor, limited to the two largest counts in descending
        order.
    """
    actor_genre_pairs = (
        df.explode('Actors')
        .explode('Genre')
        .loc[lambda frame: frame['Actors'] != '']
        .loc[lambda frame: frame['Genre'] != '']
    )
    genres_per_actor = actor_genre_pairs.groupby('Actors')['Genre'].nunique()
    return genres_per_actor.sort_values(ascending=False).head(2)
top_2_actors_most_genres(df)

Actors
Brad Pitt       14
Hugh Jackman    13
Name: Genre, dtype: int64

In [20]:
# Question 6: Actors with the maximum gap of years
def actors_with_max_gap(df):
    """List the actors with the longest span between their movies' years.

    For each actor the span is the latest 'Year' minus the earliest 'Year'
    among the rows listing them. A one-line summary of how many actors share
    the longest span is printed.

    Returns:
        Alphabetically sorted list of every actor whose span equals the
        maximum.
    """
    # One row per (actor, year) appearance, dropping empty-string names.
    appearances = df.explode('Actors')[['Actors', 'Year']]
    appearances = appearances[appearances['Actors'] != '']

    years_by_actor = appearances.groupby('Actors')['Year']
    span = years_by_actor.max() - years_by_actor.min()

    widest = span.max()
    # Keep everyone tied at the maximum, not just the first hit.
    tied_actors = span.index[(span == widest).to_numpy()].tolist()
    print(f'there are {len(tied_actors)} actors with {widest} gap of years')
    return sorted(tied_actors)
actors_with_max_gap(df)

there are 53 actors with 10 gap of years


['Abbie Cornish',
 'Anne Hathaway',
 'Audrey Tautou',
 'Ben Kingsley',
 'Ben Whishaw',
 'Bob Balaban',
 'Brad Pitt',
 'Bryce Dallas Howard',
 'Chiwetel Ejiofor',
 'Christian Bale',
 'Christopher Plummer',
 'Denzel Washington',
 'Dominic West',
 'Dustin Hoffman',
 'Edward Norton',
 'Ellen Burstyn',
 'Emily Blunt',
 'Eva Green',
 'Gerard Butler',
 'Hugh Jackman',
 'Jack Davenport',
 'Jennifer Aniston',
 'Jennifer Connelly',
 'Jeremy Irons',
 'Jessica Biel',
 'Johnny Depp',
 'Judi Dench',
 'Justin Theroux',
 'Kang-ho Song',
 'Kate Bosworth',
 'Kevin Spacey',
 'Kirsten Dunst',
 'Luke Wilson',
 'Marion Cotillard',
 'Mark Wahlberg',
 'Matt Damon',
 'Maya Rudolph',
 'Meryl Streep',
 'Michelle Monaghan',
 'Morgan Freeman',
 'Owen Wilson',
 'Paula Patton',
 'Rachel Weisz',
 'Russell Crowe',
 'Sacha Baron Cohen',
 'Samuel L. Jackson',
 'Scarlett Johansson',
 'Steve Carell',
 'Tom Cruise',
 'Tom Hanks',
 'Toni Collette',
 'Will Ferrell',
 'Will Smith']

In [16]:
# Question 7: Find all actors who collaborate with Johnny Depp directly and indirectly
def actors_collaborated_with_johnny_depp(df):
    """List every actor linked to Johnny Depp through shared casts.

    An undirected graph is built with one edge per pair of actors listed on
    the same row of `df['Actors']`. Anyone in the same connected component
    as 'Johnny Depp' counts as a direct or indirect collaborator. A one-line
    summary with the number of collaborators is printed.

    Returns:
        Alphabetically sorted list of collaborators, excluding Johnny Depp
        himself. An empty list when 'Johnny Depp' is not a node of the
        graph; rows with fewer than two names add no nodes at all.
    """
    costar_graph = nx.Graph()

    for cast in df['Actors']:
        names = [name.strip() for name in cast if name.strip()]
        # Link each name with every name that follows it in the same cast.
        for position, first in enumerate(names):
            for second in names[position + 1:]:
                costar_graph.add_edge(first, second)

    if 'Johnny Depp' not in costar_graph:
        return []

    reachable = nx.node_connected_component(costar_graph, 'Johnny Depp')
    collaborators = reachable - {'Johnny Depp'}  # the component includes Depp himself
    print(f'there are {len(collaborators)} actors collaborated with johnny depp')
    return sorted(collaborators)
actors_collaborated_with_johnny_depp(df)

there are 1574 actors collaborated with johnny depp


['50 Cent',
 '?lafur Darri ?lafsson',
 '?scar Jaenada',
 'AJ Michalka',
 'Aaron Burns',
 'Aaron Eckhart',
 'Aaron Paul',
 'Aaron Taylor-Johnson',
 'Aaron Yoo',
 'Aasif Mandvi',
 'Abbie Cornish',
 'Abigail Breslin',
 'Adam Brody',
 'Adam Devine',
 'Adam Driver',
 'Adam Levine',
 'Adam Pally',
 'Adam Rodriguez',
 'Adam Sandler',
 'Addison Timlin',
 'Adelaide Kane',
 'Adele Exarchopoulos',
 'Adewale Akinnuoye-Agbaje',
 'Adil Hussain',
 'Adria Arjona',
 'Adrian Grenier',
 'Adrian Martinez',
 'Adrianne Palicki',
 'Adrien Brody',
 'Aidan Gillen',
 'Aidan Quinn',
 'Aidan Turner',
 'Aiden Longworth',
 'Akiva Schaffer',
 'Alain Moussi',
 'Alan Alda',
 'Alan Arkin',
 'Alan Rickman',
 'Alan Tudyk',
 'Albert Brooks',
 'Albert Finney',
 'Alden Ehrenreich',
 'Aldis Hodge',
 'Alec Baldwin',
 'Alessandro Nivola',
 'Alex Essoe',
 'Alex Fisher',
 'Alex Pettyfer',
 'Alex R. Hibbert',
 'Alexander Black',
 'Alexander Ludwig',
 'Alexander Skarsgard',
 'Alexandra Daddario',
 'Alexandra Maria Lara',
 'Alexis 