In [1]:
# q1: Top-3 movies with the highest ratings in 2016
# Read the dataset into memory as a list of raw text lines.
with open('IMDB-Movie-Data.csv', mode='r') as file:
    data = list(file)
def top_3_movies_2016(data):
    """Print the three highest-rated movies released in 2016.

    Args:
        data: List of raw CSV lines; ``data[0]`` is the header and must
            contain the 'Year', 'Title' and 'Rating' columns.

    Returns:
        None. One line per movie is printed, best rating first; movies with
        equal ratings keep their order of appearance in ``data``.

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If a required column is missing from the header.
    """
    # Imported locally so the cell runs on its own. csv honours quoted fields
    # (e.g. a title containing a comma), which a plain split(',') cuts apart.
    import csv

    header = next(csv.reader([data[0].strip()]), [])
    year_index = header.index('Year')
    title_index = header.index('Title')
    rating_index = header.index('Rating')

    # Collect (title, rating) for every 2016 movie.
    movies_2016 = []
    for row_data in csv.reader(line.strip() for line in data[1:]):
        try:
            if int(row_data[year_index]) != 2016:
                continue
            rating = float(row_data[rating_index])
            title = row_data[title_index]
        except (IndexError, ValueError):
            # Blank/short rows and rows with a missing year or rating are
            # skipped instead of aborting the whole analysis.
            continue
        movies_2016.append((title, rating))

    # Sort by rating, not by its title; the sort is stable, so ties keep file order.
    movies_2016.sort(key=lambda movie: movie[1], reverse=True)
    for title, rating in movies_2016[:3]:
        print(f"{title} with a rating of {rating}")

# Run the q1 analysis on the lines loaded above; the result is printed as cell output.
top_3_movies_2016(data)


Dangal with a rating of 8.8
Kimi no na wa with a rating of 8.6
Koe no katachi with a rating of 8.4


In [2]:
# q2: "highest average revenue" is defined here as an actor's average revenue per movie:
# the sum of the revenues of the movies they performed in, divided by the number of those movies.
# Daisy Ridley is in only one movie, which made a huge revenue of 936 million, so she is the actor with the highest average revenue.

# Read the dataset into memory as a list of raw text lines.
with open('IMDB-Movie-Data.csv', mode='r') as file:
    data = list(file)

def highest_avg_revenue_actor(data):
    """Print the actor whose movies have the highest average revenue.

    An actor's average is the sum of the revenues of their movies divided by
    the number of those movies. Rows without a parsable revenue are ignored.

    Args:
        data: List of raw CSV lines; ``data[0]`` is the header and must
            contain the 'Actors' and 'Revenue (Millions)' columns. Actors in
            one cell are separated by '|'.

    Returns:
        None. A single summary line is printed. On a tie the actor seen first
        in ``data`` is reported; if no row is usable the actor is ``None``.

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If a required column is missing from the header.
    """
    # Imported locally so the cell runs on its own. csv honours quoted fields
    # (e.g. a title containing a comma), which a plain split(',') cuts apart.
    import csv

    header = next(csv.reader([data[0].strip()]), [])
    actor_index = header.index('Actors')
    revenue_index = header.index('Revenue (Millions)')

    # Map each actor to the list of revenues of the movies they appear in.
    actor_revenue = {}
    for row_data in csv.reader(line.strip() for line in data[1:]):
        try:
            revenue = float(row_data[revenue_index])
            actors_field = row_data[actor_index]
        except (IndexError, ValueError):
            # Missing/empty revenue or a blank/short row: skip the movie.
            continue
        for actor in actors_field.split('|'):
            actor = actor.strip()
            if actor:  # an empty Actors cell must not create an actor named ''
                actor_revenue.setdefault(actor, []).append(revenue)

    highest_avg_actor = None
    highest_avg_revenue = 0
    for actor, revenues in actor_revenue.items():
        avg_revenue = sum(revenues) / len(revenues)
        # The `is None` check lets an actor win even when every average is 0,
        # which a bare `> 0` comparison would silently report as None.
        if highest_avg_actor is None or avg_revenue > highest_avg_revenue:
            highest_avg_revenue = avg_revenue
            highest_avg_actor = actor
    print(f"The actor generating the highest average revenue is {highest_avg_actor} with an average revenue of {highest_avg_revenue:.2f} million.")

# Run the q2 analysis on the lines loaded above; the result is printed as cell output.
highest_avg_revenue_actor(data)


The actor generating the highest average revenue is Daisy Ridley with an average revenue of 936.63 million.


In [3]:
# q3: average rating of Emma Watson's movies (done)
# Read the dataset into memory as a list of raw text lines.
with open('IMDB-Movie-Data.csv', mode='r') as file:
    data = list(file)

def average_rating_emma_watson(data, actor='Emma Watson'):
    """Print the average rating of the movies an actor appears in.

    Args:
        data: List of raw CSV lines; ``data[0]`` is the header and must
            contain the 'Actors' and 'Rating' columns. Actors in one cell are
            separated by '|'.
        actor: Exact (case-sensitive) actor name to look for. Defaults to
            'Emma Watson', which reproduces the original behaviour.

    Returns:
        None. Either the average (3 decimals) or a "not found" line is printed.

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If a required column is missing from the header.
    """
    # Imported locally so the cell runs on its own. csv honours quoted fields
    # (e.g. a title containing a comma), which a plain split(',') cuts apart.
    import csv

    header = next(csv.reader([data[0].strip()]), [])
    actor_index = header.index('Actors')
    rating_index = header.index('Rating')

    total_rating = 0
    movie_count = 0
    for row_data in csv.reader(line.strip() for line in data[1:]):
        try:
            cast = [name.strip() for name in row_data[actor_index].split('|')]
            if actor not in cast:
                continue
            rating = float(row_data[rating_index])
        except (IndexError, ValueError):
            # Missing/empty rating or a blank/short row: skip the movie.
            continue
        total_rating += rating
        movie_count += 1

    # Only divide when at least one rated movie was found.
    if movie_count > 0:
        average_rating = total_rating / movie_count
        print(f"The average rating of {actor}'s movies is {average_rating:.3f}")
    else:
        print(f"No movies found for {actor}.")
        
# Run the q3 analysis on the lines loaded above; the result is printed as cell output.
average_rating_emma_watson(data)


The average rating of Emma Watson's movies is 7.175


In [4]:
# q4: top-3 directors who collaborated with most actors
# Note: Danny Boyle and Paul W.S. Anderson are tied with 20 unique actors each, so they tie for third place.
# Read the dataset into memory as a list of raw text lines.
with open('IMDB-Movie-Data.csv', mode='r') as file:
    data = list(file)

def top_3_directors_most_actors(data):
    """Print the three directors who worked with the most distinct actors.

    Args:
        data: List of raw CSV lines; ``data[0]`` is the header and must
            contain the 'Director' and 'Actors' columns. Actors in one cell
            are separated by '|'.

    Returns:
        None. One line per director is printed, highest count first. Only
        three rows are shown; directors with equal counts keep their order of
        first appearance in ``data``, so a tie at the cut-off is not visible.

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If a required column is missing from the header.
    """
    # Imported locally so the cell runs on its own. csv honours quoted fields
    # (e.g. a title containing a comma), which a plain split(',') cuts apart.
    import csv

    header = next(csv.reader([data[0].strip()]), [])
    director_index = header.index('Director')
    actor_index = header.index('Actors')

    # Map each director to the set of distinct actors across all their movies.
    director_actors = {}
    for row_data in csv.reader(line.strip() for line in data[1:]):
        try:
            director = row_data[director_index]
            actors_field = row_data[actor_index]
        except IndexError:
            continue  # blank or truncated row
        cast = director_actors.setdefault(director, set())
        # Drop empty names so an empty Actors cell does not count as a collaborator.
        cast.update(name.strip() for name in actors_field.split('|') if name.strip())

    director_actor_count = [(director, len(cast)) for director, cast in director_actors.items()]
    # Stable sort: equal counts stay in first-appearance order.
    director_actor_count.sort(key=lambda item: item[1], reverse=True)

    for director, actor_count in director_actor_count[:3]:
        print(f"{director} has collaborated with {actor_count} unique actors.")

# Run the q4 analysis on the lines loaded above; the result is printed as cell output.
top_3_directors_most_actors(data)


Ridley Scott has collaborated with 28 unique actors.
M. Night Shyamalan has collaborated with 24 unique actors.
Danny Boyle has collaborated with 20 unique actors.


In [5]:
# q5: top-2 actors who played in the most genres of movies
# Read the dataset into memory as a list of raw text lines.
with open('IMDB-Movie-Data.csv', mode='r') as file:
    data = list(file)

def top_2_actors_most_genres(data):
    """Print the two actors who appeared in the most distinct genres.

    Args:
        data: List of raw CSV lines; ``data[0]`` is the header and must
            contain the 'Actors' and 'Genre' columns. Multiple actors or
            genres in one cell are separated by '|'.

    Returns:
        None. One line per actor is printed, highest genre count first;
        actors with equal counts keep their order of first appearance.

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If a required column is missing from the header.
    """
    # Imported locally so the cell runs on its own. csv honours quoted fields
    # (e.g. a title containing a comma), which a plain split(',') cuts apart.
    import csv

    header = next(csv.reader([data[0].strip()]), [])
    actor_index = header.index('Actors')
    genre_index = header.index('Genre')

    # Map each actor to the set of genres of every movie they appear in.
    actor_genres = {}
    for row_data in csv.reader(line.strip() for line in data[1:]):
        try:
            actors_field = row_data[actor_index]
            genres_field = row_data[genre_index]
        except IndexError:
            continue  # blank or truncated row
        # Empty names are dropped so blank cells do not add a '' actor/genre.
        genres = {name.strip() for name in genres_field.split('|') if name.strip()}
        for actor in actors_field.split('|'):
            actor = actor.strip()
            if actor:
                actor_genres.setdefault(actor, set()).update(genres)

    actor_genre_count = [(actor, len(genres)) for actor, genres in actor_genres.items()]
    # Stable sort: equal counts stay in first-appearance order.
    actor_genre_count.sort(key=lambda item: item[1], reverse=True)

    for actor, genre_count in actor_genre_count[:2]:
        print(f"{actor} has acted in {genre_count} different genres.")

# Run the q5 analysis on the lines loaded above; the result is printed as cell output.
top_2_actors_most_genres(data)


Brad Pitt has acted in 14 different genres.
Hugh Jackman has acted in 13 different genres.


In [6]:
# q6: find the actors whose movies lead to the largest maximum gap of years
# Read the dataset into memory as a list of raw text lines.
with open('IMDB-Movie-Data.csv', mode='r') as file:
    data = list(file)

def actors_max_year_gap(data):
    """Print every actor whose movies span the largest number of years.

    An actor's gap is the difference between the latest and the earliest
    release year among their movies. Actors with a single movie are ignored.

    Args:
        data: List of raw CSV lines; ``data[0]`` is the header and must
            contain the 'Actors' and 'Year' columns. Actors in one cell are
            separated by '|'.

    Returns:
        None. A summary line is printed, followed by one actor name per line
        for every actor that reaches the largest gap.

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If a required column is missing from the header.
    """
    # Imported locally so the cell runs on its own. csv honours quoted fields
    # (e.g. a title containing a comma), which a plain split(',') cuts apart.
    import csv

    header = next(csv.reader([data[0].strip()]), [])
    actor_index = header.index('Actors')
    year_index = header.index('Year')

    # Map each actor to the release years of all their movies.
    actor_years = {}
    for row_data in csv.reader(line.strip() for line in data[1:]):
        try:
            year = int(row_data[year_index])
            actors_field = row_data[actor_index]
        except (IndexError, ValueError):
            # Missing/invalid year or a blank/short row: skip the movie
            # instead of aborting the whole analysis.
            continue
        for actor in actors_field.split('|'):
            actor = actor.strip()
            if actor:
                actor_years.setdefault(actor, []).append(year)

    # A gap needs at least two movies.
    actor_max_gap = [
        (actor, max(years) - min(years))
        for actor, years in actor_years.items()
        if len(years) > 1
    ]
    max_gap_value = max((gap for _, gap in actor_max_gap), default=0)

    # Keep every actor tied at the largest gap.
    actors_with_largest_gap = [actor for actor, gap in actor_max_gap if gap == max_gap_value]

    print(f"{len(actors_with_largest_gap)} Actors with the largest maximum gap of years ({max_gap_value} years):")
    for actor in actors_with_largest_gap:
        print(actor)
        
# Run the q6 analysis on the lines loaded above; the result is printed as cell output.
actors_max_year_gap(data)

53 Actors with the largest maximum gap of years (10 years):
Christian Bale
Anne Hathaway
Hugh Jackman
Scarlett Johansson
Matt Damon
Mark Wahlberg
Brad Pitt
Christopher Plummer
Tom Hanks
Bryce Dallas Howard
Chiwetel Ejiofor
Ben Kingsley
Gerard Butler
Eva Green
Judi Dench
Will Smith
Jennifer Connelly
Tom Cruise
Emily Blunt
Kevin Spacey
Samuel L. Jackson
Steve Carell
Edward Norton
Will Ferrell
Denzel Washington
Russell Crowe
Toni Collette
Meryl Streep
Morgan Freeman
Dominic West
Owen Wilson
Michelle Monaghan
Jessica Biel
Dustin Hoffman
Ben Whishaw
Paula Patton
Abbie Cornish
Johnny Depp
Jack Davenport
Rachel Weisz
Ellen Burstyn
Kang-ho Song
Jeremy Irons
Marion Cotillard
Kirsten Dunst
Jennifer Aniston
Justin Theroux
Maya Rudolph
Kate Bosworth
Audrey Tautou
Luke Wilson
Sacha Baron Cohen
Bob Balaban


In [7]:
# q7: all actors collaborating with Johnny Depp directly and indirectly
# Read the dataset into memory as a list of raw text lines.
with open('IMDB-Movie-Data.csv', mode='r') as file:
    data = list(file)

def find_collaborators_johnny_depp(data):
    """Print every actor connected to Johnny Depp through shared movies.

    Two actors are linked when they appear in the same movie. The result is
    everyone reachable from "Johnny Depp" through any chain of such links
    (direct and indirect collaborators), excluding Johnny Depp himself.

    Args:
        data: List of raw CSV lines; ``data[0]`` is the header and must
            contain the 'Actors' column. Actors in one cell are separated
            by '|'.

    Returns:
        None. The collaborator count and the set of names are printed, or a
        notice if Johnny Depp does not occur in ``data``.

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If the 'Actors' column is missing from the header.
    """
    # Imported locally so the cell runs on its own. csv honours quoted fields
    # (e.g. a title containing a comma), which a plain split(',') cuts apart;
    # deque gives O(1) dequeues where list.pop(0) shifts the whole list.
    import csv
    from collections import deque

    header = next(csv.reader([data[0].strip()]), [])
    actor_index = header.index('Actors')

    # Build an undirected graph: actor -> set of co-stars.
    collaborations = {}
    for row_data in csv.reader(line.strip() for line in data[1:]):
        try:
            actors_field = row_data[actor_index]
        except IndexError:
            continue  # blank or truncated row
        cast = [name.strip() for name in actors_field.split('|') if name.strip()]
        for actor in cast:
            co_stars = collaborations.setdefault(actor, set())
            co_stars.update(other for other in cast if other != actor)

    if "Johnny Depp" not in collaborations:
        print("Johnny Depp is not in the dataset.")
        return

    # Breadth-first search over the collaboration graph.
    visited = {"Johnny Depp"}
    queue = deque(["Johnny Depp"])
    while queue:
        current_actor = queue.popleft()
        for neighbor in collaborations.get(current_actor, ()):
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)

    all_collaborators = visited
    all_collaborators.remove("Johnny Depp")
    print(f"All {len(all_collaborators)} actors collaborated with Johnny Depp (directly or indirectly): {all_collaborators}")

# Run the q7 analysis on the lines loaded above; the result is printed as cell output.
find_collaborators_johnny_depp(data)


All 1574 actors collaborated with Johnny Depp (directly or indirectly): {'Liam Neeson', 'Armie Hammer', 'Malcolm Sinclair', 'Rooney Mara', 'Laura Ramsey', 'Teri Hatcher', 'Tabu', 'Rihanna', 'Tom Mison', 'Jared Harris', 'Kirsten Dunst', 'Matt Damon', 'Alison Sudol', 'Margot Robbie', 'Mickey Rourke', 'Billy Connolly', 'Brendan Fraser', 'Angourie Rice', 'Tim McGraw', 'Tijuana Ricks', 'Michael Shannon', 'Lucas Hedges', 'Wendi McLendon-Covey', 'Melissa Leo', 'Jasper Newell', 'Barry Ward', 'Michael Dickson', 'Eric Winter', 'Kagiso Kuypers', 'Nadine Velazquez', 'Rose Leslie', 'Adam Driver', 'Charlie Hunnam', 'Theo James', 'Aidan Turner', 'Genesis Rodriguez', 'Diego Luna', 'Marion Cotillard', 'Alexander Skarsgard', "De'Shawn Washington", 'Rachael Harris', 'Emma Greenwell', 'Ben Barnes', 'Casey Affleck', 'Malin Akerman', 'Jessica Brown Findlay', 'Tobey Maguire', 'Odeya Rush', 'Randall Park', 'Damien Marzette', 'Karen Allen', 'Adrianne Palicki', 'Samantha Isler', 'Emma Thompson', 'Kristen Stewar