In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import ast
import os
from IPython.display import display, Image

In [2]:
def year_from_title(dataframe, title):
    df_row = dataframe[dataframe['Title'] == title]
    year = df_row['Released'].to_list()
    
    if len(year) == 0:
        print("The movie is not in the dataset or has a different name")
        return 0
    
    return int(year[0][-4:])

def get_poster_id(dataframe, title):
    df_row = dataframe[dataframe['Title'] == title]
    poster_id = df_row["Poster"].to_list()
    return poster_id[0]

def show_image(dataframe, movie_title):
    
    year = str(year_from_title(dataframe, movie_title))
    
    if year == "0":
        return 0
    
    poster_id = get_poster_id(dataframe, movie_title)
    
    image_directory = "movie_poster_per_year/"+year

    # Get a list of all JPEG files in the directory
    image_files = [f for f in os.listdir(image_directory) if f == poster_id]

    for image_file in image_files:
        image_path = os.path.join(image_directory, image_file)
        display(Image(filename=image_path))

**Take data from csv and filter useful information**

In [3]:
movie_df = pd.read_csv("complete_data_movie_with_correct_actor.csv") 

In [4]:
movie_df["Actors"] = movie_df["Actors"].map(lambda x: ast.literal_eval(x))
movie_df["Director"] = movie_df["Director"].map(lambda x: ast.literal_eval(x))

In [5]:
unique_directors = set()
unique_actors = set()
unique_movie = movie_df["Title"].unique()

for director_list in movie_df['Director']:
    for director in director_list:
        unique_directors.add(director)
        
for actors_list in movie_df['Actors']:
    for actor in actors_list:
        unique_actors.add(actor.strip())

unique_actors = list(unique_actors) 
unique_directors = list(unique_directors)

print(f'Number of movies : {len(unique_movie)}')
print(f'Number of directors : {len(unique_directors)}')
print(f'Number of actors : {len(unique_actors)}')

Number of movies : 6012
Number of directors : 3499
Number of actors : 9956


**Create matrix**

In [6]:
# Create dictionary of actors and their IDs
actors_dict = {actor: i for i, actor in enumerate(unique_actors)}
movies_dict = {movie:i for i, movie in enumerate(unique_movie)}
directors_dict = {director: i for i, director in enumerate(unique_directors)}

# Create new column 'Actors_id'
movie_df['Actors_id'] = movie_df['Actors'].apply(lambda x: [actors_dict[actor] for actor in x])
movie_df['Director_id'] = movie_df['Director'].apply(lambda x: [directors_dict[director] for director in x])

movie_df = movie_df.reset_index()

In [7]:
actor_to_actor_matrix = np.zeros((len(unique_actors),len(unique_actors)))
director_to_actor_matrix = np.zeros((len(unique_directors),len(unique_actors)))

In [8]:
for i in range(len(movie_df)):
    for j in range(len(movie_df['Director_id'][i])-1):
        for k in range(j+1,len(movie_df['Actors_id'][i])-1):
            director_to_actor_matrix[movie_df['Director_id'][i][j]][movie_df['Actors_id'][i][k]] = director_to_actor_matrix[movie_df['Director_id'][i][j]][movie_df['Actors_id'][i][k]]+ 1


In [9]:
for i in range(len(movie_df)):
    for j in range(len(movie_df['Actors_id'][i])-1):
        for k in range(j+1,len(movie_df['Actors_id'][i])-1):
            actor_to_actor_matrix[movie_df['Actors_id'][i][j]][movie_df['Actors_id'][i][k]] = actor_to_actor_matrix[movie_df['Actors_id'][i][j]][movie_df['Actors_id'][i][k]]+ 1
            actor_to_actor_matrix[movie_df['Actors_id'][i][k]][movie_df['Actors_id'][i][j]] = actor_to_actor_matrix[movie_df['Actors_id'][i][k]][movie_df['Actors_id'][i][j]]+ 1

In [10]:
np.max(actor_to_actor_matrix)

15.0

In [12]:
actor_to_actor_matrix.shape

(9956, 9956)

In [7]:
def played_with(df, input_actor):
    ret = set()
    actors_lists = df[df['Actors'].apply(lambda x: input_actor in x)]["Actors"].values # [ [actors film 1], [actors film 2] ...]
    for film_actors in actors_lists:
        for actor in film_actors:
            ret.add(actor)
    
    ret.remove(input_actor)
    
    return list(ret)

def shared_movies(df, actor1, actor2):
    ret = set()
    for movie in movie_df[(movie_df['Actors'].apply(lambda x: actor1 in x)) & (movie_df['Actors'].apply(lambda x: actor2 in x))]["Title"].values:
        ret.add(movie)
    return list(ret)

def own_movies(df, actor):
    ret = set()
    for movie in [title for title in df[df['Actors'].apply(lambda x: actor in x)]["Title"].values]:
        ret.add(movie)
    return list(ret)
        


In [15]:
played_with(movie_df, "Brad Pitt")

['Topher Grace',
 'Jessica Chastain',
 'Sam Shepard',
 'Harvey Keitel',
 'Vyto Ruginis',
 'Max Casella',
 'Michael Bowen',
 'Terry Kinney',
 'Eugenie Bondurant',
 'Jonathan Howard',
 'Gael Garcia Bernal',
 'Kevin Costner',
 'Paul Ben-Victor',
 'Holland Taylor',
 'Brenda Blethyn',
 'Indra Ove',
 'Jonathan Tucker',
 'Julie Dreyfus',
 'Michael Fassbender',
 'Julie Christie',
 'Kerry Washington',
 'Deborah Shelton',
 'Frances McDormand',
 'Saffron Burrows',
 'David Leitch',
 'Ben Stiller',
 'Nicholas Pryor',
 'David Duchovny',
 'Nick Searcy',
 'Mark Ivanir',
 'Mitchell Ryan',
 'Al Pacino',
 'Rinko Kikuchi',
 'Reed Diamond',
 'Kingsley Ben-Adir',
 'Don Cheadle',
 'James Gandolfini',
 'Dale Dye',
 'Helen McCrory',
 'Mako',
 'Jami Gertz',
 'Julia Roberts',
 'Bruno Kirby',
 'Phyllis Somerville',
 'Christian Bale',
 'Thandiwe Newton',
 'Tom Skerritt',
 'Dennis Haysbert',
 'Lea Seydoux',
 'Tracy Letts',
 'Cherry Jones',
 'Bill Pullman',
 'Mindy Sterling',
 'Minnie Driver',
 'James Badge Dale',
 

In [20]:
actor_to_actor_dict = {}
actors_test = []
for actor in tqdm(unique_actors[:10]):
    actors_test.append(actor)
    actor_id = actors_dict.get(actor)
    played_with_actor_list = played_with(movie_df, actor)
    tmp_dict = {}
    for played_with_actor in played_with_actor_list:
        shared_movies_list_id = list(map(lambda x: movies_dict.get(x), shared_movies(movie_df, actor, played_with_actor)))
        
        tmp_dict[actors_dict.get(played_with_actor)] = shared_movies_list_id
    
    tmp_dict["Played_with_ids"] = list(map(lambda x: actors_dict.get(x), played_with_actor_list))
    tmp_dict["Own_movies"] = list(map(lambda x: movies_dict.get(x), own_movies(movie_df, actor)))
    
    actor_to_actor_dict[actor_id] = tmp_dict

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.68it/s]


In [18]:
actor_to_actor_dict

{0: {7692: [2864],
  2405: [2765],
  2662: [2765],
  7113: [2765],
  1812: [5565],
  8127: [2765],
  258: [5565],
  7814: [2765],
  8792: [5565],
  814: [2864],
  3605: [2765],
  2375: [5565],
  5470: [2765],
  8678: [2864],
  9612: [2864],
  5971: [5565],
  9654: [5565],
  7390: [2864],
  3231: [2864],
  8888: [2765],
  4878: [2864],
  2480: [2765],
  3617: [2864],
  5896: [5565],
  8499: [2765],
  1839: [5565],
  5027: [2396],
  3090: [2396],
  'Played_with_ids': [7692,
   2405,
   2662,
   7113,
   1812,
   8127,
   258,
   7814,
   8792,
   814,
   3605,
   2375,
   5470,
   8678,
   9612,
   5971,
   9654,
   7390,
   3231,
   8888,
   4878,
   2480,
   3617,
   5896,
   8499,
   1839,
   5027,
   3090],
  'Own_movies': [2765, 5565, 2864, 2396]},
 1: {3587: [5112],
  9881: [5112],
  9521: [5112],
  'Played_with_ids': [3587, 9881, 9521],
  'Own_movies': [5112]},
 2: {8822: [275],
  2784: [211],
  6847: [275],
  1513: [275],
  9814: [275],
  2906: [211],
  9330: [211],
  458: [275],

In [19]:
 import json
with open('test_small_sample.json', 'w') as fp:
    json.dump(actor_to_actor_dict, fp)

In [14]:
with open('test_small_map_actors.json', 'w') as fp:
    json.dump(actors_dict, fp)

with open('test_small_map_movies.json', 'w') as fp:
    json.dump(movies_dict, fp)

In [22]:
actors_test

['Erica Leerhsen',
 'Don McLeroy',
 'John Hughes',
 'Eva Hayman',
 'Daniel Dae Kim',
 'Melanie Thierry',
 'Wrenn Schmidt',
 'Madeline Zima',
 'Elizabeth Berrington',
 'Wendie Malick']

# Find movie with closest cast 

In [None]:
def intersection(lst1, lst2):
    return len(list(set(lst1) & set(lst2)))

def actor_intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

# Define a function to find the movie with the greatest intersection of actors
def find_movie_with_greatest_intersection(movie_name, df):
    movie = df[df['Title'] == movie_name]  # Get the movie with the given title
    movie_actors = movie['Actors'].iloc[0]  # Get the list of actors in the movie
    movie_index = movie.index[0]  # Get the index of the movie
    
    # Find the intersection of the movie actors with each other movie in the dataframe
    intersections = []
    for i in df.index:
        if i != movie_index:
            intersection_value = intersection(movie_actors, df.loc[i, 'Actors'])
            if intersection_value:
                intersections.append((intersection_value,i,actor_intersection(movie_actors, df.loc[i, 'Actors'])))
    
    max_intersect = max(intersections, key=lambda tup: tup[0], default=(0,0))
    if(max_intersect[0] >1 ):
        return (df['Title'].loc[max_intersect[1]],max_intersect[2])
    else:
        return "Sorry, no movie with close cast"

# Find actor/director that worked with most actors and on most movies

In [None]:
def get_n_rows_with_most_ones(M, n):
    # Calculate the number of 1s in each row
    row_sums = np.sum(M, axis=1)
    # Sort the row indices in descending order based on their number of 1s
    sorted_rows = np.argsort(row_sums)[::-1]
    # Return the top n rows with the most 1s
    return sorted_rows[:n]

def most_common_values(df, column_name, n):
    return df[column_name].explode().value_counts().nlargest(n).index.tolist()


In [None]:
directors_most_actors = []
for actor_id in list(get_n_rows_with_most_ones(director_to_actor_matrix,100)):
    directors_most_actors.append(unique_directors[actor_id])
    
directors_most_actors[:10]

In [None]:
directors_most_movies = []
directors_most_movies = most_common_values(movie_df,'Director',100)

directors_most_movies[:10]

In [None]:
actors_most_movies = []
actors_most_movies = most_common_values(movie_df,'Actors',500)

actors_most_movies[:10]

In [None]:
actors_most_actors = []
for actor_id in list(get_n_rows_with_most_ones(actor_to_actor_matrix,500)):
    actors_most_actors.append(unique_actors[actor_id])
    
actors_most_actors[:10]

# Filtered Dataset with only movies with famous actor or sufficient box office

In [None]:
movie_df['Box_office'] = movie_df['Box_office'].fillna(0).str.replace(',', '')
movie_df['Box_office'] = movie_df['Box_office'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)

threshold = 100000000 # replace with your desired value

filtered_df = movie_df[movie_df['Actors'].apply(lambda x: any(item for item in x if item in actors_most_actors)) | (movie_df['Box_office'] >= threshold) | movie_df['Director'].apply(lambda x: any(item for item in x if item in directors_most_movies))].reset_index()
filtered_df = filtered_df.drop(['index', 'level_0'], axis=1) 


In [None]:

threshold = 100000000 # replace with your desired value

filtered_df = movie_df[movie_df['Actors'].apply(lambda x: any(item for item in x if item in actors_most_actors)) | (movie_df['Box_office'] >= threshold) | movie_df['Director'].apply(lambda x: any(item for item in x if item in directors_most_movies))].reset_index()
filtered_df = filtered_df.drop(['index', 'level_0'], axis=1) 

# Query

In [None]:
def display_row(df, row_name, name):
    display(filtered_df[filtered_df[row_name].apply(lambda x:name in x)])

In [None]:
display_row(filtered_df, "Title", "Ocean")

In [None]:
def display_actor(df, actor):
    display(filtered_df[filtered_df['Actors'].apply(lambda x: actor in x)])

In [None]:
display_actor(filtered_df, "Emma Watson")

In [None]:
filtered_df[filtered_df['Director'].apply(lambda x: "Quentin Tarantino" in x)]


In [None]:
filtered_df

In [None]:
filtered_unique_actors = set()
for actors_list in filtered_df['Actors']:
    for actor in actors_list:
        filtered_unique_actors.add(actor)
        
filter_unique_actors = list(filtered_unique_actors)