In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import os
from IPython.display import display, Image

In [2]:
def year_from_title(dataframe, title):
    df_row = dataframe[dataframe['Title'] == title]
    year = df_row['Released'].to_list()
    
    if len(year) == 0:
        print("The movie is not in the dataset or has a different name")
        return 0
    
    return int(year[0][-4:])

def get_poster_id(dataframe, title):
    df_row = dataframe[dataframe['Title'] == title]
    poster_id = df_row["Poster"].to_list()
    return poster_id[0]

def show_image(dataframe, movie_title):
    
    year = str(year_from_title(dataframe, movie_title))
    
    if year == "0":
        return 0
    
    poster_id = get_poster_id(dataframe, movie_title)
    
    image_directory = "movie_poster_per_year/"+year

    # Get a list of all JPEG files in the directory
    image_files = [f for f in os.listdir(image_directory) if f == poster_id]

    for image_file in image_files:
        image_path = os.path.join(image_directory, image_file)
        display(Image(filename=image_path))

**Take data from csv and filter useful information**

In [6]:
movie_df = pd.read_csv("complete_data_movie_with_correct_actor2.csv") 

In [10]:
movie_df[movie_df["Title"] == "Ocean's Eleven"]["Actors"]


2495    ['Brad Pitt', 'George Clooney', 'Julia Roberts...
Name: Actors, dtype: object

In [11]:
movie_df

Unnamed: 0,Title,Genre,Actors,Director,Writer,Language,Box_office,Country,Released,Runtime,imdbRating,Awards,Poster
0,Stir Crazy,"Comedy, Crime","['Gene Wilder', 'Richard Pryor', 'JoBeth Willi...",['Sidney Poitier'],Bruce Jay Friedman,English,101500000,USA,12 Dec 1980,111 min,6.8,1 nomination.,tt0081562.jpg
1,Airplane!,Comedy,"['Leslie Nielsen', 'Lloyd Bridges', 'Robert Ha...","['Jim Abrahams', 'David Zucker', 'Jerry Zucker']","Jim Abrahams (written for the screen by), Davi...",English,83400000,USA,02 Jul 1980,88 min,7.8,Nominated for 1 Golden Globe. Another 2 wins &...,tt0080339.jpg
2,Private Benjamin,"Comedy, War","['Goldie Hawn', 'Sally Kirkland', 'Armand Assa...",['Howard Zieff'],"Nancy Meyers, Charles Shyer, Harvey Miller",English,69800000,USA,10 Oct 1980,109 min,6.1,Nominated for 3 Oscars. Another 1 win & 3 nomi...,tt0081375.jpg
3,Coal Miner's Daughter,"Biography, Drama, Music","['Tommy Lee Jones', ""Beverly D'Angelo"", 'Sissy...",['Michael Apted'],"Thomas Rickman (screenplay), Loretta Lynn (aut...",English,79900000,USA,07 Mar 1980,124 min,7.5,Won 1 Oscar. Another 8 wins & 13 nominations.,tt0080549.jpg
4,Smokey and the Bandit II,"Action, Comedy","['Sally Field', 'Burt Reynolds', 'Dom DeLuise'...",['Hal Needham'],"Hal Needham (characters created by), Robert L....","English, Spanish",66100000,USA,15 Aug 1980,100 min,5.1,1 win & 4 nominations.,tt0081529.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6021,Body,"Comedy, Drama","['Larry Fessenden', 'Alexandra Turshen']",['Malgorzata Szumowska'],"Michal Englert (screenplay), Malgorzata Szumow...","Polish, Spanish",2457,Poland,06 Mar 2015,90 min,6.6,6 wins & 4 nominations.,tt4358230.jpg
6022,Hayride 2,"Action, Horror, Thriller",[],['Terron R. Parsons'],Terron R. Parsons,English,1668,USA,18 Mar 2016,92 min,2.9,,tt2924352.jpg
6023,The Lady in the Car with Glasses and a Gun,Thriller,"['Freya Mavor', 'Stacy Martin']",['Joann Sfar'],"Patrick Godeau (screenplay), Sébastien Japriso...",French,2033,"France, Belgium",18 Dec 2015,93 min,5.5,,tt4159182.jpg
6024,The Looking Glass,"Drama, Music","['Dorothy Tristan', 'Trish Basinger', 'Ed Erns...",['John D. Hancock'],"Dorothy Tristan, Dorothy Tristan",English,1711,USA,27 Aug 2015,110 min,7.2,,tt2912776.jpg


In [43]:
unique_directors = set()
unique_actors = set()

for director_string in movie_df['Director']:
    cleaned_string = director_string.strip('[]')  # Remove the square brackets
    result_list = cleaned_string.split(',')  # Split the string using comma as a delimiter
    for director in result_list:
        unique_directors.add(director.strip())

        
for actor_string in movie_df['Actors']:
    cleaned_string = actor_string.strip("[]")  # Remove the square brackets
    result_list = cleaned_string.split(",")  # Split the string using comma as a delimiter
    for actor in result_list:
        unique_actors.add(actor.strip())

unique_actors = list(unique_actors) 
unique_directors = list(unique_directors)

unique_actors = [actor.replace("'", '') for actor in unique_actors][1::]
unique_directors = [director.replace("'", '') for director in unique_directors]

print(f'Number of directors : {len(unique_directors)}')
print(f'Number of actors : {len(unique_actors)}')

Number of directors : 3499
Number of actors : 9956


In [44]:
unique_actors[:10]

['Jennette McCurdy',
 'Jared Abrahamson',
 'Steve Blum',
 'Connie Britton',
 'Tim Owen',
 'Isabelle Fuhrman',
 'Charles Adam',
 'Maya Angelou',
 'John Amos',
 'Pierre Lebeau']

**Create matrix**

In [46]:
# Create dictionary of actors and their IDs
actors_dict = {actor: i for i, actor in enumerate(unique_actors)}
directors_dict = {director: i for i, director in enumerate(unique_directors)}

# Create new column 'Actors_id'
movie_df['Actors_id'] = movie_df['Actors'].apply(lambda x: [actors_dict[actor] for actor in x])
movie_df['Director_id'] = movie_df['Director'].apply(lambda x: [directors_dict[director] for director in x])

movie_df = movie_df.reset_index()

KeyError: '['

In [None]:
actor_to_actor_matrix = np.zeros((len(unique_actors),len(unique_actors)))
director_to_actor_matrix = np.zeros((len(unique_directors),len(unique_actors)))

In [None]:
for i in range(len(movie_df)):
    for j in range(len(movie_df['Director_id'][i])-1):
        for k in range(j+1,len(movie_df['Actors_id'][i])-1):
            director_to_actor_matrix[movie_df['Director_id'][i][j]][movie_df['Actors_id'][i][k]] = director_to_actor_matrix[movie_df['Director_id'][i][j]][movie_df['Actors_id'][i][k]]+ 1


In [None]:
for i in range(len(movie_df)):
    for j in range(len(movie_df['Actors_id'][i])-1):
        for k in range(j+1,len(movie_df['Actors_id'][i])-1):
            actor_to_actor_matrix[movie_df['Actors_id'][i][j]][movie_df['Actors_id'][i][k]] = actor_to_actor_matrix[movie_df['Actors_id'][i][j]][movie_df['Actors_id'][i][k]]+ 1
            actor_to_actor_matrix[movie_df['Actors_id'][i][k]][movie_df['Actors_id'][i][j]] = actor_to_actor_matrix[movie_df['Actors_id'][i][k]][movie_df['Actors_id'][i][j]]+ 1

In [None]:
np.max(actor_to_actor_matrix)

# Find movie with closest cast 

In [None]:
def intersection(lst1, lst2):
    return len(list(set(lst1) & set(lst2)))

def actor_intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

# Define a function to find the movie with the greatest intersection of actors
def find_movie_with_greatest_intersection(movie_name, df):
    movie = df[df['Title'] == movie_name]  # Get the movie with the given title
    movie_actors = movie['Actors'].iloc[0]  # Get the list of actors in the movie
    movie_index = movie.index[0]  # Get the index of the movie
    
    # Find the intersection of the movie actors with each other movie in the dataframe
    intersections = []
    for i in df.index:
        if i != movie_index:
            intersection_value = intersection(movie_actors, df.loc[i, 'Actors'])
            if intersection_value:
                intersections.append((intersection_value,i,actor_intersection(movie_actors, df.loc[i, 'Actors'])))
    
    max_intersect = max(intersections, key=lambda tup: tup[0], default=(0,0))
    if(max_intersect[0] >1 ):
        return (df['Title'].loc[max_intersect[1]],max_intersect[2])
    else:
        return "Sorry, no movie with close cast"

# Find actor/director that worked with most actors and on most movies

In [None]:
def get_n_rows_with_most_ones(M, n):
    # Calculate the number of 1s in each row
    row_sums = np.sum(M, axis=1)
    # Sort the row indices in descending order based on their number of 1s
    sorted_rows = np.argsort(row_sums)[::-1]
    # Return the top n rows with the most 1s
    return sorted_rows[:n]

def most_common_values(df, column_name, n):
    return df[column_name].explode().value_counts().nlargest(n).index.tolist()


In [None]:
directors_most_actors = []
for actor_id in list(get_n_rows_with_most_ones(director_to_actor_matrix,100)):
    directors_most_actors.append(unique_directors[actor_id])
    
directors_most_actors[:10]

In [None]:
directors_most_movies = []
directors_most_movies = most_common_values(movie_df,'Director',100)

directors_most_movies[:10]

In [None]:
actors_most_movies = []
actors_most_movies = most_common_values(movie_df,'Actors',500)

actors_most_movies[:10]

In [None]:
actors_most_actors = []
for actor_id in list(get_n_rows_with_most_ones(actor_to_actor_matrix,500)):
    actors_most_actors.append(unique_actors[actor_id])
    
actors_most_actors[:10]

# Filtered Dataset with only movies with famous actor or sufficient box office

In [None]:
movie_df['Box_office'] = movie_df['Box_office'].fillna(0).str.replace(',', '')
movie_df['Box_office'] = movie_df['Box_office'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)

threshold = 100000000 # replace with your desired value

filtered_df = movie_df[movie_df['Actors'].apply(lambda x: any(item for item in x if item in actors_most_actors)) | (movie_df['Box_office'] >= threshold) | movie_df['Director'].apply(lambda x: any(item for item in x if item in directors_most_movies))].reset_index()
filtered_df = filtered_df.drop(['index', 'level_0'], axis=1) 


In [None]:

threshold = 100000000 # replace with your desired value

filtered_df = movie_df[movie_df['Actors'].apply(lambda x: any(item for item in x if item in actors_most_actors)) | (movie_df['Box_office'] >= threshold) | movie_df['Director'].apply(lambda x: any(item for item in x if item in directors_most_movies))].reset_index()
filtered_df = filtered_df.drop(['index', 'level_0'], axis=1) 

# Query

In [None]:
def display_row(df, row_name, name):
    display(filtered_df[filtered_df[row_name].apply(lambda x:name in x)])

In [None]:
display_row(filtered_df, "Title", "Ocean")

In [None]:
def display_actor(df, actor):
    display(filtered_df[filtered_df['Actors'].apply(lambda x: actor in x)])

In [None]:
display_actor(filtered_df, "Emma Watson")

In [None]:
filtered_df[filtered_df['Director'].apply(lambda x: "Quentin Tarantino" in x)]


In [None]:
filtered_df

In [None]:
filtered_unique_actors = set()
for actors_list in filtered_df['Actors']:
    for actor in actors_list:
        filtered_unique_actors.add(actor)
        
filter_unique_actors = list(filtered_unique_actors)