In [92]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from helpers.readers import read_dataframe

# Load datasets and prepare merged dataframes

In [2]:
cmu_movies = read_dataframe(
    name='cmu/movies',
    preprocess=True,
    usecols=[
        "Wikipedia movie ID",
        "Freebase movie ID",
        "Movie name",
        "Movie release date",
        "Movie box office revenue",
        "Movie runtime",
        "Movie languages",
        "Movie countries",
        "Movie genres",
    ]
)
imdb_info = read_dataframe(name='imdb/movies', preprocess=True)
imdb_ratings = read_dataframe(name='imdb/ratings')
movieLens_movies = read_dataframe(name='movieLens/movies', preprocess=True)
movieLens_ratings = read_dataframe('movieLens/ratings')

imdb_crew = read_dataframe(name='imdb/crew')
imdb_people = read_dataframe(name='imdb/names')
imdb_principals = read_dataframe(name='imdb/principals')

Preprocess logs:
✅ Fixed Movie Languages inside Movie Countries
✅ Removed Deseret characters
✅ Movie release date splitted to three columns: Movie release Year, Movie release Month, Movie release Day
✅ Seperated freebase identifiers from Movie Languages, Movie Countries and Movie Genres
Preprocess logs:
✅ Moved genres from runtimeMinutes to genres column
Preprocess logs:
✅ Aligned bad rows


## Prepare the movies dataframe

In [3]:
mapping_w_i_f = read_dataframe(name='mapping_wikipedia_imdb_freebase')
mapping_f_i = read_dataframe(name='mapping_freebase_imdb')

In [4]:
movies = cmu_movies.drop(['Movie release Day', 'Movie release Month'], axis=1).copy()

In [5]:
movies.rename(
    columns={
        'Wikipedia movie ID': 'wikipediaID',
        'Freebase movie ID': 'freebaseID',
        'Movie name': 'title',
        'Movie box office revenue': 'revenue',
        'Movie runtime': 'runtime',
        'Movie languages': 'languages',
        'Movie countries': 'countries',
        'Movie genres': 'genres',
        'Movie release Year': 'release',
    },
    inplace=True,
)

In [6]:
movies = movies.merge(
    right=mapping_f_i.drop_duplicates(subset='freebase'),
    left_on='freebaseID', right_on='freebase', how='left'
).rename(columns={'imdb': 'tconst'}).drop('freebase', axis=1)

In [7]:
movies.tconst.duplicated().sum()
movies.drop_duplicates(subset='tconst', inplace=True)

In [8]:
movies = movies.merge(
    right=imdb_info.rename(columns={'genres': 'genres_imdb', 'runtimeMinutes': 'runtime_imdb'})[['tconst', 'isAdult', 'runtime_imdb', 'genres_imdb']],
    on='tconst', how='left',
)

In [9]:
movies = movies.merge(
    right=imdb_ratings.rename(columns={'averageRating': 'rating', 'numVotes': 'votes'}),
    on='tconst', how='left',
)

In [10]:
# NOTE: Only adds ratings for 100 movies, not worth it

# movies = movies.merge(
#     right=movieLens_movies[['vote_average', 'vote_count', 'imdb_id']].rename(columns={'vote_average': 'rating_lens', 'vote_count': 'votes_lens', 'imdb_id': 'tconst'}),
#     on='tconst', how='left',
# )
# movies.rating_lens.replace(to_replace=0, value=pd.NA)

In [10]:
movies = movies.merge(right=imdb_crew.drop('writers', axis=1), on='tconst', how='left')

In [11]:
imdb_people_exploded = imdb_people.copy()
imdb_people_exploded['knownForTitles'] = imdb_people['knownForTitles'].str.split(',')
imdb_people_exploded = imdb_people_exploded.explode(['knownForTitles'])


merged_ipe = pd.merge(imdb_people_exploded,mapping_f_i.drop_duplicates(subset='freebase'), how='inner', left_on='knownForTitles', right_on='imdb')
unique_matched_persons = merged_ipe.drop_duplicates(subset='nconst', keep='first')

In [12]:
matched_imdb_people = pd.merge(imdb_people, unique_matched_persons[['nconst']], on='nconst', how='inner')
tmp_principal_people = pd.merge(imdb_principals, matched_imdb_people[['nconst']], on='nconst', how='inner')
matched_principal_people = pd.merge(tmp_principal_people,mapping_f_i.drop_duplicates(subset='freebase')[['imdb']],left_on='tconst',right_on='imdb',how='inner')

## Prepare the directors dataframe

In [234]:
nmconsts = []
for item in movies.dropna(subset='directors').directors.str.split(','):
    nmconsts.extend(item)
nmconsts = set(nmconsts)

print(f'We have {len(nmconsts)} directors.')

We have 28498 directors.


In [254]:
directors = imdb_people[imdb_people.nconst.isin(nmconsts)].copy()

In [266]:
def get_movie_crew_size() :
    """ 
    Returns a dataframe containing all the people that worked on a movie, using the IMDb people data.

    Args:
    imdb_id: the IMDb ID of the movie

    Returns:
    movie_crew: A dataframe containing information of all people that worked on a film.
    """

    

In [50]:
import networkx as nx
from tqdm import tqdm

In [175]:
from collections import defaultdict
def counting_collaborations(directors_df, matched_imdb_people):
    """
    Returns a dictionnary containing the number of collaborations between directors and different crew members, for the movies selected.

    Args :
    directors_df : A dataframe containing information about directors. it must contain at least the two following columns : 'nconst', the personal
                   IMDb ID of the director, and 'knownForTitles' which are the titles for which the director is known.
    
                   
    matched_imdb_people : A dataframe containing personal information about people that have worked on movies that we decided to retain. Must also contain
                          the 'nconst' and 'knownForTitles' columns.
    
    """
    
    # Create a mapping of titles to people
    title_to_people = defaultdict(set)
    for _, person_row in tqdm(matched_imdb_people.iterrows()):
        person_id = person_row['nconst']
        titles = person_row['knownForTitles'].split(',')
        for title in titles:
            title_to_people[title].add(person_id)

    collaboration_counts = {}

    for _, row in tqdm(directors_df.iterrows()):
        director_ids = set(row['nconst'].split(',')) if isinstance(row['nconst'], str) else set()
        directed_titles = set(row['knownForTitles'].split(',')) if isinstance(row['knownForTitles'], str) else set()

        for director_id in director_ids:
            if director_id not in collaboration_counts:
                collaboration_counts[director_id] = {}

            for title in directed_titles:
                collaborators = title_to_people[title] - {director_id}  # Remove the director from collaborators
                for collaborator in collaborators:
                    if collaborator not in collaboration_counts[director_id]:
                        collaboration_counts[director_id][collaborator] = 1
                    else:
                        collaboration_counts[director_id][collaborator] += 1

    return collaboration_counts

In [75]:
def get_all_known_roles(matched_principal,all_matched_people) : 
    """
    Adds additional roles that people present in the matched_principal dataframe passed as argument have
    played.

    Args:
    matched_principal: The IMDb 'principal' dataframe matched for our selected movies. Contains
                       information about the 'principal' crew of each movie.

    all_matched_people: The IMDb 'people' dataframe matched for our selected movies. Contains
                        personal information about people in the film industry.

    Returns:
    enhanced_matched_principal: The all_matched_people dataframe passed as argument, with an additional column
                                containing any additional roles found in the matched_principal
                                dataframe. 
    """
    
    roles_collected = {}

    # Iterate over matched_principal to collect roles for each person
    for _, row in tqdm(matched_principal.iterrows()):
        nconst = row['nconst']
        tconst = row['tconst']
        
        if nconst not in roles_collected:
            roles_collected[nconst] = set()

        roles_collected[nconst].add(tconst)

    # Update enhanced_matched_principal with the collected roles
    enhanced_matched_principal = all_matched_people.copy()
    enhanced_matched_principal['all_known_roles'] = enhanced_matched_principal['nconst'].map(lambda x: ', '.join(map(str, roles_collected.get(x, []))))

    return enhanced_matched_principal

In [76]:
test = get_all_known_roles(matched_principal_people,matched_imdb_people)

617870it [00:30, 20470.01it/s]


In [176]:
collab_counts = counting_collaborations(directors,matched_imdb_people)

0it [00:00, ?it/s]

1476093it [02:48, 8755.46it/s] 
28498it [00:05, 5524.61it/s]


In [256]:
from helpers.readers import read_dataframe
awards = read_dataframe('imdb/awards')

In [257]:
awards = awards[~awards['isTitle']]

In [258]:
directors_awards = (pd.merge(awards,directors['nconst'],left_on= 'const', right_on='nconst', how='inner')).drop('const',axis=1)

directors_awards['categoryName_lower'] = directors_awards['categoryName'].str.lower()
directors_awards['nomeneeNote_lower'] = directors_awards['nomeneeNote'].str.lower()

directors_awards['isPrimary'] = directors_awards['isPrimary'].astype(bool)
directors_awards['isSecondary'] = directors_awards['isSecondary'].astype(bool)

# Filter awards for directorial roles
directors_awards = directors_awards[
    (directors_awards['isPrimary'] | directors_awards['isSecondary']) & 
    ((directors_awards['categoryName_lower'].isna()) | (~directors_awards['categoryName_lower'].str.contains('actor|actress'))) & 
    ((directors_awards['nomeneeNote_lower'].isna()) | (~directors_awards['nomeneeNote_lower'].str.contains('actor|actress')))
]

# Drop the temporary lowercase columns
directors_awards = directors_awards.drop(['categoryName_lower', 'nomeneeNote_lower'], axis=1)

display(director_awards)

Unnamed: 0,eventId,eventName,awardName,year,occurrence,winAnnouncementTime,categoryName,nomeneeNote,name,originalName,...,episodeNames,characterNames,isWinner,isPrimary,isSecondary,isPerson,isTitle,isCompany,notes,nconst
0,ev0000263,French Syndicate of Cinema Critics,Critics Award,1992,1,,Best Film,,Jacques Rivette,,...,,,True,True,True,True,False,False,,nm0729626
1,ev0000091,Berlin International Film Festival,Golden Berlin Bear,2007,1,,,,Jacques Rivette,,...,,,False,True,True,True,False,False,,nm0729626
2,ev0000400,Locarno International Film Festival,Special Prize of the Jury,1974,1,,,,Jacques Rivette,,...,,,True,True,True,True,False,False,,nm0729626
3,ev0001574,Gaudí Awards,Gaudí Award,2009,1,,Best European Film (Millor Pel·lícula Europea),(director),Jacques Rivette,,...,,,False,True,True,True,False,False,France/Italy,nm0729626
4,ev0000450,Moscow International Film Festival,Golden St. George,1995,1,,,,Jacques Rivette,,...,,,False,True,True,True,False,False,,nm0729626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187216,ev0000091,Berlin International Film Festival,Golden Berlin Bear,1960,1,,,,Ville Salminen,,...,,,False,True,True,True,False,False,,nm0758744
187217,ev0000091,Berlin International Film Festival,Golden Berlin Bear,1960,1,,,,Prabhat Mukherjee,,...,,,False,True,True,True,False,False,,nm0611542
187218,ev0000091,Berlin International Film Festival,Silver Berlin Bear,1960,1,,Special Prize - Short Film,,Juan Berend,,...,,,True,True,True,True,False,False,,nm0073501
187219,ev0000409,AFI Fest,Documentary Award - Special Mention,2001,1,,,,Jan Louter,,...,['A Sad Flower in the Sand (#8.7)'],,True,True,True,True,False,False,"For episode ""A Sad Flower in the Sand (#8.7)"".",nm1098324


In [259]:
import pandas as pd

# Count total nominations (wins + losses) for each nconst
total_nominations = director_awards.groupby('nconst').size().reset_index(name='total_nominations')

# Count number of wins for each nconst
wins_count = director_awards[director_awards['isWinner'] == 'True'].groupby('nconst').size().reset_index(name='wins_count')

# Merge the two counts based on 'nconst'
nominations_and_wins = pd.merge(total_nominations, wins_count, on='nconst', how='left').fillna(0)
nominations_and_wins['wins_count'] = nominations_and_wins['wins_count'].astype(int)
nominations_and_wins['total_nominations'] = nominations_and_wins['total_nominations'].astype(int)

display(nominations_and_wins)

Unnamed: 0,nconst,total_nominations,wins_count
0,nm0000005,130,76
1,nm0000008,15,8
2,nm0000009,13,8
3,nm0000018,40,27
4,nm0000019,114,65
...,...,...,...
16444,nm8956708,1,1
16445,nm9054338,2,1
16446,nm9335192,2,1
16447,nm9801575,1,0


In [260]:
directors = pd.merge(directors,nominations_and_wins,on='nconst',how='left')
directors['total_nominations'].fillna(0, inplace=True)
directors['wins_count'].fillna(0, inplace=True)
directors['total_nominations'] = directors['total_nominations'].astype(int)
directors['wins_count'] = directors['wins_count'].astype(int)
display(directors)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,total_nominations,wins_count
0,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467",130,76
1,nm0000008,Marlon Brando,1924,2004,"actor,soundtrack,director","tt0047296,tt0078788,tt0070849,tt0068646",15,8
2,nm0000009,Richard Burton,1925,1984,"actor,soundtrack,producer","tt0087803,tt0057877,tt0059749,tt0061184",13,8
3,nm0000018,Kirk Douglas,1916,2020,"actor,producer,soundtrack","tt0049456,tt0050825,tt0054331,tt0080736",40,27
4,nm0000019,Federico Fellini,1920,1993,"writer,director,actor","tt0050783,tt0053779,tt0056801,tt0071129",114,65
...,...,...,...,...,...,...,...,...
28493,nm9923550,Peder Pedersen,,,"director,miscellaneous,editor","tt1989553,tt1441426,tt0497368,tt1504682",0,0
28494,nm9928872,K.S. Raveendran,,,"writer,director",tt4507090,0,0
28495,nm9965211,Vasanth Balan,,,director,tt8685998,0,0
28496,nm9965267,Ramesh Khanna,,,director,tt0274959,0,0
