In [1]:
from src.utils.data_utils import load_dataframe_from_csv
from src.data.dataloader import load_initial_dataset
from src.utils.clean_cmu import clean_movies_cmu
from config import *

import plotly.express as px
import pandas as pd
import plotly
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go

%load_ext autoreload
%autoreload 2

In [2]:
# Load every table used in the notebook through the project helper
# load_dataframe_from_csv (imported from src.utils.data_utils).

# Movie / character level tables (presumably the CMU corpus files -- confirm).
movie_cmu = load_dataframe_from_csv('movie_cmu.csv')
character = load_dataframe_from_csv('character.csv')
plot_summaries = load_dataframe_from_csv('plot_summaries.csv')
tvtropes = load_dataframe_from_csv('tvtropes.csv')
name_clusters = load_dataframe_from_csv('name_clusters.csv')
# Oscar winner tables: films, then leading and supporting performers.
oscar_winning_films = load_dataframe_from_csv('oscar_winning_films_ids.csv')
oscar_winning_actors = load_dataframe_from_csv('oscar_winning_actors.csv')
oscar_winning_actresses = load_dataframe_from_csv('oscar_winning_actresses.csv')
oscar_supporting_actors = load_dataframe_from_csv('oscar_winning_supporting_actors.csv')
oscar_supporting_actresses = load_dataframe_from_csv('oscar_winning_supporting_actresses.csv')
# Additional film tables whose columns are harmonized in the next cell.
extended_films = load_dataframe_from_csv('film_2015_2024.csv')
# NOTE(review): unlike every other call in this cell, this file name has no
# '.csv' extension and is spelled 'acedemy' -- confirm it matches the file on
# disk and how load_dataframe_from_csv resolves names before "fixing" it.
academy_award_winning_films = load_dataframe_from_csv('acedemy_award_winning_films')
winning_actors_info = load_dataframe_from_csv('winning_actors_information.csv')
imdb_ratings = load_dataframe_from_csv('imdb_ratings.csv')
oscar_actors_info = load_dataframe_from_csv('oscar_actors_info.csv')
oscar_actress_info = load_dataframe_from_csv('oscar_actress_info.csv')
new_film_dataset = load_dataframe_from_csv('film_full_2.csv')

  df_loaded = pd.read_csv(csv_file_path)


In [3]:
def _alias_columns(frame, aliases):
    """Append a copy of each source column under its harmonized name, in place.

    ``aliases`` maps an existing column name to the name of its copy. Copies are
    written in mapping order; dropping the source columns is left to the caller.
    """
    for source_name, harmonized_name in aliases.items():
        frame[harmonized_name] = frame[source_name]


# extended_films: copy the raw columns to the shared "Movie ..." names, then
# drop the raw columns together with the extra "release date" column.
extended_films_aliases = {
    "genres": "Movie genres",
    "release_date": "Movie release date",
    "languages": "Movie languages",
    "countries": "Movie countries",
    "film": "Movie name",
    "box_office": "Movie box office revenue",
    "runtime": "Movie runtime",
    "page_id": "Wikipedia movie ID",
}
_alias_columns(extended_films, extended_films_aliases)
extended_films = extended_films.drop(columns=[*extended_films_aliases, "release date"])

# winning_actors_info: expose the performer under "Actor name".
winning_actors_aliases = {"actor": "Actor name"}
_alias_columns(winning_actors_info, winning_actors_aliases)
winning_actors_info = winning_actors_info.drop(columns=list(winning_actors_aliases))

# imdb_ratings: same naming for the release date and the title.
imdb_ratings_aliases = {
    'title_year': 'Movie release date',
    'movie_title': 'Movie name',
}
_alias_columns(imdb_ratings, imdb_ratings_aliases)
imdb_ratings = imdb_ratings.drop(columns=list(imdb_ratings_aliases))

# new_film_dataset: harmonize, then remove the raw columns on the same object.
new_film_aliases = {
    'wikipedia_id': 'Wikipedia movie ID',
    'title': 'Movie name',
    'release_date': 'Movie release date',
    'categories': 'Movie genres',
    'box_office': 'Movie box office revenue',
}
_alias_columns(new_film_dataset, new_film_aliases)
new_film_dataset.drop(columns=list(new_film_aliases), inplace=True)



In [4]:
# Stack the actor and actress winner tables, then attach the film record of
# each winning movie (matched on the Wikipedia movie ID).
oscar_act = pd.concat([oscar_winning_actors, oscar_winning_actresses])
oscar_act_movies = (
    oscar_act
    .merge(new_film_dataset, left_on='film_id', right_on='Wikipedia movie ID')
    .drop(columns=['film_id'])
)

# Attach performer information twice -- once through 'Actors id', once through
# 'Actress id' -- and stack the two results (actors first, as before).
oscar_actors_info_copy = winning_actors_info.merge(oscar_act_movies, left_on='page_id', right_on='Actors id')
oscar_actress_info_copy = winning_actors_info.merge(oscar_act_movies, left_on='page_id', right_on='Actress id')
oscar_act_movies = pd.concat([oscar_actors_info_copy, oscar_actress_info_copy])
# Check the type and standardize the date format
def standardize_date_format(date):
    """Reduce a heterogeneous release-date value to its calendar year.

    Parameters
    ----------
    date : str, int, float, datetime-like or missing
        A date string (anything ``pd.to_datetime`` can parse, including a bare
        year such as ``"1999"``), a numeric year, a ``datetime``/``Timestamp``
        object, or a missing value (``None``, ``NaN``, ``NaT``).

    Returns
    -------
    int or None
        The year as a Python ``int``, or ``None`` when the value is missing,
        of an unsupported type, or cannot be interpreted as a date.
    """
    import datetime as _dt  # local import: the notebook does not import datetime

    if pd.isna(date):  # None / NaN / NaT
        return None
    if isinstance(date, bool):  # bool is an int subclass but never a year
        return None
    if isinstance(date, (int, np.integer)):
        # Integer years used to fall through to `return None` and were lost.
        return int(date)
    if isinstance(date, (float, np.floating)):
        try:
            return int(date)  # e.g. 1987.0 -> 1987
        except (OverflowError, ValueError):  # +/- infinity
            return None
    if isinstance(date, _dt.date):  # also covers datetime and pd.Timestamp
        return date.year
    if isinstance(date, str):
        try:
            parsed = pd.to_datetime(date)
        except (ValueError, TypeError, OverflowError):
            # One malformed or out-of-range string must not abort the whole .apply().
            return None
        # Empty strings parse to NaT, whose .year is NaN rather than an int.
        return None if pd.isna(parsed) else int(parsed.year)
    return None  # unsupported type

# Derive 'Movie release year' by passing every 'Movie release date' entry through
# standardize_date_format (element-wise Series.apply); the source column is kept.
oscar_act_movies['Movie release year'] = oscar_act_movies['Movie release date'].apply(standardize_date_format)

We utilized two primary datasets for our analysis of Oscar-winning actors and actresses.

1)	“OSCAR_ACT_MOVIES”  
This dataset is a refined version of the film_full_2.csv file. It includes only the movies for which an actor or actress won the Oscar for Best Actor or Best Actress. Additionally, it contains two columns, 'Actors id' and 'Actress id', which hold the Wikipedia page IDs of the respective actors and actresses. These columns are accompanied by the corresponding names of the actors and actresses ('Actor name' column) for easy reference. Below, you will find a detailed description of the dataset’s columns, the total number of movies, and a complete list of the actors and actresses included.

In [5]:
# Quick overview of the table: its row count, a blank separator line, then its columns.
print('OSCAR_ACT_MOVIES number of movies:', len(oscar_act_movies))
print('')
print('OSCAR_ACT_MOVIES Columns:', oscar_act_movies.columns)


OSCAR_ACT_MOVIES number of movies: 190

OSCAR_ACT_MOVIES Columns: Index(['page_id', 'wikidata_id', 'sexLabel', 'nativeLanguageLabel',
       'countryOfCitizenshipLabel', 'ethnicGroupLabel', 'Actor name',
       'Actors id', 'Actress id', 'dataset_id', 'runtime', 'languages',
       'countries', 'genres', 'reviewScores', 'awardsReceived',
       'awardsNominated', 'capitalCost', 'release date', 'nbOscarReceived',
       'nbOscarNominated', 'Wikipedia movie ID', 'Movie name',
       'Movie release date', 'Movie genres', 'Movie box office revenue',
       'Movie release year'],
      dtype='object')


In [6]:
# Import necessary libraries
from ipywidgets import widgets, interact, Layout

def load_and_display(
    df,
    column_name,
    *,
    search_prompt="Search for an actor or actress:",
    placeholder="Type a name...",
    empty_query_message="Enter a name to search.",
    list_header="\nComplete list of actors and actresses:",
):
    """Show a live search box over one column's unique values, then print them all.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the values to browse (no file is read here).
    column_name : str
        Column whose distinct, non-missing values are listed.
    search_prompt, placeholder, empty_query_message, list_header : str, keyword-only
        User-facing texts. The defaults reproduce the actor/actress wording, so
        existing two-argument calls behave as before; pass other texts to reuse
        the function for another column instead of redefining it.

    Returns
    -------
    None
        The widget and the full list are displayed/printed as side effects.

    Notes
    -----
    Relies on ``widgets`` and ``Layout`` (imported from ipywidgets in this cell)
    and on ``display``, which the notebook does not import explicitly
    (presumably the IPython-provided builtin -- confirm when running elsewhere).
    """
    # Cast to str so a column mixing text with numbers still sorts and lowercases.
    names = sorted(df[column_name].dropna().astype(str).unique())
    # Lowercase once up front instead of on every keystroke.
    searchable = [(name, name.lower()) for name in names]

    def search_name(search_term):
        """Return the names containing ``search_term`` (case-insensitive)."""
        if not search_term.strip():
            return [empty_query_message]
        needle = search_term.lower()
        matches = [name for name, lowered in searchable if needle in lowered]
        return matches if matches else ["No matches found."]

    print(search_prompt)
    search_bar = widgets.Text(
        value="",
        placeholder=placeholder,
        description="Search:",
        layout=Layout(width='50%')
    )
    output = widgets.Output()

    def on_text_change(change):
        # change["new"] is the text box content after the edit.
        with output:
            output.clear_output(wait=True)
            for result in search_name(change["new"]):
                print(result)

    search_bar.observe(on_text_change, names='value')
    display(search_bar, output)

    print(list_header)
    for name in names:
        print(name)


load_and_display(oscar_act_movies, 'Actor name')

# Tally the 'sexLabel' values once and reuse the result for all three lines.
# NOTE(review): these are row counts of oscar_act_movies, so a performer who
# appears on several rows is counted once per row.
sex_label_counts = oscar_act_movies['sexLabel'].value_counts()
male_row_count = sex_label_counts.get('male', 0)
female_row_count = sex_label_counts.get('female', 0)

print('')
print('Number of actors:', male_row_count)
print('Number of actresses:', female_row_count)
print('Total:', male_row_count + female_row_count)


Search for an actor or actress:


Text(value='', description='Search:', layout=Layout(width='50%'), placeholder='Type a name...')

Output()


Complete list of actors and actresses:
Adrien Brody
Al Pacino
Alec Guinness
Anna Magnani
Anne Bancroft
Anthony Hopkins
Art Carney
Audrey Hepburn
Barbra Streisand
Ben Kingsley
Bette Davis
Bing Crosby
Brendan Fraser
Brie Larson
Broderick Crawford
Burt Lancaster
Casey Affleck
Charles Laughton
Charlize Theron
Charlton Heston
Cher
Cillian Murphy
Clark Gable
Claudette Colbert
Cliff Robertson
Colin Firth
Daniel Day-Lewis
Denzel Washington
Diane Keaton
Dustin Hoffman
Elizabeth Taylor
Ellen Burstyn
Emil Jannings
Emma Stone
Emma Thompson
Ernest Borgnine
F. Murray Abraham
Faye Dunaway
Forest Whitaker
Frances McDormand
Fredric March
Gary Cooper
Gary Oldman
Gene Hackman
Geoffrey Rush
George Arliss
George C. Scott
Geraldine Page
Ginger Rogers
Glenda Jackson
Grace Kelly
Greer Garson
Gregory Peck
Gwyneth Paltrow
Halle Berry
Helen Hayes
Helen Hunt
Helen Mirren
Henry Fonda
Hilary Swank
Holly Hunter
Humphrey Bogart
Ingrid Bergman
Jack Lemmon
Jack Nicholson
James Cagney
James Stewart
Jamie Foxx
Jane Fond

In [7]:
# Prepare data for box office revenue evolution.
# Only the two selected columns remain, so a single dropna() already removes
# every row with a missing revenue or a missing release year; the former second
# pass restricted to 'Movie release year' could not drop anything further.
oscar_revenue_data_corrected = oscar_act_movies[['Movie box office revenue', 'Movie release year']].dropna()

# Group by year and calculate total box office revenue
oscar_revenue_data_corrected['Year'] = oscar_revenue_data_corrected['Movie release year']
revenue_by_year_corrected = (
    oscar_revenue_data_corrected
    .groupby('Year')['Movie box office revenue']
    .sum()
    .reset_index()
)

# Create a line chart (one point per release year, smoothed with a spline)
fig = px.line(
    revenue_by_year_corrected,
    x='Year',
    y='Movie box office revenue',
    title='Evolution of Movie Box Office Revenue Over the Years for Movies with an Academy Award for Best Actor/Actress',
    labels={'Year': 'Year', 'Movie box office revenue': 'Box Office Revenue ($)'},
    line_shape='spline'
)

# Update layout for better readability; the y-axis title set here replaces the
# one derived from `labels`, and ticks are rendered as whole dollars.
fig.update_layout(
    xaxis=dict(title='Year'),
    yaxis=dict(title='Total Box Office Revenue ($)', tickformat='$,.0f'),
    width=1400,
    height=600
)

# The message now names the file actually loaded for new_film_dataset
# ('film_full_2.csv'); it previously read 'full_fim_2.csv'.
print('The filtered dataset changed the main statistics of the original film_full_2.csv dataset and especially the distribution of the box office revenue :')
fig.show()

The filtered dataset changed the main statistics of the original full_fim_2.csv dataset and especially the distribution of the box office revenue :


2) OSCAR_ACT_MOVIES_ALL

This dataset includes all movies featuring an actor or actress who has won an Oscar to date. The key information provided in this dataset includes:  
	•	Actor/Actress height  
	•	Actor/Actress ethnicity  
	•	Actor’s age at the time of the movie’s release  
	•	An Academy Award column indicating whether the actor/actress won an Oscar for that specific movie  
	•	The character name portrayed by the actor/actress in the movie  

This dataset is derived from the CMU dataset, which only contains movies released between 1928 and the end of 2012.  

A complete list of movies can be found below.  

In [8]:
# Attach per-role character data to each award row. 'sexLabel' is dropped first
# (per the original note, it duplicates information held in character.csv).
oscar_act_info = (
    oscar_act_movies
    .drop(columns=['sexLabel'])
    .merge(
        character[['Wikipedia movie ID', 'Actor name', 'Actor age at movie release', 'Actor date of birth', 'Actor ethnicity', 'Character name', 'Freebase actor ID']],
        on=['Wikipedia movie ID', 'Actor name'],
    )
)

# Every character row whose 'Freebase actor ID' belongs to one of those
# performers, with the movie-level columns taken from movie_cmu (left join).
oscar_act_movies_all = (
    character[character['Freebase actor ID'].isin(oscar_act_info['Freebase actor ID'])]
    .drop(columns=['Freebase movie ID', 'Movie release date'])
    .merge(movie_cmu, on='Wikipedia movie ID', how='left')
)

# Unique (Actor name, Wikipedia movie ID) pairs taken from the award rows.
best_act_movies = set(
    zip(oscar_act_info['Actor name'], oscar_act_info['Wikipedia movie ID'])
)

# Flag each row whose (actor, movie) pair is one of the award pairs above;
# this covers actors and actresses alike.
oscar_act_movies_all['Academy Award'] = oscar_act_movies_all.apply(
    lambda row: (row['Actor name'], row['Wikipedia movie ID']) in best_act_movies,
    axis=1
)


# NOTE(review): the figure printed as "number of movies" is the row count of
# the table (one row per character entry), not a count of distinct titles.
print('OSCAR_ACT_MOVIES_ALL Columns:', oscar_act_movies_all.columns)
print('')
print('OSCAR_ACT_MOVIES_ALL number of movies:', len(oscar_act_movies_all))

OSCAR_ACT_MOVIES_ALL Columns: Index(['Wikipedia movie ID', 'Character name', 'Actor date of birth',
       ' Actor gender', 'Actor height)', 'Actor ethnicity', 'Actor name',
       'Actor age at movie release', 'Freebase character',
       'Freebase character ID', 'Freebase actor ID', 'Freebase movie ID',
       'Movie name', 'Movie release date', 'Movie box office revenue',
       'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres',
       'Academy Award'],
      dtype='object')

OSCAR_ACT_MOVIES_ALL number of movies: 6867


In [9]:
def load_and_display(
    df,
    column_name,
    *,
    search_prompt="Search for a movie:",
    placeholder="Type the name of a movie...",
    empty_query_message="Enter a movie to search.",
    list_header="\nComplete list of movies:",
):
    """Show a live search box over one column's unique values, then print them all.

    This definition replaces the same-named function from an earlier cell, which
    uses actor/actress wording. The texts are keyword parameters so that either
    wording can be requested explicitly after this cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the values to browse (no file is read here).
    column_name : str
        Column whose distinct, non-missing values are listed.
    search_prompt, placeholder, empty_query_message, list_header : str, keyword-only
        User-facing texts; the defaults are the movie wording.

    Returns
    -------
    None
        The widget and the full list are displayed/printed as side effects.

    Notes
    -----
    Relies on ``widgets`` and ``Layout`` being in scope (this cell does not
    import them; an earlier cell imports them from ipywidgets) and on
    ``display``, which the notebook does not import explicitly (presumably the
    IPython-provided builtin -- confirm when running elsewhere).
    """
    # Cast to str so titles that were parsed as numbers still sort and lowercase.
    titles = sorted(df[column_name].dropna().astype(str).unique())
    # Lowercase once up front instead of on every keystroke.
    searchable = [(title, title.lower()) for title in titles]

    def search_name(search_term):
        """Return the entries containing ``search_term`` (case-insensitive)."""
        if not search_term.strip():
            return [empty_query_message]
        needle = search_term.lower()
        matches = [title for title, lowered in searchable if needle in lowered]
        return matches if matches else ["No matches found."]

    print(search_prompt)
    search_bar = widgets.Text(
        value="",
        placeholder=placeholder,
        description="Search:",
        layout=Layout(width='50%')
    )
    output = widgets.Output()

    def on_text_change(change):
        # change["new"] is the text box content after the edit.
        with output:
            output.clear_output(wait=True)
            for result in search_name(change["new"]):
                print(result)

    search_bar.observe(on_text_change, names='value')
    display(search_bar, output)

    print(list_header)
    for title in titles:
        print(title)


# Browse the distinct movie titles of the full filmography table.
load_and_display(df=oscar_act_movies_all, column_name='Movie name')


Search for a movie:


Text(value='', description='Search:', layout=Layout(width='50%'), placeholder='Type the name of a movie...')

Output()


Complete list of movies:
$9.99
'Neath the Arizona Skies
'night, Mother
*batteries not included
...And Justice for All
...First Do No Harm
...tick...tick...tick...
10
100 Films and a Funeral
10:30 P.M. Summer
11'09"01 September 11
11:14
12 Angry Men
13 Rue Madeleine
13th Child
15 Minutes
1870
1900
1919
2 Days in the Valley
20 Mule Team
20,000 Leagues Under the Sea
20,000 Years in Sing Sing
2010: The Year We Make Contact
21
21 Days
21 Grams
21 and a Wake-Up
24: Redemption
25th Hour
28 Days
3 Women
360
3:10 to Yuma
4 Devils
42nd Street
45 Minutes from Hollywood
55 Days at Peking
5th Ave Girl
633 Squadron
7 Women
8 Million Ways to Die
84 Charing Cross Road
88 Minutes
8mm
99 Francs
A Beautiful Mind
A Bed Among the Lentils
A Big Hand for the Little Lady
A Bill of Divorcement
A Breath of Scandal
A Bridge Too Far
A Bronx Tale
A Bug's Life
A Certain Smile
A Change of Seasons
A Child Is Waiting
A Chorus Line
A Chorus of Disapproval
A Christmas Carol
A Civil Action
A Clever Dummy
A Common Man
A 