In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

import time

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from datetime import datetime, timedelta

import requests
import pandas as pd
import json

In [None]:
url = 'https://today.yougov.com/ratings/entertainment/popularity/all-time-actors-actresses/all'
path = "Path" # Path to webdriver executable

chrome_options = Options()
chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# Use only chrome_options, not executable_path
driver = webdriver.Chrome(options=chrome_options)

driver.get(url)


# Decline cookies settings
try:
    decline_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'onetrust-reject-all-handler'))
    )

    decline_button.click()
except Exception as e:
    pass

# Scroll down to load more data
while True:
    # Scroll to the bottom of the page
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

    # Wait for a short interval to allow the content to load
    time.sleep(2)

    # Check if there are more actors loaded
    try:
        load_more_button = WebDriverWait(driver, 100).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'load-more-button'))
        )
        
        load_more_button.click()
    except TimeoutException:
        break

html = driver.page_source
driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Element containing the list of actors
actors_container = soup.find('div', class_='rankings-entities-list-container')

# Actor's list items
actor_items = actors_container.find_all('li', class_='ng-star-inserted')

# Create lists to store data
actor_names = []
popularity_percentages = []
fame_percentages = []

for actor_item in actor_items:
    # Extract the actor name
    actor_name = actor_item.find('img', class_='ng-star-inserted')['alt'] if actor_item.find('img', class_='ng-star-inserted') else None

    # Extract popularity percentage
    popularity_element = actor_item.find('span', class_='compact')
    popularity_percentage = popularity_element.text.split('%')[0] if popularity_element else None

    # Extract fame percentage
    fame_element = actor_item.find('span', class_='rankings-item-active')
    fame_percentage = fame_element.text.split('%')[0] if fame_element else None

    actor_names.append(actor_name)
    popularity_percentages.append(popularity_percentage)
    fame_percentages.append(fame_percentage)

data = {
    'Actor Name': actor_names,
    'Popularity (%)': popularity_percentages,
    'Fame (%)': fame_percentages
}

famous_actors = pd.DataFrame(data)

print(famous_actors)
famous_actors.to_csv('../Data/preprocessed_data/famous_actors_db.csv', index=False)


# Movie trajectory for actors

In [13]:
# This database contains movie-actor pairs data
tmdb_id2credit = json.load(open('../Data/tmdb_resources/tmdb_id2credit_full.json'))
id2credit_df = pd.DataFrame.from_dict(tmdb_id2credit, orient='index')
id2credit_df = id2credit_df.drop(columns='crew')

# This database contains revenue and imdb ratings information for movies
tmdb_id2credit_imdb_rating = json.load(open('../Data/tmdb_resources/tmdb_id2detail_imdb_rating.json'))
imdb_rating_df = pd.DataFrame.from_dict(tmdb_id2credit_imdb_rating, orient='index')

# Get list of actors of interest that appeared in movies in 2003 and later
actors_df = pd.read_csv('../Data/preprocessed_data/tmdb_acting_in_2003_and_later.csv')

In [25]:
import numpy as np
def get_portion_by_order_logscale(order):
    return np.power(np.e, 0.7756578 - 0.04791*order)

In [30]:
tmdb_id2portions_df = pd.read_csv('../Data/preprocessed_data/tmdb_id2portion.csv')
# rename tmdb_id to movie_id and order to cmu_order
tmdb_id2portions_df = tmdb_id2portions_df.rename(columns={'tmdb_id': 'movie_id'})
tmdb_id2portions_df = tmdb_id2portions_df[tmdb_id2portions_df['name'].isin(actors_df['name'])]
columns_to_drop = ['character', 'name', 'output_portion', 'method' ]
tmdb_id2portions_df = tmdb_id2portions_df.drop(columns=columns_to_drop)

# merge full_actors_df and tmdb_id2portions_df, on=['movie_id', 'actor_id']

display(tmdb_id2portions_df)

Unnamed: 0.1,Unnamed: 0,movie_id,actor_id,order,scaled_portion
0,0,37680,18702,0,49.763673
1,1,37680,23346,1,2.453297
2,2,37680,3197,2,0.315102
3,3,37680,2547,3,5.266712
5,5,37680,11160,5,0.045015
...,...,...,...,...,...
650048,650048,2668,27763,4,15.389398
650050,650050,2668,3796,8,2.564900
650053,650053,2668,5658,3,17.565677
650064,650064,1907,3063,3,30.780399


In [14]:
# Only keep the columns we need
columns_to_keep = ['id', 'original_title', 'release_date', 'original_language', 'genres', 'budget', 'revenue', 'imdb_average_rating']
imdb_rating_filtered_df = imdb_rating_df[columns_to_keep]
imdb_rating_filtered_df.reset_index(drop=True, inplace=True)

# Merge movie-actor pairs data with movie data
merged_df = pd.merge(imdb_rating_filtered_df, id2credit_df, on='id', how='inner')
print(f'The number of movies in the merged dataframe is {merged_df.shape[0]}')

The number of movies in the merged dataframe is 862639


In [15]:
# Check for empty lists in the 'cast' column and drop corresponding rows
merged_df_filtered = merged_df[merged_df['cast'].apply(lambda x: len(x) > 0)]
# Filter rows with empty release dates and Nan imdb_average_rating
merged_df_filtered = merged_df_filtered[(merged_df_filtered['release_date'] != '') & (merged_df_filtered['imdb_average_rating'].notna()) ] 
merged_df_filtered.reset_index(drop=True, inplace=True)
merged_df_filtered = merged_df_filtered.rename(columns={'id': 'movie_id'})

print(f'The number of movies with non empty information is {merged_df_filtered.shape[0]:,}')
display(merged_df_filtered)

The number of movies with non empty information is 331,376


Unnamed: 0,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating,cast
0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9,"[{'adult': False, 'gender': 1, 'id': 57395, 'k..."
1,263493,The Gangsters,1913-05-29,en,"[{'id': 35, 'name': 'Comedy'}]",0,0,6.8,"[{'adult': False, 'gender': 2, 'id': 1086663, ..."
2,50944,The Sorcerer's Apprentice,2002-04-12,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",0,0,4.2,"[{'adult': False, 'gender': 1, 'id': 46948, 'k..."
3,33592,White of the Eye,1987-06-19,en,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",0,0,6.1,"[{'adult': False, 'gender': 2, 'id': 18181, 'k..."
4,43839,Alexander's Ragtime Band,1938-05-24,en,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",2000000,4000000,6.8,"[{'adult': False, 'gender': 2, 'id': 10922, 'k..."
...,...,...,...,...,...,...,...,...,...
331371,369427,The American Werewolf Project,2015-06-30,en,"[{'id': 27, 'name': 'Horror'}]",0,0,4.2,"[{'adult': False, 'gender': 0, 'id': 1355468, ..."
331372,770135,Scandal in 97,2020-11-29,en,"[{'id': 53, 'name': 'Thriller'}]",0,0,4.9,"[{'adult': False, 'gender': 2, 'id': 1232652, ..."
331373,472471,Sniff!!!,2017-08-25,hi,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '...",0,0,5.3,"[{'adult': False, 'gender': 0, 'id': 1963881, ..."
331374,707790,マッハ’78,1978-02-25,ja,"[{'id': 28, 'name': 'Action'}]",0,0,6.1,"[{'adult': False, 'gender': 0, 'id': 2767640, ..."


### Cast preprocessing

In [29]:
# Creating full_actors_df DataFrame
full_actors_list = []

for _, row in merged_df_filtered.iterrows():
    for actor_info in row['cast']:
        actor_entry = {
            'gender': actor_info['gender'],
            'actor_id': actor_info['id'],
            'name': actor_info['name'],
            'popularity': actor_info['popularity'],
            'character': actor_info['character'],
            'order': actor_info['order'],
            'movie_id': row['movie_id'],
            'original_title': row['original_title'],
            'release_date': row['release_date'],
            'original_language': row['original_language'],
            'genres': row['genres'],
            'budget': row['budget'],
            'revenue': row['revenue'],
            'imdb_average_rating': row['imdb_average_rating']
        }
        full_actors_list.append(actor_entry)

full_actors_df = pd.DataFrame(full_actors_list)

# Filter full_actors_df to keep only the actors in actors_df
full_actors_df = full_actors_df[full_actors_df['name'].isin(actors_df['name'])]
full_actors_df.reset_index(drop=True, inplace=True)
print(f'The number of apparitions in movies of actors of interest is {full_actors_df.shape[0]:,}')
display(full_actors_df)

The number of apparitions in movies of actors of interest is 307,006


Unnamed: 0,gender,actor_id,name,popularity,character,order,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating
0,1,57395,Natasha Henstridge,49.626,Lt. Melanie Ballard,0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
1,2,9778,Ice Cube,24.423,James 'Desolation' Williams,1,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
2,1,2230,Pam Grier,22.806,Commander Helena Braddock,2,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
3,2,976,Jason Statham,199.055,Sgt. Jericho Butler,3,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
4,1,20387,Clea DuVall,21.220,Bashira Kincaid,4,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307001,1,8857,Debbie Reynolds,20.281,Self (archive footage),8,844511,Sœur Sourire : Qui a tué la voix de Dieu ?,2021-06-11,fr,"[{'id': 99, 'name': 'Documentary'}, {'id': 104...",0,0,7.1
307002,1,164094,Marin Ireland,15.430,Anna,0,589496,Megafauna,2010-06-04,en,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",0,0,6.7
307003,2,58769,Stephen Colbert,4.737,Self (archive footage),12,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6
307004,2,33663,Donald Trump,18.380,Self (archive footage),16,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6


In [31]:
full_actors_df.to_csv('../Data/preprocessed_data/full_actors_df.csv', index=False)

In [39]:
# Add order and scaled_portion columns from tmdb_id2portions_df to full_actors_df using movie_id and actor_id
# full_actors_df has a column named 'order' and tmdb_id2portions_df has a column named 'cmu_order'
# if there is a match between movie_id and actor_id, add the order and scaled_portion columns to full_actors_df
# if there is no match, keep the value of order as the one in full_actors and perform this function to get scaled_portion get_portion_by_order_logscale(order)

def get_portion_by_order_logscale(order):
    return np.power(np.e, 0.7756578 - 0.04791*order)

# create column scaled_portion in full_actors_df by applying get_portion_by_order_logscale(order) to order
full_actors_df['scaled_portion'] = full_actors_df['order'].apply(get_portion_by_order_logscale)
# TODO if there is a match between movie_id and actor_id, replace the values of scaled_portion and order in full_actors_df with the ones in tmdb_id2portions_df


merged_df = full_actors_df.merge(tmdb_id2portions_df, on=['movie_id', 'actor_id'], how='left')


# Update values in full_actors_df with corresponding values from tmdb_id2portions_df
full_actors_df['scaled_portion'].update(merged_df['scaled_portion_y'])
full_actors_df['order'].update(merged_df['order_y'])

display(full_actors_df)

Unnamed: 0,gender,actor_id,name,popularity,character,order,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating,scaled_portion
0,1,57395,Natasha Henstridge,49.626,Lt. Melanie Ballard,0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9,54.665043
1,2,9778,Ice Cube,24.423,James 'Desolation' Williams,1,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9,5.700365
2,1,2230,Pam Grier,22.806,Commander Helena Braddock,2,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9,9.963459
3,2,976,Jason Statham,199.055,Sgt. Jericho Butler,3,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9,21.948843
4,1,20387,Clea DuVall,21.220,Bashira Kincaid,4,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9,0.219245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307001,1,8857,Debbie Reynolds,20.281,Self (archive footage),8,844511,Sœur Sourire : Qui a tué la voix de Dieu ?,2021-06-11,fr,"[{'id': 99, 'name': 'Documentary'}, {'id': 104...",0,0,7.1,1.480497
307002,1,164094,Marin Ireland,15.430,Anna,0,589496,Megafauna,2010-06-04,en,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",0,0,6.7,2.172020
307003,2,58769,Stephen Colbert,4.737,Self (archive footage),12,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6,1.222304
307004,2,33663,Donald Trump,18.380,Self (archive footage),16,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6,1.009139


In [48]:
# reorganize the dataframe by the name of the actor and the release date of the movie
full_actors_df = full_actors_df.sort_values(by=['name', 'release_date'])
full_actors_df.reset_index(drop=True, inplace=True)

# calculate the number of movies each actor has played in and add the column number_of_movies
number_of_movies = full_actors_df.groupby('name')['movie_id'].nunique()
number_of_movies_df = pd.DataFrame(number_of_movies)
number_of_movies_df.reset_index(inplace=True)
number_of_movies_df.rename(columns={'movie_id': 'number_of_movies'}, inplace=True)

# Group the actors by name and concatenate all their other features
grouped_actors_df = full_actors_df.groupby('name').agg({
    # get the first element of gender, since it is the same for all rows
    'gender': lambda x: x.iloc[0],
    'popularity': lambda x: list(x),
    'release_date': lambda x: list(x),
    'genres': lambda x: list(x),
    'budget': lambda x: list(x),
    'revenue': lambda x: list(x),
    'imdb_average_rating': lambda x: list(x),
    'order': lambda x: list(x),
    'scaled_portion': lambda x: list(x)
})

grouped_actors_df['career_length'] = grouped_actors_df['release_date'].apply(
    lambda x: (max(pd.to_datetime(x).year) - min(pd.to_datetime(x).year)) if len(x) >= 2 else 0)

# Add column number of movie from dataframe number_of_movies_df using the name as the key
grouped_actors_df = pd.merge(grouped_actors_df, number_of_movies_df, on='name', how='inner')
grouped_actors_df.reset_index(drop=True, inplace=True)
grouped_actors_df.to_csv('../Data/preprocessed_data/grouped_actors_db.csv', index=False)
# rename imdb_average_rating to imdb_ratings
grouped_actors_df = grouped_actors_df.rename(columns={'imdb_average_rating': 'imdb_ratings'})
display(grouped_actors_df)

Unnamed: 0,name,gender,popularity,release_date,genres,budget,revenue,imdb_ratings,order,scaled_portion,career_length,number_of_movies
0,"""Weird Al"" Yankovic",2,"[4.883, 13.081, 13.081, 13.081, 13.081, 5.609,...","[1985-09-25, 1988-10-21, 1988-12-02, 1989-07-2...","[[{'id': 35, 'name': 'Comedy'}, {'id': 10402, ...","[0, 3000000, 12000000, 5000000, 23000000, 0, 3...","[0, 343786, 78756177, 6157157, 86930411, 0, 51...","[7.5, 5.7, 7.6, 6.9, 6.9, 8.4, 6.5, 5.3, 8.9, ...","[0, 16, 11, 0, 32, 1, 35, 13, 0, 6, 3, 0, 0, 2...","[2.172020412396208, 0.2688172043010753, 0.0377...",38,52
1,50 Cent,2,"[10.69, 10.69, 10.69, 10.69, 10.69, 14.999, 11...","[2003-04-15, 2003-04-22, 2003-08-03, 2004-08-3...","[[{'id': 10402, 'name': 'Music'}, {'id': 99, '...","[0, 0, 0, 0, 0, 300000, 0, 40000000, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 46442528, 0, 0, 0, 0, 49...","[5.7, 4.4, 7.2, 6.8, 6.3, 7.4, 7.6, 5.4, 5.8, ...","[0, 0, 1, 1, 1, 1, 68, 0, 1, 0, 1, 2, 4, 11, 4...","[2.172020412396208, 2.172020412396208, 2.07041...",20,47
2,A Martinez,2,"[12.119, 15.752, 11.636, 12.119, 12.119, 12.11...","[1968-10-01, 1972-01-13, 1972-02-21, 1973-01-0...","[[], [{'id': 37, 'name': 'Western'}, {'id': 12...","[0, 6000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 7500000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.5, 7.4, 7.1, 5.7, 5.8, 5.4, 6.1, 6.5, 7.5, ...","[10, 11, 11, 2, 0, 7, 9, 1, 5, 4, 2, 5, 6, 7, ...","[1.3452203114842156, 1.2822903308751898, 1.282...",54,53
3,A.C. Peterson,2,"[18.63, 7.378, 8.201, 18.63, 8.201, 10.299, 7....","[1984-12-26, 1985-07-28, 1986-11-29, 1987-09-2...","[[{'id': 18, 'name': 'Drama'}, {'id': 10749, '...","[11000000, 0, 0, 0, 0, 0, 0, 16000000, 0, 0, 0...","[4385312, 0, 0, 1733017, 0, 0, 0, 4409328, 465...","[6.1, 4.7, 5.7, 5.9, 5.5, 6.0, 5.0, 6.2, 5.3, ...","[55, 18, 15, 32, 9, 8, 14, 3, 11, 17, 17, 20, ...","[0.1557672769529184, 0.9169317839048822, 1.058...",37,65
4,A.J. Buckley,2,"[14.75, 9.869, 9.655, 14.75, 9.655, 9.655, 9.6...","[1998-07-24, 1999-05-18, 1999-10-17, 2000-07-1...","[[{'id': 9648, 'name': 'Mystery'}, {'id': 27, ...","[15000000, 0, 0, 15000000, 0, 0, 0, 5000000, 0...","[17514980, 0, 0, 5217498, 0, 0, 0, 0, 1, 0, 0,...","[5.6, 3.7, 5.9, 4.6, 4.3, 4.8, 6.5, 5.3, 5.7, ...","[11, 2, 3, 13, 6, 12, 9, 9, 2, 4, 6, 19, 4, 1,...","[1.1524073558716077, 1.9735575952260926, 1.881...",20,30
...,...,...,...,...,...,...,...,...,...,...,...,...
8051,Óscar Jaenada,2,"[11.329, 11.329, 9.549, 8.092, 11.329, 7.667, ...","[2001-01-05, 2003-09-12, 2003-09-26, 2004-08-2...","[[{'id': 10749, 'name': 'Romance'}, {'id': 18,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40000000,...","[0, 0, 959831, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8638...","[6.5, 6.1, 7.6, 4.7, 5.8, 4.2, 6.0, 6.2, 5.7, ...","[4, 6, 0, 0, 3, 1, 0, 0, 0, 2, 0, 0, 7, 9, 23,...","[1.7932288110394172, 1.6293771088922553, 2.172...",22,40
8052,Özcan Deniz,2,"[14.049, 12.583, 12.175, 12.175, 15.729, 12.17...","[2002-12-06, 2003-03-27, 2003-10-16, 2004-02-2...","[[{'id': 35, 'name': 'Comedy'}], [{'id': 35, '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 5000000, 0, 0, 0, 0, 0, 0, 0, 0]","[4.6, 4.9, 3.6, 7.4, 1.5, 5.5, 5.2, 6.2, 5.0, ...","[2, 0, 0, 1, 1, 2, 0, 1, 0, 0, 5, 0, 1]","[1.9735575952260926, 0.2140026491817839, 2.172...",15,13
8053,Úrsula Corberó,1,"[53.548, 27.525, 27.525, 53.548, 27.525, 27.52...","[2011-12-28, 2013-02-05, 2013-04-06, 2013-11-1...","[[{'id': 27, 'name': 'Horror'}], [{'id': 35, '...","[4362560, 0, 0, 0, 0, 0, 869115, 0, 0, 0, 0, 8...","[0, 0, 0, 850259, 0, 0, 1018486, 0, 0, 0, 0, 3...","[3.7, 5.7, 3.8, 5.6, 5.8, 4.5, 6.6, 4.6, 5.5, ...","[1, 1, 0, 3, 4, 1, 7, 0, 2, 1, 0, 3]","[0.2674304152660129, 2.0704123700052235, 2.172...",10,12
8054,Şükrü Özyıldız,2,"[28.837, 24.124, 28.837, 28.837, 26.226, 24.12...","[2013-09-13, 2015-02-06, 2016-01-20, 2016-10-2...","[[{'id': 10749, 'name': 'Romance'}, {'id': 18,...","[0, 0, 0, 0, 1705235, 0, 0, 0, 0]","[0, 0, 0, 0, 3501037, 0, 0, 0, 0]","[4.7, 6.0, 5.3, 7.1, 7.0, 6.8, 5.3, 6.2, 4.9]","[1, 0, 1, 5, 9, 5, 1, 3, 1]","[2.0704123700052235, 2.172020412396208, 2.0704...",10,9


### Dividing datasets

In [108]:
# Divide the actors into three groups based on their career length
group1_df = grouped_actors_df[grouped_actors_df['career_length'] < 20]
group2_df = grouped_actors_df[(grouped_actors_df['career_length'] >= 20) & (grouped_actors_df['career_length'] < 40)]
group3_df = grouped_actors_df[grouped_actors_df['career_length'] >= 40]

In [122]:
# Defining constants 
avg_columns = ['budget', 'revenue', 'imdb_ratings', 'order', 'popularity', 'scaled_portion']
std_columns = ['order', 'budget', 'scaled_portion']

def genres_stats_per_period(row, start_column, end_column):
    """
    Calculates statistics related to movie genres for a specific actor during a specified time period.

    Parameters:
    - row (pd.Series): A row of a DataFrame containing information about an actor's movies.
    - start_column (str or None): The column representing the start date of the specified time period.
                                  If None, all movies are considered without a time constraint.
    - end_column (str): The column representing the end date of the specified time period.

    Returns:
    - list: A list containing two elements:
        1. The number of different genres an actor played in during the specified time period.
        2. The most frequent genre for the actor during the time period. If no genres are present, returns None.
    """

    # Initialize a dictionary to count the occurrences of each genre
    genres_counts = {}
    
    # Extract genres and release dates for movies associated with the actor
    genres_per_actor = row["genres"]
    release_dates_per_actor = row['release_date']
    
    # Iterate through movies and count genre occurrences based on the specified time period
    for genres_per_movie, release_date in zip(genres_per_actor, release_dates_per_actor):
        # For Group 1, start_column is missing as all members belong to the same time period
        if start_column is None or row[start_column] <= release_date <= row[end_column]:
            for genre in genres_per_movie:
                genres_counts[genre["name"]] = genres_counts.get(genre["name"], 0) + 1
    
    # Calculate genres diversity (number of different genres)
    n_genres = len(genres_counts)
    
    # Find the most frequent genre for an actor during the time period
    if n_genres > 0:
        main_genre = sorted(genres_counts.items(), key=lambda item: item[1])[-1][0]
    else:
        main_genre = None
    
    # Return a list with the calculated statistics
    return [n_genres, main_genre]

def calculate_average(df, col, iterations):
    """
    Calculates the average value for a specified column in a DataFrame over multiple time periods.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing information about actors and their movies.
    - col (str): The column for which the average is calculated.
    - iterations (int): The number of time periods to divide the actor's career.

    Returns:
    None
    """
    for i in range(iterations):
        df[f'avg_{col}_{i}'] = df.apply(lambda row: np.mean([val for val, date in zip(row[col], row['release_date']) if row[f'start_date_{i+1}'] <= date <= row[f'end_date_{i+1}']]), axis=1)
        
def calculate_std(df, col, iterations):
    """
    Calculates the standard deviation for a specified column in a DataFrame over multiple time periods.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing information about actors and their movies.
    - col (str): The column for which the standard deviation is calculated.
    - iterations (int): The number of time periods to divide the actor's career.

    Returns:
    None
    """
    for i in range(iterations):
        df[f'std_{col}_{i}'] = df.apply(lambda row: np.std([val for val, date in zip(row[col], row['release_date']) if row[f'start_date_{i+1}'] <= date <= row[f'end_date_{i+1}']]), axis=1)

def get_count_intervals(df, iterations):
    """
    Counts the number of movies within each time period for a DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing information about actors and their movies.
    - iterations (int): The number of time periods to divide the actor's career.

    Returns:
    None
    """
    for i in range(iterations):
        df[f'count_interval_{i+1}'] = df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row[f'start_date_{i+1}'], row[f'end_date_{i+1}'])]), axis=1)


def initialize_date_helpers(df, iterations):
    """
    Initializes date-related helper columns for time period analysis in a DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing information about actors and their movies.
    - iterations (int): The number of time periods to divide the actor's career.

    Returns:
    None
    """
    # Convert release_date to a list of datetime objects
    df['release_date'] = df['release_date'].apply(lambda x: [datetime.strptime(date, "%Y-%m-%d") for date in x])
    # Calculate the time interval for each period
    df['interval'] = (df['career_length'] // iterations) * 365



In [123]:
# code for group 1
group1_divided_df = group1_df.copy()

# calculate mean from full_actors_df of revenue and budget
avg_revenue = full_actors_df['revenue'].mean()
avg_budget = full_actors_df['budget'].mean()

# replace 0 values in revenue and budget with the mean. each element in revenue or budget is a list of values
# if one of those values is 0, replace it with the mean
group1_divided_df['revenue'] = group1_divided_df['revenue'].apply(lambda x: [avg_revenue if val == 0 else val for val in x])
group1_divided_df['budget'] = group1_divided_df['budget'].apply(lambda x: [avg_budget if val == 0 else val for val in x])

# calculate the mean in average and revenue and replace the Nan and 0 values with the mean
group1_divided_df['avg_revenue_0'] = group1_divided_df['revenue'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['avg_budget_0'] = group1_divided_df['budget'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['std_budget_0'] = group1_divided_df['budget'].apply(lambda x: np.std(x) if isinstance(x, list) else x)
group1_divided_df['avg_imdb_rating_0'] = group1_divided_df['imdb_ratings'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['avg_order_0'] = group1_divided_df['order'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['std_order_0'] = group1_divided_df['order'].apply(lambda x: np.std(x) if isinstance(x, list) else x)
group1_divided_df['avg_popularity_0'] = group1_divided_df['popularity'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['avg_scaled_portion_0'] = group1_divided_df['scaled_portion'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)


In [124]:
# calculate number of genres and main one for group 1
group1_divided_df[["n_genres", "main_genre"]] = group1_divided_df.apply(genres_stats_per_period, 
                                                                      start_column=None, end_column=None, 
                                                                      axis=1, result_type="expand")
 
group1_divided_df = group1_divided_df.drop(['release_date', 'budget', 'revenue', 'imdb_ratings', 'order', 'popularity', 'scaled_portion'], axis=1)

display(group1_divided_df)

Unnamed: 0,name,gender,genres,career_length,number_of_movies,avg_revenue_0,avg_budget_0,std_budget_0,avg_imdb_rating_0,avg_order_0,std_order_0,avg_popularity_0,avg_scaled_portion_0,n_genres,main_genre
7,Aaron Abrams,2,"[[{'id': 28, 'name': 'Action'}, {'id': 35, 'na...",17,32,3.119534e+07,1.512310e+07,1.632834e+07,6.059375,9.031250,14.092295,9.070594,2.844965,14,Drama
10,Aaron Dean Eisenberg,2,"[[{'id': 27, 'name': 'Horror'}, {'id': 878, 'n...",8,2,2.986808e+07,9.955799e+06,0.000000e+00,5.150000,2.500000,2.500000,15.179000,1.940681,4,Crime
11,Aaron Dessner,2,"[[{'id': 99, 'name': 'Documentary'}, {'id': 10...",14,4,2.243543e+07,9.955799e+06,0.000000e+00,8.025000,1.250000,0.829156,8.354000,2.047387,3,Music
14,Aaron Hill,2,"[[{'id': 35, 'name': 'Comedy'}], [{'id': 878, ...",8,11,1.025616e+08,2.629809e+07,5.526248e+07,5.072727,8.909091,6.598272,18.326818,2.624149,10,Comedy
15,Aaron Himelstein,2,"[[{'id': 18, 'name': 'Drama'}, {'id': 35, 'nam...",16,16,1.709818e+08,4.674076e+07,9.130570e+07,5.918750,15.687500,15.994994,17.487375,1.259751,13,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8050,Élodie Yung,1,"[[{'id': 18, 'name': 'Drama'}, {'id': 28, 'nam...",16,13,9.073018e+07,3.592574e+07,4.767198e+07,5.853846,8.615385,14.969397,22.092615,1.397435,9,Action
8052,Özcan Deniz,2,"[[{'id': 35, 'name': 'Comedy'}], [{'id': 35, '...",15,13,2.795515e+07,9.955799e+06,1.862645e-09,5.153846,1.000000,1.358732,12.655308,3.567478,7,Drama
8053,Úrsula Corberó,1,"[[{'id': 27, 'name': 'Horror'}], [{'id': 35, '...",10,12,2.563715e+07,1.523616e+07,2.211742e+07,5.425000,1.916667,1.977302,36.653333,1.839761,11,Comedy
8054,Şükrü Özyıldız,2,"[[{'id': 10749, 'name': 'Romance'}, {'id': 18,...",10,9,2.693841e+07,9.039070e+06,2.592902e+06,5.922222,2.888889,2.766644,27.499556,1.907203,8,Comedy


In [125]:
# code for group 2
group2_divided_df = group2_df.copy()

initialize_date_helpers(group2_divided_df, 2)

# Initialize columns for date ranges
group2_divided_df['start_date_1'] = group2_divided_df.apply(lambda row: min(row['release_date']), axis=1)
group2_divided_df['end_date_1'] = group2_divided_df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
group2_divided_df['start_date_2'] = group2_divided_df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
group2_divided_df['end_date_2'] = group2_divided_df.apply(lambda row: max(row['release_date']), axis=1)

get_count_intervals(group2_divided_df, 2)

# Define the lambda function as a constant
mean_transformer_group_2 = lambda x: [
        np.mean(x[:group2_divided_df['count_interval_1'].iloc[0]]) if (val == 0 or pd.isna(val)) else val 
        for val in x[:group2_divided_df['count_interval_1'].iloc[0]]
    ] + [ 
        np.mean(x[group2_divided_df['count_interval_1'].iloc[0]:]) if (val == 0 or pd.isna(val)) else val 
        for val in x[group2_divided_df['count_interval_1'].iloc[0]:]
    ] if isinstance(x, list) else x

group2_divided_df['revenue'] = group2_divided_df['revenue'].apply(mean_transformer_group_2)
group2_divided_df['budget'] = group2_divided_df['budget'].apply(mean_transformer_group_2)

for col in avg_columns:
    calculate_average(group2_divided_df, col, 2)

for col in std_columns:
    calculate_std(group2_divided_df, col, 2)


In [119]:
# calculate number of genres and main one for the first time period
group2_divided_df[["n_genres_0", "main_genre_0"]] = group2_divided_df.apply(genres_stats_per_period, 
                                              start_column="start_date_1", 
                                              end_column="end_date_1", 
                                              axis=1, result_type="expand")
# calculate number of genres and main one for the second time period
group2_divided_df[["n_genres_1", "main_genre_1"]] = group2_divided_df.apply(genres_stats_per_period, 
                                              start_column="start_date_2", 
                                              end_column="end_date_2", 
                                              axis=1, result_type="expand")
 
 # Drop unnecessary columns
group2_divided_df = group2_divided_df.drop(['release_date', 'budget', 'revenue', 'imdb_ratings', 'order', 'popularity', 'interval','scaled_portion'], axis=1)
display(group2_divided_df)

Unnamed: 0,name,gender,genres,career_length,number_of_movies,start_date_1,end_date_1,start_date_2,end_date_2,count_interval_1,...,std_order_0,std_order_1,std_budget_0,std_budget_1,std_scaled_portion_0,std_scaled_portion_1,n_genres_0,main_genre_0,n_genres_1,main_genre_1
0,"""Weird Al"" Yankovic",2,"[[{'id': 35, 'name': 'Comedy'}, {'id': 10402, ...",38,52,1985-09-25,2004-09-20,2004-09-20,2023-03-11,16,...,10.899935,14.854516,9.398928e+06,5.637931e+06,10.730810,0.604905,9.0,Comedy,12,Comedy
1,50 Cent,2,"[[{'id': 10402, 'name': 'Music'}, {'id': 99, '...",20,47,2003-04-15,2013-04-12,2013-04-12,2023-09-15,31,...,15.418410,10.734728,1.441294e+07,2.691620e+07,20.873328,0.494129,11.0,Drama,9,Action
3,A.C. Peterson,2,"[[{'id': 18, 'name': 'Drama'}, {'id': 10749, '...",37,65,1984-12-26,2002-12-22,2002-12-22,2021-11-25,31,...,10.213435,9.101931,1.566100e+07,1.789655e+07,5.552407,1.973435,13.0,Drama,14,Drama
4,A.J. Buckley,2,"[[{'id': 9648, 'name': 'Mystery'}, {'id': 27, ...",20,30,1998-07-24,2008-07-21,2008-07-21,2018-04-20,23,...,8.801315,3.374575,2.043894e+07,6.092845e+07,6.988732,15.322809,15.0,Drama,11,Drama
5,A.J. Cook,1,"[[{'id': 28, 'name': 'Action'}, {'id': 12, 'na...",22,20,1997-01-29,2008-01-27,2008-01-27,2019-04-09,14,...,4.778780,5.335937,8.573666e+06,4.099458e+06,12.372910,0.604581,12.0,Horror,7,Drama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8039,Zoe McLellan,1,"[[{'id': 18, 'name': 'Drama'}], [{'id': 18, 'n...",25,15,1994-10-14,2006-10-11,2006-10-11,2019-11-22,9,...,11.333333,2.211083,1.375177e+07,3.726780e+05,4.897721,0.194322,9.0,Drama,7,Thriller
8041,Zoe Saldaña,1,"[[{'id': 18, 'name': 'Drama'}, {'id': 10402, '...",23,62,2000-05-12,2011-05-10,2011-05-10,2023-09-13,27,...,5.349000,17.543741,5.537850e+07,1.160016e+08,8.855613,3.884243,13.0,Comedy,16,Adventure
8043,Zooey Deschanel,1,"[[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'na...",24,47,1999-09-24,2011-09-21,2011-09-21,2023-10-12,35,...,3.652676,10.435982,2.532732e+07,4.035433e+07,9.075384,0.547771,15.0,Comedy,12,Music
8044,Zosia Mamet,1,"[[{'id': 10770, 'name': 'TV Movie'}, {'id': 18...",29,26,1994-08-04,2008-07-31,2008-07-31,2023-10-12,3,...,5.436502,8.239102,9.074537e+06,1.840423e+06,0.288632,1.354696,7.0,Drama,12,Comedy


In [None]:
group3_divided_df = group3_df.copy()

initialize_date_helpers(group3_divided_df, 3)

# Initialize columns for date ranges
group3_divided_df['start_date_1'] = group3_divided_df.apply(lambda row: min(row['release_date']), axis=1)
group3_divided_df['end_date_1'] = group3_divided_df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
group3_divided_df['start_date_2'] = group3_divided_df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
group3_divided_df['end_date_2'] = group3_divided_df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval'] * 2), axis=1)
group3_divided_df['start_date_3'] = group3_divided_df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval'] * 2 ), axis=1)
group3_divided_df['end_date_3'] = group3_divided_df.apply(lambda row: max(row['release_date']), axis=1)

# Count the number of release dates within each time slot
group3_divided_df['count_interval_1'] = group3_divided_df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_1'], row['end_date_1'])]), axis=1)
group3_divided_df['count_interval_2'] = group3_divided_df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_2'], row['end_date_2'])]), axis=1)
group3_divided_df['count_interval_3'] = group3_divided_df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_3'], row['end_date_3'])]), axis=1)

# Define the lambda function as a constant
mean_transformer_group_3 = lambda x: [
            np.mean(x[:group3_divided_df['count_interval_1'].iloc[0]]) if (val == 0 or pd.isna(val)) else val
            for val in x[:group3_divided_df['count_interval_1'].iloc[0]]
        ] + [
            np.mean(x[group3_divided_df['count_interval_1'].iloc[0]:group3_divided_df['count_interval_2'].iloc[0]])
            if (val == 0 or pd.isna(val)) else val
            for val in x[group3_divided_df['count_interval_1'].iloc[0]:group3_divided_df['count_interval_2'].iloc[0]]
        ] + [
            np.mean(x[group3_divided_df['count_interval_2'].iloc[0]:]) if (val == 0 or pd.isna(val)) else val
            for val in x[group3_divided_df['count_interval_2'].iloc[0]:]
        ] if isinstance(x, list) else x

group3_divided_df['revenue'] = group3_divided_df['revenue'].apply(mean_transformer_group_3)
group3_divided_df['budget'] = group3_divided_df['budget'].apply(mean_transformer_group_3)

# Calculate the average budget, revenue, IMDb rating, popularity and order for each period
for col in avg_columns:
    calculate_average(group3_divided_df, col, 3)

# Calculate the standard deviation of budget and order for each period
for col in std_columns:
    calculate_std(group3_divided_df, col, 3)

In [117]:
# calculate number of genres and main one for the each time period
group3_divided_df[["n_genres_0", "main_genre_0"]] = group3_divided_df.apply(genres_stats_per_period, 
                                              start_column="start_date_1", 
                                              end_column="end_date_1", 
                                              axis=1, result_type="expand")
group3_divided_df[["n_genres_1", "main_genre_1"]] = group3_divided_df.apply(genres_stats_per_period, 
                                              start_column="start_date_2", 
                                              end_column="end_date_2", 
                                              axis=1, result_type="expand")
group3_divided_df[["n_genres_2", "main_genre_2"]] = group3_divided_df.apply(genres_stats_per_period, 
                                              start_column="start_date_3", 
                                              end_column="end_date_3", 
                                              axis=1, result_type="expand")
 
# Drop unnecessary columns
group3_divided_df = group3_divided_df.drop(['release_date', 'budget', 'revenue', 'imdb_ratings', 'order', 'popularity', 'interval', 'scaled_portion'], axis=1)
display(group3_divided_df)


Unnamed: 0,name,gender,genres,career_length,number_of_movies,start_date_1,end_date_1,start_date_2,end_date_2,start_date_3,...,std_budget_2,std_scaled_portion_0,std_scaled_portion_1,std_scaled_portion_2,n_genres_0,main_genre_0,n_genres_1,main_genre_1,n_genres_2,main_genre_2
2,A Martinez,2,"[[], [{'id': 37, 'name': 'Western'}, {'id': 12...",54,53,1968-10-01,1986-09-27,1986-09-27,2004-09-22,2004-09-22,...,9.603345e+06,0.287090,8.464379,21.454497,11.0,Drama,12.0,Drama,12.0,Drama
6,Aamir Khan,2,"[[{'id': 10402, 'name': 'Music'}, {'id': 28, '...",49,61,1973-02-18,1989-02-14,1989-02-14,2005-02-10,2005-02-10,...,1.361822e+07,15.373898,26.233021,17.962109,4.0,Drama,11.0,Drama,13.0,Drama
31,Abby Dalton,1,"[[{'id': 80, 'name': 'Crime'}, {'id': 10402, '...",51,15,1957-04-01,1974-03-28,1974-03-28,1991-03-24,1991-03-24,...,3.755523e+04,0.635700,0.246530,0.218109,8.0,Drama,5.0,Science Fiction,4.0,Horror
35,Abe Vigoda,2,"[[{'id': 18, 'name': 'Drama'}], [{'id': 18, 'n...",43,44,1965-11-10,1979-11-07,1979-11-07,1993-11-03,1993-11-03,...,8.504267e+06,0.512652,0.775737,0.852583,8.0,TV Movie,14.0,Comedy,12.0,Comedy
42,Adam Arkin,2,"[[{'id': 18, 'name': 'Drama'}, {'id': 35, 'nam...",52,48,1969-01-01,1985-12-28,1985-12-28,2002-12-24,2002-12-24,...,1.598163e+07,0.389605,1.855158,0.469792,8.0,Comedy,11.0,Drama,10.0,Drama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7955,Yorgo Voyagis,2,"[[{'id': 18, 'name': 'Drama'}], [{'id': 18, 'n...",57,38,1964-12-14,1983-12-10,1983-12-10,2002-12-05,2002-12-05,...,5.030202e+06,0.782055,0.512323,0.617594,9.0,Drama,15.0,Drama,4.0,Drama
7964,Yuen Wah,2,"[[{'id': 28, 'name': 'Action'}], [{'id': 18, '...",61,152,1962-12-05,1982-11-30,1982-11-30,2002-11-25,2002-11-25,...,3.080774e+07,1.661940,4.281629,0.626003,10.0,Action,12.0,Action,14.0,Action
7980,Yuko Tanaka,1,"[[{'id': 18, 'name': 'Drama'}], [{'id': 18, 'n...",42,36,1981-01-08,1995-01-05,1995-01-05,2009-01-01,2009-01-01,...,0.000000e+00,8.587393,6.738326,0.250652,7.0,Drama,5.0,Drama,5.0,Drama
8046,Zoë Wanamaker,1,"[[{'id': 18, 'name': 'Drama'}, {'id': 10752, '...",46,36,1973-04-19,1988-04-15,1988-04-15,2003-04-12,2003-04-12,...,6.867499e+06,0.257594,0.684702,0.891459,6.0,Drama,9.0,Drama,12.0,Drama


In [120]:
# save the dataframes into csv files 
group1_divided_df.to_csv('../Data/preprocessed_data/group1_divided_df.csv', index=True)
group2_divided_df.to_csv('../Data/preprocessed_data/group2_divided_df.csv', index=True)
group3_divided_df.to_csv('../Data/preprocessed_data/group3_divided_df.csv', index=True)
# merge all the dataframes into one
merged_grouped_actors = pd.concat([group1_divided_df, group2_divided_df, group3_divided_df])
merged_grouped_actors.to_csv('../Data/preprocessed_data/merged_grouped_actors_db.csv', index=True)

## 3. Plot visualization

In [126]:
import pandas as pd
full_actors_filtered_df = pd.read_csv('../Data/preprocessed_data/full_actors_filtered_db.csv')

group1_df = pd.read_csv('../Data/preprocessed_data/group1_divided_df.csv')
group2_df = pd.read_csv('../Data/preprocessed_data/group2_divided_df.csv')
group3_df = pd.read_csv('../Data/preprocessed_data/group3_divided_df.csv')

In [127]:
# Defining constants
rgba_light_yellow = (255, 255, 100, 0.5)
rgba_light_orange = (255, 165, 0, 0.5)
rgba_light_red = (240, 128, 128, 0.5)

# Defining functions
def create_px_figure(actor_movies_sorted: pd.DataFrame, actor_name: str) -> px.scatter:
    """
    Create a Plotly Express scatter plot figure based on actor movie data.

    Parameters:
    - actor_movies_sorted (pd.DataFrame): DataFrame containing sorted movie data for an actor.
    - actor_name (str): The name of the actor for whom the plot is generated.

    Returns:
    - px.scatter: A Plotly Express scatter plot figure.
    """
    # Create a scatter plot figure using Plotly Express
    fig = px.scatter(actor_movies_sorted, x='release_date', y='imdb_average_rating',
                     hover_data=['original_title', 'order'],
                     labels={'imdb_average_rating': 'IMDb Rating', 'release_date': 'Release Year', 'original_title': 'Movie name', 'order': 'Order'},
                     title=f'Career Evolution based on Movie Ratings for Actor {actor_name}')

    return fig


def update_figure_display(fig: go.Figure, actor_movies_sorted: pd.DataFrame, start_date, end_date):
    """
    Update the display of a Plotly figure based on actor movie data.

    Parameters:
    - fig (go.Figure): The Plotly figure to be updated.
    - actor_movies_sorted (pd.DataFrame): DataFrame containing sorted movie data for an actor.
    - start_date: The starting date for the x-axis range.
    - end_date: The ending date for the x-axis range.

    Returns:
    - None
    """
    # Set specific colors based on the order of the datapoint
    fig.update_traces(marker=dict(color=actor_movies_sorted['order'].map({
        0: 'red', 1: 'red',
        2: 'yellowgreen', 3: 'yellowgreen', 4: 'yellowgreen', 5: 'yellowgreen'
    })))

    # Set the range of the x-axis
    fig.update_layout(xaxis=dict(range=[start_date, end_date]))


def add_shape(fig: go.Figure, x_start, x_end, color: tuple):
    """
    Add a rectangular shape to the given Plotly figure.

    Parameters:
    - fig (go.Figure): The Plotly figure to which the shape will be added.
    - x_start: The starting x-coordinate of the rectangle.
    - x_end: The ending x-coordinate of the rectangle.
    - color (tuple): A tuple representing the color in the format (R, G, B, Alpha).

    Returns:
    - None
    """
    fig.add_shape(
        go.layout.Shape(
            type="rect",
            x0=x_start,
            x1=x_end,
            y0=0,
            y1=11,
            fillcolor=f"rgba{color}",
            opacity=0.5,
            layer="below",
            line_width=0,
        
        )
    )

def get_movies_sorted(actor_id: int) -> tuple:
    """
    Get sorted movies and actor name based on actor ID.

    Parameters:
    - actor_id (int): The unique identifier of the actor.

    Returns:
    - tuple: A tuple containing the sorted DataFrame of movies and the actor's name.
    """
    actor_movies_sorted = full_actors_df[full_actors_df['actor_id'] == actor_id].sort_values(by='release_date')
    actor_name = actor_movies_sorted['name'].values[0]

    return actor_movies_sorted, actor_name

def get_release_dates(actor_names: list, full_actors_filtered_df: pd.DataFrame) -> tuple:
    """
    Get the first and last release dates for movies of given actors.

    Parameters:
    - actor_names (list): A list of actor names for whom release dates are to be determined.
    - full_actors_filtered_df (pd.DataFrame): DataFrame containing movie data for all actors.

    Returns:
    - tuple: A tuple containing the first and last release dates in the format ('YYYY-MM-DD', 'YYYY-MM-DD').
    """
    # Get actor IDs for the given actor names
    actor_ids = [full_actors_df[full_actors_df['name'] == name]['actor_id'].iloc[0] for name in actor_names]

    # Get first and last release dates for each actor
    first_release_dates = [full_actors_df[full_actors_df['actor_id'] == actor_id]['release_date'].min() for actor_id in actor_ids]
    last_release_dates = [full_actors_df[full_actors_df['actor_id'] == actor_id]['release_date'].max() for actor_id in actor_ids]

    # Find the overall earliest and latest release dates
    first_release_date = min(first_release_dates)
    last_release_date = max(last_release_dates)

    # Adjust start and end dates by adding/subtracting 1 year
    start_date = datetime.strptime(first_release_date, '%Y-%m-%d') - timedelta(days=365)
    end_date = datetime.strptime(last_release_date, '%Y-%m-%d') + timedelta(days=365)

    # Convert back to string format
    first_release_date = start_date.strftime('%Y-%m-%d')
    last_release_date = end_date.strftime('%Y-%m-%d')

    return first_release_date, last_release_date

In [128]:
def plot_movies_ratings_1(actor_id, start_date, end_date):
    actor_movies_sorted, actor_name = get_movies_sorted(actor_id)
    fig = create_px_figure(actor_movies_sorted, actor_name)

    add_shape(fig, 
              actor_movies_sorted['release_date'].min(), 
              actor_movies_sorted['release_date'].max(),
              rgba_light_yellow)

    update_figure_display(fig, actor_movies_sorted, start_date, end_date)
    
    pio.write_html(fig, file='group1_example_career_evolution.html')
    
    fig.show()

In [129]:
def plot_movies_ratings_2(actor_id, start_date, end_date):
    actor_movies_sorted, actor_name = get_movies_sorted(actor_id)
    # get dates of separation
    date_middle = group2_df[group2_df['name'] == actor_name]['start_date_2'].values[0]
    
    fig = create_px_figure(actor_movies_sorted, actor_name)

    add_shape(fig,
                actor_movies_sorted['release_date'].min(),
                date_middle,
                rgba_light_yellow)

    add_shape(fig,
                date_middle,
                actor_movies_sorted['release_date'].max(),
                rgba_light_orange)


    update_figure_display(fig, actor_movies_sorted, start_date, end_date)

    # Save the figure as an HTML file
    pio.write_html(fig, file='group2_example_career_evolution.html')
    
    fig.show()


In [130]:
def plot_movies_ratings_3(actor_id, start_date, end_date):
    actor_movies_sorted, actor_name = get_movies_sorted(actor_id)
    # get dates of separation
    date_middle_1 = group3_df[group3_df['name'] == actor_name]['start_date_2'].values[0]
    date_middle_2 = group3_df[group3_df['name'] == actor_name]['start_date_3'].values[0]
    
    fig = create_px_figure(actor_movies_sorted, actor_name)

    add_shape(fig,
                actor_movies_sorted['release_date'].min(),
                date_middle_1,
                rgba_light_yellow)
    
    add_shape(fig,
                date_middle_1,
                date_middle_2,
                rgba_light_orange)

    add_shape(fig,
                date_middle_2,
                actor_movies_sorted['release_date'].max(),
                rgba_light_red)

    update_figure_display(fig, actor_movies_sorted, start_date, end_date)
    
    pio.write_html(fig, file='group3_example_career_evolution.html')
    
    fig.show()

In [131]:
# Example plots
actor_names = ['Zoe Lister-Jones', 'A.C. Peterson', 'Jackie Chan']
actor_ids = [full_actors_df[full_actors_df['name'] == name]['actor_id'].iloc[0] for name in actor_names]

first_release_date, last_release_date = get_release_dates(actor_names, full_actors_df)

plot_movies_ratings_1(actor_ids[0], first_release_date, last_release_date)
plot_movies_ratings_2(actor_ids[1], first_release_date, last_release_date)
plot_movies_ratings_3(actor_ids[2], first_release_date, last_release_date)