In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

import time

In [2]:
url = 'https://today.yougov.com/ratings/entertainment/popularity/all-time-actors-actresses/all'
path = "Path" # Path to webdriver executable

chrome_options = Options()
chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# Use only chrome_options, not executable_path
driver = webdriver.Chrome(options=chrome_options)

driver.get(url)


# Decline cookies settings
try:
    decline_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'onetrust-reject-all-handler'))
    )

    decline_button.click()
except Exception as e:
    pass

# Scroll down to load more data
while True:
    # Scroll to the bottom of the page
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

    # Wait for a short interval to allow the content to load
    time.sleep(2)

    # Check if there are more actors loaded
    try:
        load_more_button = WebDriverWait(driver, 100).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'load-more-button'))
        )
        
        load_more_button.click()
    except TimeoutException:
        break

html = driver.page_source
driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Element containing the list of actors
actors_container = soup.find('div', class_='rankings-entities-list-container')

# Actor's list items
actor_items = actors_container.find_all('li', class_='ng-star-inserted')

# Create lists to store data
actor_names = []
popularity_percentages = []
fame_percentages = []

for actor_item in actor_items:
    # Extract the actor name
    actor_name = actor_item.find('img', class_='ng-star-inserted')['alt'] if actor_item.find('img', class_='ng-star-inserted') else None

    # Extract popularity percentage
    popularity_element = actor_item.find('span', class_='compact')
    popularity_percentage = popularity_element.text.split('%')[0] if popularity_element else None

    # Extract fame percentage
    fame_element = actor_item.find('span', class_='rankings-item-active')
    fame_percentage = fame_element.text.split('%')[0] if fame_element else None

    actor_names.append(actor_name)
    popularity_percentages.append(popularity_percentage)
    fame_percentages.append(fame_percentage)

data = {
    'Actor Name': actor_names,
    'Popularity (%)': popularity_percentages,
    'Fame (%)': fame_percentages
}

famous_actors = pd.DataFrame(data)

print(famous_actors)
famous_actors.to_csv('../Data/preprocessed_data/famous_actors_db.csv', index=False)


KeyboardInterrupt: 

## Movie trajectory for actors

In [36]:
# extract a json file and transform it into a dataframe
import requests
import pandas as pd
import json

tmdb_id2credit = json.load(open('../Data/tmdb_resources/tmdb_id2credit_full.json'))
id2credit_df = pd.DataFrame.from_dict(tmdb_id2credit, orient='index')
id2credit_df = id2credit_df.drop(columns='crew')

tmdb_id2credit_imdb_rating = json.load(open('../Data/tmdb_resources/tmdb_id2detail_imdb_rating.json'))
imdb_rating_df = pd.DataFrame.from_dict(tmdb_id2credit_imdb_rating, orient='index')

In [None]:
columns_to_keep = ['id', 'original_title', 'release_date', 'original_language', 'genres', 'budget', 'revenue', 'imdb_average_rating']
imdb_rating_filtered_df = imdb_rating_df[columns_to_keep]
imdb_rating_filtered_df.reset_index(drop=True, inplace=True)

# Creating a dataframe with revenue that is different from 0
revenue_non_zero_df = imdb_rating_filtered_df[imdb_rating_filtered_df['revenue'] != 0]
print(f'The number of movies with revenue is {revenue_non_zero_df.shape[0]:,}')

# Creating a dataframe with imdb_average_rating different from NaN
rating_not_nan_df = imdb_rating_filtered_df.dropna(subset=['imdb_average_rating'])
print(f'The number of movies with imdb ratings is {rating_not_nan_df.shape[0]:,}')

# Display the updated DataFrame
display(imdb_rating_filtered_df)

The number of movies with revenue is 18,904
The number of movies with imdb ratings is 393,909


Unnamed: 0,id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating
0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
1,263493,The Gangsters,1913-05-29,en,"[{'id': 35, 'name': 'Comedy'}]",0,0,6.8
2,50944,The Sorcerer's Apprentice,2002-04-12,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",0,0,4.2
3,33592,White of the Eye,1987-06-19,en,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",0,0,6.1
4,43839,Alexander's Ragtime Band,1938-05-24,en,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",2000000,4000000,6.8
...,...,...,...,...,...,...,...,...
863077,1096279,Eva,2021-01-01,id,"[{'id': 99, 'name': 'Documentary'}]",0,0,
863078,1001995,勾魂令,2018-02-02,zh,"[{'id': 27, 'name': 'Horror'}]",0,0,
863079,823988,La Voix Humaine / L’Heure Espagnole,2021-03-26,en,[],0,0,
863080,1022210,How Science Changed Our World,2010-12-23,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,


Cast preprocessing

In [None]:
# Assuming 'id' is the common column between imdb_rating_filtered_df and id2credits
merged_df = pd.merge(imdb_rating_filtered_df, id2credit_df, on='id', how='inner')

# Filter rows with either revenue not equal to 0 or imdb_average_rating not NaN
merged_df_filtered = merged_df[(merged_df['revenue'] != 0) | (~merged_df['imdb_average_rating'].isna())]
merged_df_filtered.reset_index(drop=True, inplace=True)
merged_df_filtered = merged_df_filtered.rename(columns={'id': 'movie_id'})


# Now, you can use merged_df_filtered for further analysis or display the desired information
print(f'The number of movies with revenue or imdb ratings is {merged_df_filtered.shape[0]:,}')
display(merged_df_filtered)

The number of movies with revenue or imdb ratings is 396,556


Unnamed: 0,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating,cast
0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9,"[{'adult': False, 'gender': 1, 'id': 57395, 'k..."
1,263493,The Gangsters,1913-05-29,en,"[{'id': 35, 'name': 'Comedy'}]",0,0,6.8,"[{'adult': False, 'gender': 2, 'id': 1086663, ..."
2,50944,The Sorcerer's Apprentice,2002-04-12,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",0,0,4.2,"[{'adult': False, 'gender': 1, 'id': 46948, 'k..."
3,33592,White of the Eye,1987-06-19,en,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",0,0,6.1,"[{'adult': False, 'gender': 2, 'id': 18181, 'k..."
4,43839,Alexander's Ragtime Band,1938-05-24,en,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",2000000,4000000,6.8,"[{'adult': False, 'gender': 2, 'id': 10922, 'k..."
...,...,...,...,...,...,...,...,...,...
396551,770135,Scandal in 97,2020-11-29,en,"[{'id': 53, 'name': 'Thriller'}]",0,0,4.9,"[{'adult': False, 'gender': 2, 'id': 1232652, ..."
396552,472471,Sniff!!!,2017-08-25,hi,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '...",0,0,5.3,"[{'adult': False, 'gender': 0, 'id': 1963881, ..."
396553,707790,マッハ’78,1978-02-25,ja,"[{'id': 28, 'name': 'Action'}]",0,0,6.1,"[{'adult': False, 'gender': 0, 'id': 2767640, ..."
396554,267899,El fuego inolvidable,2014-09-11,es,"[{'id': 18, 'name': 'Drama'}]",0,865,7.8,"[{'adult': False, 'gender': 2, 'id': 91251, 'k..."


Movie detail has revenue and imdb ratings   
Movie credits has movie-actor pair data  

**Objective**: Given an actor's id, return a plot with an x axis of the movies they appread in sorted by year and a y axis of the movies' ratings.


## Dividing datasets

Divide the dataset into 3 
Features:
- Career length
Averages:
- order
- budget
- revenue
- tmdb popularity of movies
- imdb ratings of movies
- tmb vote average

order, budget, revenue, career length, tmdb popularity of movies, tmdb vote average, number of movies, popularity of actor

In [None]:
# transform acting_in_2003_and_later.csv into a dataframe
import pandas as pd
actors_df = pd.read_csv('../Data/preprocessed_data/tmdb_acting_in_2003_and_later.csv')
display(actors_df)

Unnamed: 0,name,genre_mean_weighted,number_of_movies
0,Gary Oldman,['Crime'],57.0
1,Florence Pugh,['Mystery'],10.0
2,Jason Statham,['Science Fiction'],45.0
3,Jackie Chan,['Crime'],62.0
4,Scarlett Johansson,['Science Fiction'],51.0
...,...,...,...
8053,Alice Isaaz,['Romance'],6.0
8054,Peter Cullen,['Science Fiction'],18.0
8055,Mary Crosby,['Family'],3.0
8056,Daisuke Namikawa,['Mystery'],12.0


In [None]:

# Creating full_actors_df DataFrame
full_actors_list = []

for _, row in merged_df_filtered.iterrows():
    for actor_info in row['cast']:
        actor_entry = {
            'gender': actor_info['gender'],
            'actor_id': actor_info['id'],
            'name': actor_info['name'],
            'popularity': actor_info['popularity'],
            'character': actor_info['character'],
            'order': actor_info['order'],
            'movie_id': row['movie_id'],
            'original_title': row['original_title'],
            'release_date': row['release_date'],
            'original_language': row['original_language'],
            'genres': row['genres'],
            'budget': row['budget'],
            'revenue': row['revenue'],
            'imdb_average_rating': row['imdb_average_rating']
        }
        full_actors_list.append(actor_entry)

full_actors_df = pd.DataFrame(full_actors_list)

# Display the resulting full_actors_df
display(full_actors_df)


Unnamed: 0,gender,actor_id,name,popularity,character,order,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating
0,1,57395,Natasha Henstridge,49.626,Lt. Melanie Ballard,0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
1,2,9778,Ice Cube,24.423,James 'Desolation' Williams,1,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
2,1,2230,Pam Grier,22.806,Commander Helena Braddock,2,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
3,2,976,Jason Statham,199.055,Sgt. Jericho Butler,3,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
4,1,20387,Clea DuVall,21.220,Bashira Kincaid,4,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4009996,0,3836492,Cheryl Walde,0.600,,4,707790,マッハ’78,1978-02-25,ja,"[{'id': 28, 'name': 'Action'}]",0,0,6.1
4009997,2,91251,Luis Fernando Peña,3.955,Francisco,0,267899,El fuego inolvidable,2014-09-11,es,"[{'id': 18, 'name': 'Drama'}]",0,865,7.8
4009998,0,1316106,Alejandra Guinea,0.600,Adela,1,267899,El fuego inolvidable,2014-09-11,es,"[{'id': 18, 'name': 'Drama'}]",0,865,7.8
4009999,0,1316107,Mariana Urritia,0.600,Xochitl,2,267899,El fuego inolvidable,2014-09-11,es,"[{'id': 18, 'name': 'Drama'}]",0,865,7.8


In [None]:
#display(full_actors_df)
#full_actors_df.to_csv('../Data/preprocessed_data/full_actors_db.csv', index=False)
full_actors_df = pd.read_csv('../Data/preprocessed_data/full_actors_db.csv')
# display(full_actors_df)

Unnamed: 0,gender,actor_id,name,popularity,character,order,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating
0,1,57395,Natasha Henstridge,49.626,Lt. Melanie Ballard,0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
1,2,9778,Ice Cube,24.423,James 'Desolation' Williams,1,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
2,1,2230,Pam Grier,22.806,Commander Helena Braddock,2,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
3,2,976,Jason Statham,199.055,Sgt. Jericho Butler,3,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
4,1,20387,Clea DuVall,21.220,Bashira Kincaid,4,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4009996,0,3836492,Cheryl Walde,0.600,,4,707790,マッハ’78,1978-02-25,ja,"[{'id': 28, 'name': 'Action'}]",0,0,6.1
4009997,2,91251,Luis Fernando Peña,3.955,Francisco,0,267899,El fuego inolvidable,2014-09-11,es,"[{'id': 18, 'name': 'Drama'}]",0,865,7.8
4009998,0,1316106,Alejandra Guinea,0.600,Adela,1,267899,El fuego inolvidable,2014-09-11,es,"[{'id': 18, 'name': 'Drama'}]",0,865,7.8
4009999,0,1316107,Mariana Urritia,0.600,Xochitl,2,267899,El fuego inolvidable,2014-09-11,es,"[{'id': 18, 'name': 'Drama'}]",0,865,7.8


In [38]:
import pandas as pd
# remove rows where the imdb_average_rating is NaN
# full_actors_df = full_actors_df.dropna(subset=['imdb_average_rating'])
# # Filter full_actors_df to keep only the actors in actors_df
# full_actors_df = full_actors_df[full_actors_df['name'].isin(actors_df['name'])]

# full_actors_df.reset_index(drop=True, inplace=True)
# full_actors_df.to_csv('../Data/preprocessed_data/full_actors_filtered_db.csv', index=False)
full_actors_df = pd.read_csv('../Data/preprocessed_data/full_actors_filtered_db.csv')
display(full_actors_df)

# calculate the number of movies each actor has played in and add the column number_of_movies
number_of_movies = full_actors_df.groupby('name')['movie_id'].nunique()
number_of_movies_df = pd.DataFrame(number_of_movies)
number_of_movies_df.reset_index(inplace=True)
number_of_movies_df.rename(columns={'movie_id': 'number_of_movies'}, inplace=True)
display(number_of_movies_df)

Unnamed: 0,gender,actor_id,name,popularity,character,order,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating
0,1,57395,Natasha Henstridge,49.626,Lt. Melanie Ballard,0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
1,2,9778,Ice Cube,24.423,James 'Desolation' Williams,1,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
2,1,2230,Pam Grier,22.806,Commander Helena Braddock,2,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
3,2,976,Jason Statham,199.055,Sgt. Jericho Butler,3,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
4,1,20387,Clea DuVall,21.220,Bashira Kincaid,4,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307595,1,8857,Debbie Reynolds,20.281,Self (archive footage),8,844511,Sœur Sourire : Qui a tué la voix de Dieu ?,2021-06-11,fr,"[{'id': 99, 'name': 'Documentary'}, {'id': 104...",0,0,7.1
307596,1,164094,Marin Ireland,15.430,Anna,0,589496,Megafauna,2010-06-04,en,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",0,0,6.7
307597,2,58769,Stephen Colbert,4.737,Self (archive footage),12,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6
307598,2,33663,Donald Trump,18.380,Self (archive footage),16,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6


Unnamed: 0,name,number_of_movies
0,"""Weird Al"" Yankovic",52
1,50 Cent,47
2,A Martinez,54
3,A.C. Peterson,65
4,A.J. Buckley,30
...,...,...
8051,Óscar Jaenada,40
8052,Özcan Deniz,13
8053,Úrsula Corberó,12
8054,Şükrü Özyıldız,9


In [75]:
# suppress rows with empty release date
full_actors_df = full_actors_df[full_actors_df['release_date'].notna()]
full_actors_df = full_actors_df[full_actors_df['release_date'] != '']
full_actors_df = full_actors_df[full_actors_df['release_date'] != '[]']
# suppress rows where release date is not of type str
full_actors_df = full_actors_df[full_actors_df['release_date'].apply(lambda x: isinstance(x, str))]
full_actors_df.reset_index(drop=True, inplace=True)

# reorganize the dataframe by the name of the actor and the release date of the movie
full_actors_df = full_actors_df.sort_values(by=['name', 'release_date'])
full_actors_df.reset_index(drop=True, inplace=True)

full_actors_df = pd.read_csv('../Data/preprocessed_data/full_actors_filtered_db.csv')

display(full_actors_df)

Unnamed: 0,gender,actor_id,name,popularity,character,order,movie_id,original_title,release_date,original_language,genres,budget,revenue,imdb_average_rating
0,1,57395,Natasha Henstridge,49.626,Lt. Melanie Ballard,0,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
1,2,9778,Ice Cube,24.423,James 'Desolation' Williams,1,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
2,1,2230,Pam Grier,22.806,Commander Helena Braddock,2,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
3,2,976,Jason Statham,199.055,Sgt. Jericho Butler,3,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
4,1,20387,Clea DuVall,21.220,Bashira Kincaid,4,10016,Ghosts of Mars,2001-08-24,en,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",28000000,14010832,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307595,1,8857,Debbie Reynolds,20.281,Self (archive footage),8,844511,Sœur Sourire : Qui a tué la voix de Dieu ?,2021-06-11,fr,"[{'id': 99, 'name': 'Documentary'}, {'id': 104...",0,0,7.1
307596,1,164094,Marin Ireland,15.430,Anna,0,589496,Megafauna,2010-06-04,en,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",0,0,6.7
307597,2,58769,Stephen Colbert,4.737,Self (archive footage),12,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6
307598,2,33663,Donald Trump,18.380,Self (archive footage),16,376261,Weiner,2016-05-20,en,"[{'id': 99, 'name': 'Documentary'}]",0,0,7.6


In [78]:
# regroup the actors by name and concatenate all their other features so that we have arrays in each one of them
import numpy as np
# Group the actors by name and concatenate all their other features
grouped_actors_df = full_actors_df.groupby('name').agg({
    # get the first element of gender, since it is the same for all rows
    'gender': lambda x: x.iloc[0],
    'popularity': lambda x: list(x),
    'release_date': lambda x: list(x),
    'genres': lambda x: list(x),
    'budget': lambda x: list(x),
    'revenue': lambda x: list(x),
    'imdb_average_rating': lambda x: list(x),
    'order': lambda x: list(x)
})

# remove empty or Nan values in the lists in release_date
grouped_actors_df['release_date'] = grouped_actors_df['release_date'].apply(
    lambda x: [i for i in x if str(i) != 'nan' and str(i) != '[]'])

# # sort values in release_date
# grouped_actors_df['release_date'] = grouped_actors_df['release_date'].apply(lambda x: sorted(x))

grouped_actors_df['career_length'] = grouped_actors_df['release_date'].apply(
    lambda x: (max(pd.to_datetime(x).year) - min(pd.to_datetime(x).year)) if len(x) >= 2 else 0)


# Add column number of movie from dataframe number_of_movies_df using the name as the key
grouped_actors_df = pd.merge(grouped_actors_df, number_of_movies_df, on='name', how='inner')
display(grouped_actors_df)

# get the actors with career length < 20
group1_df = grouped_actors_df[grouped_actors_df['career_length'] < 20]
# get the actors with career length >= 20 and < 40
group2_df = grouped_actors_df[(grouped_actors_df['career_length'] >= 20) & (grouped_actors_df['career_length'] < 40)]
# get the actors with career length >= 40
group3_df = grouped_actors_df[grouped_actors_df['career_length'] >= 40]


Unnamed: 0,name,gender,popularity,release_date,genres,budget,revenue,imdb_average_rating,order,career_length,number_of_movies
0,"""Weird Al"" Yankovic",2,"[13.081, 13.081, 5.609, 10.655, 13.081, 10.655...","[1996-05-24, 1988-10-21, 2003-02-09, 1999-11-2...","[[{'id': 35, 'name': 'Comedy'}, {'id': 28, 'na...","[18000000, 3000000, 0, 0, 30000000, 0, 5000000...","[27000000, 343786, 0, 0, 51132598, 0, 6157157,...","[5.3, 5.7, 8.7, 8.3, 6.5, 7.3, 6.9, 7.6, 4.3, ...","[13, 16, 0, 0, 35, 2, 0, 11, 6, 0, 13, 0, 8, 3...",38,52
1,50 Cent,2,"[14.999, 11.633, 10.69, 14.999, 10.69, 14.999,...","[2010-11-10, 2011-03-05, 2009-07-13, 2012-08-1...","[[{'id': 18, 'name': 'Drama'}, {'id': 35, 'nam...","[40000000, 0, 0, 11000000, 0, 0, 10000000, 100...","[60040976, 0, 0, 0, 0, 2566717, 0, 0, 0, 79498...","[6.5, 5.4, 4.4, 4.6, 4.4, 5.6, 3.8, 4.4, 6.8, ...","[59, 1, 1, 0, 0, 2, 0, 0, 1, 4, 0, 1, 0, 0, 0,...",20,47
2,A Martinez,2,"[15.752, 11.636, 15.752, 11.636, 11.636, 11.63...","[1989-12-08, 1980-04-28, 2011-01-29, 2000-01-2...","[[{'id': 35, 'name': 'Comedy'}], [{'id': 18, '...","[20000000, 0, 500000, 0, 0, 0, 11800000, 0, 0,...","[15400000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[5.7, 6.2, 3.0, 6.8, 6.7, 2.1, 5.7, 7.2, 5.8, ...","[6, 6, 2, 11, 4, 1, 5, 0, 1, 3, 3, 7, 15, 5, 1...",54,54
3,A.C. Peterson,2,"[18.63, 18.63, 18.63, 18.63, 18.63, 7.26, 18.6...","[1984-12-26, 1987-09-25, 2009-09-12, 1991-05-0...","[[{'id': 18, 'name': 'Drama'}, {'id': 10749, '...","[11000000, 0, 3500000, 0, 6500000, 0, 61000000...","[4385312, 1733017, 44462, 4654288, 12633747, 0...","[6.1, 5.9, 6.7, 5.3, 7.1, 5.4, 7.1, 6.0, 5.3, ...","[55, 32, 8, 11, 12, 18, 10, 8, 4, 5, 13, 10, 2...",37,65
4,A.J. Buckley,2,"[14.75, 9.655, 14.75, 14.75, 9.655, 14.75, 14....","[2007-01-01, 2001-02-16, 2006-02-05, 2000-07-1...","[[{'id': 28, 'name': 'Action'}, {'id': 53, 'na...","[1000000, 0, 1000000, 15000000, 0, 1000000, 15...","[0, 0, 0, 5217498, 0, 464000, 17514980, 0, 0, ...","[4.6, 6.5, 5.8, 4.6, 3.7, 6.6, 5.6, 5.2, 5.3, ...","[1, 9, 4, 13, 5, 4, 11, 4, 9, 6, 1, 42, 7, 3, ...",20,30
...,...,...,...,...,...,...,...,...,...,...,...
8051,Óscar Jaenada,2,"[9.549, 9.549, 11.329, 9.549, 9.549, 11.329, 7...","[2008-12-12, 2011-05-15, 2009-05-01, 2012-04-0...","[[{'id': 18, 'name': 'Drama'}, {'id': 36, 'nam...","[40000000, 379000000, 0, 20000000, 25000000, 0...","[8638163, 1045713802, 0, 16863583, 23580000, 0...","[6.8, 6.6, 6.2, 4.9, 6.2, 6.1, 4.2, 6.0, 5.2, ...","[7, 12, 9, 11, 5, 6, 1, 0, 1, 0, 0, 0, 23, 0, ...",22,40
8052,Özcan Deniz,2,"[12.175, 12.583, 12.175, 12.175, 12.175, 12.58...","[2004-02-20, 2003-03-27, 2012-11-02, 2003-10-1...","[[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'na...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5000000]","[7.4, 4.9, 5.0, 3.6, 5.7, 6.2, 6.1, 5.2, 6.0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 1, 2, 2, 5, 1]",15,13
8053,Úrsula Corberó,1,"[53.548, 27.525, 28.748, 53.548, 27.525, 53.54...","[2011-12-28, 2015-04-24, 2017-01-17, 2016-02-1...","[[{'id': 27, 'name': 'Horror'}], [{'id': 35, '...","[4362560, 0, 0, 869115, 0, 88000000, 0, 0, 0, ...","[0, 0, 0, 1018486, 0, 36964325, 0, 0, 850259, ...","[3.7, 4.5, 4.6, 6.6, 5.8, 5.4, 5.7, 3.8, 5.6, ...","[1, 1, 0, 7, 4, 3, 1, 0, 3, 0, 1, 2]",10,12
8054,Şükrü Özyıldız,2,"[28.837, 24.124, 28.837, 26.226, 28.837, 24.12...","[2016-10-28, 2015-02-06, 2023-07-14, 2018-01-0...","[[{'id': 18, 'name': 'Drama'}, {'id': 35, 'nam...","[0, 0, 0, 1705235, 0, 0, 0, 0, 0]","[0, 0, 0, 3501037, 0, 0, 0, 0, 0]","[7.1, 6.0, 4.9, 7.0, 6.2, 6.8, 5.3, 4.7, 5.3]","[5, 0, 1, 9, 3, 5, 1, 1, 1]",10,9


In [67]:
# code for group 1
group1_divided_df = group1_df.copy()

# calculate the mean in average and revenue and replace the Nan and 0 values with the mean
group1_divided_df['avg_revenue_0'] = group1_divided_df['revenue'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['avg_budget_0'] = group1_divided_df['budget'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['avg_imdb_rating_0'] = group1_divided_df['imdb_average_rating'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['avg_order_0'] = group1_divided_df['order'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)
group1_divided_df['avg_popularity_0'] = group1_divided_df['popularity'].apply(lambda x: np.mean(x) if isinstance(x, list) else x)

group1_divided_df = group1_divided_df.drop(['release_date', 'budget', 'revenue', 'imdb_average_rating', 'order', 'popularity'], axis=1)

display(group1_divided_df)


Unnamed: 0,name,gender,genres,career_length,number_of_movies,avg_revenue_0,avg_budget_0,avg_imdb_rating_0,avg_order_0,avg_popularity_0
7,Aaron Abrams,2,"[[{'id': 18, 'name': 'Drama'}], [{'id': 18, 'n...",17.0,32,9.727650e+06,7.656250e+06,6.059375,9.031250,9.070594
10,Aaron Dean Eisenberg,2,"[[{'id': 27, 'name': 'Horror'}, {'id': 878, 'n...",8.0,2,0.000000e+00,0.000000e+00,5.150000,2.500000,15.179000
11,Aaron Dessner,2,"[[{'id': 99, 'name': 'Documentary'}, {'id': 10...",14.0,4,3.436500e+04,0.000000e+00,8.025000,1.250000,8.354000
14,Aaron Hill,2,"[[{'id': 27, 'name': 'Horror'}], [{'id': 878, ...",8.0,11,8.083936e+07,2.177273e+07,5.072727,8.909091,18.326818
15,Aaron Himelstein,2,"[[{'id': 35, 'name': 'Comedy'}], [{'id': 18, '...",16.0,16,1.541810e+08,4.114062e+07,5.918750,15.687500,17.487375
...,...,...,...,...,...,...,...,...,...,...
8050,Élodie Yung,1,"[[{'id': 53, 'name': 'Thriller'}, {'id': 80, '...",16.0,13,7.234982e+07,3.133075e+07,5.853846,8.615385,22.092615
8052,Özcan Deniz,2,"[[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'na...",15.0,13,3.846154e+05,0.000000e+00,5.153846,1.000000,12.655308
8053,Úrsula Corberó,1,"[[{'id': 27, 'name': 'Horror'}], [{'id': 35, '...",10.0,12,3.236089e+06,7.769306e+06,5.425000,1.916667,36.653333
8054,Şükrü Özyıldız,2,"[[{'id': 18, 'name': 'Drama'}, {'id': 35, 'nam...",10.0,9,3.890041e+05,1.894706e+05,5.922222,2.888889,27.499556


In [79]:
from datetime import datetime, timedelta

# code for group 2
df = group2_df.copy()
# Convert release_date to a list of datetime objects
df['release_date'] = df['release_date'].apply(lambda x: [datetime.strptime(date, "%Y-%m-%d") for date in x])

# Calculate the time interval for each period
df['interval'] = (df['career_length'] // 2) * 365

# Initialize columns for date ranges
df['start_date_1'] = df.apply(lambda row: min(row['release_date']), axis=1)
df['end_date_1'] = df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
df['start_date_2'] = df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
df['end_date_2'] = df.apply(lambda row: max(row['release_date']), axis=1)

# Count the number of release dates within each time slot
df['count_interval_1'] = df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_1'], row['end_date_1'])]), axis=1)
df['count_interval_2'] = df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_2'], row['end_date_2'])]), axis=1)

# calculate the mean of the first count_interval_1 movies in revenue and replace the Nan and 0 values with the mean in these count_interval_1 movies
df['revenue'] = df['revenue'].apply(
    lambda x: [ np.mean(x[:df['count_interval_1'].iloc[0]]) if (val == 0 or pd.isna(val)) else val for val in x[:df['count_interval_1'].iloc[0]]] +
    [ np.mean(x[df['count_interval_1'].iloc[0]:]) if (val == 0 or pd.isna(val)) else val for val in x[df['count_interval_1'].iloc[0]:]] 
    if isinstance(x, list) else x
)

df['budget'] = df['budget'].apply(
    lambda x: [ np.mean(x[:df['count_interval_1'].iloc[0]]) if (val == 0 or pd.isna(val)) else val for val in x[:df['count_interval_1'].iloc[0]] ] +
    [ np.mean(x[df['count_interval_1'].iloc[0]:]) if (val == 0 or pd.isna(val)) else val for val in x[df['count_interval_1'].iloc[0]:]] 
    if isinstance(x, list) else x
)

# Calculate the average budget, revenue, IMDb rating, popularity and order for each period
df['avg_budget_0'] = df.apply(lambda row: np.mean([budget for budget, date in zip(row['budget'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_budget_1'] = df.apply(lambda row: np.mean([budget for budget, date in zip(row['budget'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_revenue_0'] = df.apply(lambda row: np.mean([revenue for revenue, date in zip(row['revenue'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_revenue_1'] = df.apply(lambda row: np.mean([revenue for revenue, date in zip(row['revenue'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
# replace values
df['avg_imdb_rating_0'] = df.apply(lambda row: np.mean([imdb_rating for imdb_rating, date in zip(row['imdb_average_rating'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_imdb_rating_1'] = df.apply(lambda row: np.mean([imdb_rating for imdb_rating, date in zip(row['imdb_average_rating'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_order_0'] = df.apply(lambda row: np.mean([order for order, date in zip(row['order'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_order_1'] = df.apply(lambda row: np.mean([order for order, date in zip(row['order'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_popularity_0'] = df.apply(lambda row: np.mean([popularity for popularity, date in zip(row['popularity'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_popularity_1'] = df.apply(lambda row: np.mean([popularity for popularity, date in zip(row['popularity'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)

# Drop unnecessary columns
df = df.drop(['release_date', 'budget', 'revenue', 'imdb_average_rating', 'order', 'popularity'], axis=1)

display(df)
group2_divided_df = df.copy()

Unnamed: 0,name,gender,genres,career_length,number_of_movies,interval,start_date_1,end_date_1,start_date_2,end_date_2,...,avg_budget_0,avg_budget_1,avg_revenue_0,avg_revenue_1,avg_imdb_rating_0,avg_imdb_rating_1,avg_order_0,avg_order_1,avg_popularity_0,avg_popularity_1
0,"""Weird Al"" Yankovic",2,"[[{'id': 35, 'name': 'Comedy'}, {'id': 28, 'na...",38,52,6935,1985-09-25,2004-09-20,2004-09-20,2023-03-11,...,8.746094e+06,3.531250e+06,2.379055e+07,4.214668e+06,6.937500,6.852778,7.937500,11.305556,8.860250,7.113139
1,50 Cent,2,"[[{'id': 18, 'name': 'Drama'}, {'id': 35, 'nam...",20,47,3650,2003-04-15,2013-04-12,2013-04-12,2023-09-15,...,1.535717e+07,2.884480e+07,2.027896e+07,5.764001e+07,5.670968,6.443750,5.580645,8.125000,12.576419,13.711375
3,A.C. Peterson,2,"[[{'id': 18, 'name': 'Drama'}, {'id': 10749, '...",37,65,6570,1984-12-26,2002-12-22,2002-12-22,2021-11-25,...,1.196206e+07,1.057030e+07,9.527056e+06,1.315087e+07,5.648387,5.329412,15.516129,8.911765,11.268129,11.211382
4,A.J. Buckley,2,"[[{'id': 28, 'name': 'Action'}, {'id': 53, 'na...",20,30,3650,1998-07-24,2008-07-21,2008-07-21,2018-04-20,...,1.292717e+07,3.205204e+07,3.601965e+07,6.466198e+07,5.039130,5.457143,7.565217,4.428571,12.783000,13.308571
5,A.J. Cook,1,"[[{'id': 35, 'name': 'Comedy'}], [{'id': 35, '...",22,20,4015,1997-01-29,2008-01-27,2008-01-27,2019-04-09,...,6.796429e+06,4.245833e+06,1.299597e+07,3.799973e+06,4.992857,5.883333,4.857143,4.166667,20.006143,17.360167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8039,Zoe McLellan,1,"[[{'id': 18, 'name': 'Drama'}, {'id': 12, 'nam...",25,15,4380,1994-10-14,2006-10-11,2006-10-11,2019-11-22,...,1.080741e+07,3.888889e+06,1.981488e+07,8.492089e+06,5.655556,5.083333,11.666667,2.666667,12.229889,11.396500
8041,Zoe Saldaña,1,"[[{'id': 28, 'name': 'Action'}, {'id': 53, 'na...",23,62,4015,2000-05-12,2011-05-10,2011-05-10,2023-09-13,...,5.293969e+07,1.023756e+08,2.636940e+08,4.470251e+08,6.025926,6.994286,4.592593,8.600000,65.552185,65.861600
8043,Zooey Deschanel,1,"[[{'id': 35, 'name': 'Comedy'}, {'id': 10749, ...",24,47,4380,1999-09-24,2011-09-21,2011-09-21,2023-10-12,...,2.614053e+07,2.937707e+07,5.594943e+07,5.483799e+07,6.551429,6.166667,3.171429,8.083333,44.876086,36.110167
8044,Zosia Mamet,1,"[[{'id': 27, 'name': 'Horror'}], [{'id': 35, '...",29,26,5110,1994-08-04,2008-07-31,2008-07-31,2023-10-12,...,7.387500e+06,1.511413e+06,4.753250e+06,7.665798e+06,5.533333,6.182609,19.333333,5.173913,21.816000,22.699696


In [80]:
from datetime import datetime, timedelta

df = group3_df.copy()
# Convert release_date to a list of datetime objects
df['release_date'] = df['release_date'].apply(lambda x: [datetime.strptime(date, "%Y-%m-%d") for date in x])

# Calculate the time interval for each period
df['interval'] = (df['career_length'] // 3) * 365

# Initialize columns for date ranges
df['start_date_1'] = df.apply(lambda row: min(row['release_date']), axis=1)
df['end_date_1'] = df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
df['start_date_2'] = df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval']), axis=1)
df['end_date_2'] = df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval'] * 2), axis=1)
df['start_date_3'] = df.apply(lambda row: min(row['release_date']) + timedelta(days=row['interval'] * 2 ), axis=1)
df['end_date_3'] = df.apply(lambda row: max(row['release_date']), axis=1)

# Count the number of release dates within each time slot
df['count_interval_1'] = df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_1'], row['end_date_1'])]), axis=1)
df['count_interval_2'] = df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_2'], row['end_date_2'])]), axis=1)
df['count_interval_3'] = df.apply(lambda row: sum(start <= date <= end for date in row['release_date'] for start, end in [(row['start_date_3'], row['end_date_3'])]), axis=1)

# Assuming df['count_interval_1'] and df['count_interval_2'] are available
df['revenue'] = df['revenue'].apply(
    lambda x: [ np.mean(x[:df['count_interval_1'].iloc[0]]) if (val == 0 or pd.isna(val)) else val for val in x[:df['count_interval_1'].iloc[0]]] +
    [ np.mean(x[df['count_interval_1'].iloc[0]:df['count_interval_2'].iloc[0]]) if (val == 0 or pd.isna(val)) else val for val in x[df['count_interval_1'].iloc[0]:df['count_interval_2'].iloc[0]]] +
    [ np.mean(x[df['count_interval_2'].iloc[0]:]) if (val == 0 or pd.isna(val)) else val for val in x[df['count_interval_2'].iloc[0]:]]
    if isinstance(x, list) else x
)
df['budget'] = df['budget'].apply(
    lambda x: [ np.mean(x[:df['count_interval_1'].iloc[0]]) if (val == 0 or pd.isna(val)) else val for val in x[:df['count_interval_1'].iloc[0]]] +
    [ np.mean(x[df['count_interval_1'].iloc[0]:df['count_interval_2'].iloc[0]]) if (val == 0 or pd.isna(val)) else val for val in x[df['count_interval_1'].iloc[0]:df['count_interval_2'].iloc[0]] ] +
    [ np.mean(x[df['count_interval_2'].iloc[0]:]) if (val == 0 or pd.isna(val)) else val for val in x[df['count_interval_2'].iloc[0]:] ] 
    if isinstance(x, list) else x
)


# Calculate the average budget, revenue, IMDb rating, popularity and order for each period
df['avg_budget_0'] = df.apply(lambda row: np.mean([budget for budget, date in zip(row['budget'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_budget_1'] = df.apply(lambda row: np.mean([budget for budget, date in zip(row['budget'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_budget_2'] = df.apply(lambda row: np.mean([budget for budget, date in zip(row['budget'], row['release_date']) if row['start_date_3'] <= date <= row['end_date_3']]), axis=1)
df['avg_revenue_0'] = df.apply(lambda row: np.mean([revenue for revenue, date in zip(row['revenue'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_revenue_1'] = df.apply(lambda row: np.mean([revenue for revenue, date in zip(row['revenue'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_revenue_2'] = df.apply(lambda row: np.mean([revenue for revenue, date in zip(row['revenue'], row['release_date']) if row['start_date_3']<= date <= row['end_date_3']]), axis=1)
df['avg_imdb_rating_0'] = df.apply(lambda row: np.mean([imdb_rating for imdb_rating, date in zip(row['imdb_average_rating'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_imdb_rating_1'] = df.apply(lambda row: np.mean([imdb_rating for imdb_rating, date in zip(row['imdb_average_rating'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_imdb_rating_2'] = df.apply(lambda row: np.mean([imdb_rating for imdb_rating, date in zip(row['imdb_average_rating'], row['release_date']) if row['start_date_3']<= date <= row['end_date_3']]), axis=1)
df['avg_order_0'] = df.apply(lambda row: np.mean([order for order, date in zip(row['order'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_order_1'] = df.apply(lambda row: np.mean([order for order, date in zip(row['order'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_order_2'] = df.apply(lambda row: np.mean([order for order, date in zip(row['order'], row['release_date']) if row['start_date_3'] <= date <= row['end_date_3']]), axis=1)
df['avg_popularity_0'] = df.apply(lambda row: np.mean([popularity for popularity, date in zip(row['popularity'], row['release_date']) if row['start_date_1'] <= date <= row['end_date_1']]), axis=1)
df['avg_popularity_1'] = df.apply(lambda row: np.mean([popularity for popularity, date in zip(row['popularity'], row['release_date']) if row['start_date_2'] <= date <= row['end_date_2']]), axis=1)
df['avg_popularity_2'] = df.apply(lambda row: np.mean([popularity for popularity, date in zip(row['popularity'], row['release_date']) if row['start_date_3'] <= date <= row['end_date_3']]), axis=1)

# Drop unnecessary columns
df = df.drop(['release_date', 'budget', 'revenue', 'imdb_average_rating', 'order', 'popularity'], axis=1)
df = df.drop(['interval'], axis=1)

group3_divided_df = df.copy()
display(group3_divided_df)



Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide



Unnamed: 0,name,gender,genres,career_length,number_of_movies,start_date_1,end_date_1,start_date_2,end_date_2,start_date_3,...,avg_revenue_2,avg_imdb_rating_0,avg_imdb_rating_1,avg_imdb_rating_2,avg_order_0,avg_order_1,avg_order_2,avg_popularity_0,avg_popularity_1,avg_popularity_2
2,A Martinez,2,"[[{'id': 35, 'name': 'Comedy'}], [{'id': 18, '...",54,54,1968-10-01,1986-09-27,1986-09-27,2004-09-22,2004-09-22,...,4.824756e+06,5.781250,6.159091,5.340000,4.812500,4.818182,3.866667,12.934750,12.394864,12.983600
6,Aamir Khan,2,"[[{'id': 28, 'name': 'Action'}, {'id': 12, 'na...",49,61,1973-02-18,1989-02-14,1989-02-14,2005-02-10,2005-02-10,...,4.294828e+07,6.725000,6.322581,7.088462,3.250000,0.548387,3.730769,15.212750,18.546516,20.704615
31,Abby Dalton,1,"[[{'id': 12, 'name': 'Adventure'}], [{'id': 80...",51,15,1957-04-01,1974-03-28,1974-03-28,1991-03-24,1991-03-24,...,0.000000e+00,5.620000,5.850000,4.200000,6.900000,5.000000,3.333333,8.622700,6.832000,13.248000
35,Abe Vigoda,2,"[[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'na...",43,44,1965-11-10,1979-11-07,1979-11-07,1993-11-03,1993-11-03,...,1.449454e+07,6.778571,5.538462,5.176471,12.000000,20.769231,8.294118,15.844286,13.599154,14.722000
42,Adam Arkin,2,"[[{'id': 35, 'name': 'Comedy'}], [{'id': 10751...",52,48,1969-01-01,1985-12-28,1985-12-28,2002-12-24,2002-12-24,...,3.187292e+07,5.620000,6.213043,6.440000,7.700000,6.260870,5.533333,14.309300,17.324913,17.235933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7955,Yorgo Voyagis,2,"[[{'id': 18, 'name': 'Drama'}], [{'id': 53, 'n...",57,38,1964-12-14,1983-12-10,1983-12-10,2002-12-05,2002-12-05,...,2.431839e+05,6.106250,5.626667,5.657143,7.312500,5.800000,12.857143,11.553000,11.282600,10.125571
7964,Yuen Wah,2,"[[{'id': 28, 'name': 'Action'}, {'id': 12, 'na...",61,152,1962-12-05,1982-11-30,1982-11-30,2002-11-25,2002-11-25,...,2.498144e+07,6.193548,6.115385,5.497368,14.435484,7.384615,5.973684,13.755565,14.107212,13.824447
7980,Yuko Tanaka,1,"[[{'id': 18, 'name': 'Drama'}], [{'id': 18, 'n...",42,36,1981-01-08,1995-01-05,1995-01-05,2009-01-01,2009-01-01,...,4.493499e+06,6.980000,6.850000,6.585714,2.900000,3.666667,3.714286,19.200000,16.964500,18.901571
8046,Zoë Wanamaker,1,"[[{'id': 18, 'name': 'Drama'}], [], [{'id': 18...",46,36,1973-04-19,1988-04-15,1988-04-15,2003-04-12,2003-04-12,...,2.618603e+07,7.250000,7.266667,6.918750,9.500000,5.416667,5.562500,21.906875,21.665500,22.918563


In [54]:
# save the dataframes into csv files 
group1_divided_df.to_csv('../Data/preprocessed_data/group1_divided_df.csv', index=True)
group2_divided_df.to_csv('../Data/preprocessed_data/group2_divided_df.csv', index=True)
group3_divided_df.to_csv('../Data/preprocessed_data/group3_divided_df.csv', index=True)


In [25]:
import pandas as pd
full_actors_filtered_df = pd.read_csv('../Data/preprocessed_data/full_actors_filtered_db.csv')
#display(full_actors_filtered_df)

# get the id of the actor named '50 Cent'
group1_df = pd.read_csv('../Data/preprocessed_data/group1_divided_df.csv')
group2_df = pd.read_csv('../Data/preprocessed_data/group2_divided_df.csv')
group3_df = pd.read_csv('../Data/preprocessed_data/group3_divided_df.csv')
#display(group1_df)

In [12]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Use name of actor
def plot_movies_ratings_2(actor_id, start_date, end_date):
    actor_movies_sorted = full_actors_filtered_df[full_actors_filtered_df['actor_id'] == actor_id].sort_values(by='release_date')
    actor_name = actor_movies_sorted['name'].values[0]
    # get dates of separation
    date_middle = group2_df[group2_df['name'] == actor_name]['start_date_2'].values[0]
    
    fig = px.scatter(actor_movies_sorted, x='release_date', y='imdb_average_rating',
                     hover_data=['original_title' , 'order'],
                     labels={'imdb_average_rating': 'IMDb Rating', 'release_date': 'Release Year', 'original_title': 'Movie name', 'order': 'Order'},
                     title=f'IMDb Ratings and Genres of Movies for Actor {actor_name}')
    
    rgba_light_yellow = (255, 255, 100, 0.5)  # light yellow
    rgba_light_orange = (255, 165, 0, 0.5)    # light orange
    rgba_light_red = (240, 128, 128, 0.5)      # light red

    fig.add_shape(
        go.layout.Shape(
            type="rect",
            x0=actor_movies_sorted['release_date'].min(),
            x1=date_middle,
            y0=0,
            y1=11,
            fillcolor=f"rgba{rgba_light_yellow}",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    )

    fig.add_shape(
        go.layout.Shape(
            type="rect",
            x0=date_middle,
            x1=actor_movies_sorted['release_date'].max(),
            y0=0,
            y1=11,
            fillcolor=f"rgba{rgba_light_orange}",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    )


    # Set specific colors for the 'order' column
    fig.update_traces(marker=dict(color=actor_movies_sorted['order'].map({
        0: 'red', 1: 'red',
        2: 'yellowgreen', 3: 'yellowgreen', 4: 'yellowgreen', 5: 'yellowgreen',
        # Add more mappings for other orders as needed
    })))

    fig.update_layout(
    xaxis=dict(
        range=[start_date, end_date],
    ))

    fig.show()


In [13]:
# Use name of actor
def plot_movies_ratings_1(actor_id, start_date, end_date):
    actor_movies_sorted = full_actors_filtered_df[full_actors_filtered_df['actor_id'] == actor_id].sort_values(by='release_date')
    actor_name = actor_movies_sorted['name'].values[0]
    # get dates of separation
    
    fig = px.scatter(actor_movies_sorted, x='release_date', y='imdb_average_rating',
                     hover_data=['original_title' , 'order'],
                     labels={'imdb_average_rating': 'IMDb Rating', 'release_date': 'Release Year', 'original_title': 'Movie name', 'order': 'Order'},
                     title=f'IMDb Ratings and Genres of Movies for Actor {actor_name}')
    
    rgba_light_yellow = (255, 255, 100, 0.5)  # light yellow
    rgba_light_orange = (255, 165, 0, 0.5)    # light orange
    rgba_light_red = (240, 128, 128, 0.5)      # light red

    fig.add_shape(
        go.layout.Shape(
            type="rect",
            x0=actor_movies_sorted['release_date'].min(),
            x1=actor_movies_sorted['release_date'].max(),
            y0=0,
            y1=11,
            fillcolor=f"rgba{rgba_light_yellow}",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    )

    # Set specific colors for the 'order' column
    fig.update_traces(marker=dict(color=actor_movies_sorted['order'].map({
        0: 'red', 1: 'red',
        2: 'yellowgreen', 3: 'yellowgreen', 4: 'yellowgreen', 5: 'yellowgreen',
        # Add more mappings for other orders as needed
    })))

    fig.update_layout(
    xaxis=dict(
        range=[start_date, end_date],)
    )

    fig.show()


In [30]:
def plot_movies_ratings_3(actor_id, start_date, end_date):
    actor_movies_sorted = full_actors_filtered_df[full_actors_filtered_df['actor_id'] == actor_id].sort_values(by='release_date')
    actor_name = actor_movies_sorted['name'].values[0]
    # get dates of separation
    date_middle_1 = group3_df[group3_df['name'] == actor_name]['start_date_2'].values[0]
    date_middle_2 = group3_df[group3_df['name'] == actor_name]['start_date_3'].values[0]
    
    fig = px.scatter(actor_movies_sorted, x='release_date', y='imdb_average_rating',
                     hover_data=['original_title' , 'order'],
                     labels={'imdb_average_rating': 'IMDb Rating', 'release_date': 'Release Year', 'original_title': 'Movie name', 'order': 'Order'},
                     title=f'IMDb Ratings and Genres of Movies for Actor {actor_name}')
    
    rgba_light_yellow = (255, 255, 100, 0.5)  # light yellow
    rgba_light_orange = (255, 165, 0, 0.5)    # light orange
    rgba_light_red = (240, 128, 128, 0.5)      # light red

    fig.add_shape(
        go.layout.Shape(
            type="rect",
            x0=actor_movies_sorted['release_date'].min(),
            x1=date_middle_1,
            y0=0,
            y1=11,
            fillcolor=f"rgba{rgba_light_yellow}",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    )

    fig.add_shape(
        go.layout.Shape(
            type="rect",
            x0=date_middle_1,
            x1=date_middle_2,
            y0=0,
            y1=11,
            fillcolor=f"rgba{rgba_light_orange}",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    )

    fig.add_shape(
        go.layout.Shape(
            type="rect",
            x0=date_middle_2,
            x1=actor_movies_sorted['release_date'].max(),
            y0=0,
            y1=11,
            fillcolor=f"rgba{rgba_light_red}",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    )


    # Set specific colors for the 'order' column
    fig.update_traces(marker=dict(color=actor_movies_sorted['order'].map({
        0: 'red', 1: 'red',
        2: 'yellowgreen', 3: 'yellowgreen', 4: 'yellowgreen', 5: 'yellowgreen',
    })))

    fig.update_layout(
    xaxis=dict(
        range=[start_date, end_date],
    ))
    

    fig.show()


In [35]:
from datetime import datetime, timedelta
id_1 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'Zoe Lister-Jones']['actor_id'].iloc[0]
id_2 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'A.C. Peterson']['actor_id'].iloc[0]
id_3 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'Jackie Chan']['actor_id'].iloc[0]

first_release_date_1 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'Zoe Lister-Jones']['release_date'].min()
first_release_date_2 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'A.C. Peterson']['release_date'].min()
first_release_date_3 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'Jackie Chan']['release_date'].min()
# get the earlier release date out of the 3
first_release_date = min(first_release_date_1, first_release_date_2, first_release_date_3)

# get min release date
last_release_date_1 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'Zoe Lister-Jones']['release_date'].max()
last_release_date_2 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'A.C. Peterson']['release_date'].max()
last_release_date_3 = full_actors_filtered_df[full_actors_filtered_df['name'] == 'Jackie Chan']['release_date'].max()
# get the later release date out of the 3
last_release_date = max(last_release_date_1, last_release_date_2, last_release_date_3)

# ---------------------
start_date = datetime.strptime(first_release_date, '%Y-%m-%d')
end_date = datetime.strptime(last_release_date, '%Y-%m-%d')

start_date -= timedelta(days=365)
end_date += timedelta(days=365)

# Convert back to string format
first_release_date = start_date.strftime('%Y-%m-%d')
last_release_date = end_date.strftime('%Y-%m-%d')

#---------------------

plot_movies_ratings_1(id_1, first_release_date, last_release_date)
plot_movies_ratings_2(id_2, first_release_date, last_release_date)
plot_movies_ratings_3(id_3, first_release_date, last_release_date)