In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from wikipediaapi import Wikipedia
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Read the per-actor score table and the per-movie table from the local data/ folder.
actors = pd.read_csv('data/actor_with_scoreindex.csv')
movies = pd.read_csv('data/all_movies_with_success_index.csv')

# Preview the actor table as this cell's output.
actors

Unnamed: 0,Actor name,Cumulative Score,Actor Score Index
0,'squeeks'_the_caterpillar,1.779819,4.116636
1,40_glocc,1.393875,2.817460
2,50_cent,1.776826,4.106561
3,a._delon_ellis_jr.,1.676044,3.767307
4,a._j._benza,1.821579,4.257209
...,...,...,...
20688,éva_igó,1.484994,3.124189
20689,íñigo_garcés,1.622983,3.588689
20690,óscar_jaenada,1.648598,3.674915
20691,željko_ivanek,2.118027,5.255119


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the actor score index.
actors.loc[:, 'Actor Score Index'].describe()

count    20693.000000
mean         3.884397
std          0.729746
min          0.000000
25%          3.520722
50%          3.890783
75%          4.266474
max         10.000000
Name: Actor Score Index, dtype: float64

In [11]:
# Rank actors from the highest to the lowest score index. The name is rebound
# to the sorted copy instead of sorting with inplace=True, so the frame object
# displayed by earlier cells is not mutated behind the reader's back.
actors = actors.sort_values(by='Actor Score Index', ascending=False)


def top_worst_actors(method, percentage=None, number=None):
    """Return the first and last rows of the module-level ``actors`` frame.

    The split is purely positional: ``actors`` must already be sorted from
    best to worst for the two slices to be the "top" and "worst" actors.

    Parameters
    ----------
    method : str
        ``'percentage'`` to size each group as a fraction of ``len(actors)``,
        or ``'number'`` to size each group as an absolute row count.
    percentage : float, optional
        Fraction of the frame per group (e.g. ``0.25``); the resulting row
        count is truncated with ``int()``. Required for ``'percentage'``.
    number : int, optional
        Rows per group. Required for ``'number'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(top_actors, worst_actors)``.

    Raises
    ------
    TypeError
        If the argument required by ``method`` was not supplied.
    ValueError
        If ``method`` is neither ``'percentage'`` nor ``'number'``.
    """
    if method == 'percentage':
        if percentage is None:
            raise TypeError("percentage must be provided when method='percentage'")
        group_size = int(percentage * len(actors))
    elif method == 'number':
        if number is None:
            raise TypeError("number must be provided when method='number'")
        group_size = number
    else:
        # Previously fell through and returned None, which only surfaced later
        # as an unpacking error at the call site.
        raise ValueError(
            f"method must be 'percentage' or 'number', got {method!r}"
        )

    # head(n)/tail(n) behave like iloc[:n] / iloc[-n:] except for n == 0:
    # iloc[-0:] is iloc[0:] and would return the WHOLE frame as the "worst"
    # group, whereas tail(0) is correctly empty.
    return actors.head(group_size), actors.tail(group_size)

# Compare the first third of the ranking with the last third.
third_of_actors = len(actors) // 3
top_actors, worst_actors = top_worst_actors('number', number=third_of_actors)

top_actors

Unnamed: 0,Actor name,Cumulative Score,Actor Score Index
4946,denzel_washington,3.527584,10.000000
13290,matt_damon,3.435967,9.691597
19440,tom_hanks,3.274563,9.148275
6445,frank_welker,3.253477,9.077296
5535,eddie_murphy,3.205583,8.916074
...,...,...,...
20481,xavier_massimi,1.780499,4.118925
569,alicia_jaziz_zapien,1.780499,4.118925
10502,justin_sundquist,1.780499,4.118925
5141,dolores_heredia,1.780499,4.118925


In [12]:
fig = go.Figure()

# Histogram of the score index over every actor in the table (not a subset),
# so the trace is labelled accordingly.
fig.add_trace(go.Histogram(
    x=actors['Actor Score Index'],
    nbinsx=500,
    name='All Actors',
    opacity=0.5,
    marker_color='pink'
))

fig.update_layout(
    title='Distribution of actor success index',
    xaxis_title='Actor Score Index',
    yaxis_title='Frequency',
    barmode='overlay',
    legend=dict(x=0.8, y=1),
    template='plotly_white'
)

# Show plot
fig.show()

In [13]:
# Plot the score-index distributions of the top and worst actor groups
# (top_actors / worst_actors as returned by top_worst_actors).

fig = go.Figure()

# Add histogram for top actors
fig.add_trace(go.Histogram(
    x=top_actors['Actor Score Index'],
    nbinsx=100,
    name='Top Actors',
    opacity=0.5,
    marker_color='green'
))

# Add histogram for worst actors
fig.add_trace(go.Histogram(
    x=worst_actors['Actor Score Index'],
    nbinsx=100,
    name='Worst Actors',
    opacity=0.5,
    marker_color='red'
))

# 'overlay' draws both semi-transparent histograms on the same axes.
fig.update_layout(
    title='Distribution of actor success index: top vs. worst actors',
    xaxis_title='Actor Score Index',
    yaxis_title='Frequency',
    barmode='overlay',
    legend=dict(x=0.8, y=1),
    template='plotly_white'
)

# Show plot
fig.show()


In [44]:

# Movies featuring at least one top-group actor, one row per movie title.
top_actors_names = top_actors['Actor name'].values.tolist()
top_movies = (
    movies[movies['Actor name'].isin(top_actors_names)]
    .drop_duplicates(subset='Movie name')
)

# Movies featuring at least one worst-group actor, one row per movie title.
worst_actors_names = worst_actors['Actor name'].values.tolist()
worst_movies = (
    movies[movies['Actor name'].isin(worst_actors_names)]
    .drop_duplicates(subset='Movie name')
)

# Titles that appear in both groups say nothing about either one, so they are
# removed from both sides before comparing production companies.
common_movies = top_movies[top_movies['Movie name'].isin(worst_movies['Movie name'])]

print(f'top movies before removing common movies: {len(top_movies)}')
top_movies = top_movies[~top_movies['Movie name'].isin(common_movies['Movie name'])]
print(f'top movies after removing common movies: {len(top_movies)}')

print(f'worst movies before removing common movies: {len(worst_movies)}')
worst_movies = worst_movies[~worst_movies['Movie name'].isin(common_movies['Movie name'])]
print(f'worst movies after removing common movies: {len(worst_movies)}')

# Count companies only AFTER the shared titles are gone. Previously the counts
# were taken before the removal, so the filtering above never reached the pie
# charts and both pies were built from largely the same movies.
top_companies_best = top_movies['Movie company'].value_counts().head(15)
top_companies_worst = worst_movies['Movie company'].value_counts().head(15)

# Two side-by-side donut charts; 'domain' subplots are required for pie traces.
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]],
                    subplot_titles=['Top 15 Best Actors Movie Companies', 'Top 15 Worst Actors Movie Companies'])

# Add the pie chart for best actors
fig.add_trace(go.Pie(
    labels=top_companies_best.index,
    values=top_companies_best.values,
    hole=0.3,
    name='Best Actors'
), row=1, col=1)

# Add the pie chart for worst actors
fig.add_trace(go.Pie(
    labels=top_companies_worst.index,
    values=top_companies_worst.values,
    hole=0.3,
    name='Worst Actors'
), row=1, col=2)

# Update layout
fig.update_layout(
    title_text='Comparison of Movie Companies for Best and Worst Actors',
    template='plotly_white'
)

fig.show()

top movies before removing common movies: 3408
top movies after removing common movies: 333
worst movies before removing common movies: 3168
worst movies after removing common movies: 93


## Scraping nationality


In [None]:
# actors_to_scrap = pd.read_csv('data/actor_unique.csv')


# USER_AGENT = "ADAProject (example@epfl.ch)"
# # Initialize Wikipedia API
# wiki_wiki = Wikipedia(
#     language='en', 
#     user_agent=USER_AGENT,
#     headers={'User-Agent': USER_AGENT}
# )



In [None]:
# def fetch_actor_summary(actor_name):
#     try:
#         page = wiki_wiki.page(actor_name)
#         if page.exists():
#             return page.summary  # Fetch the full summary
#     except Exception as e:
#         print(f"Error fetching summary for {actor_name}: {e}")
#     return None


# import re
# def extract_nationality_after_born(sentence):
#     # Handle None input
#     if sentence is None:
#         return None

#     # Locate the first occurrence of "born"
#     born_index = sentence.lower().find("born")
#     if born_index == -1:
#         return None  # Return None if "born" is not found

#     # Extract the 300 characters after "born" for more context
#     text_after_born = sentence[born_index:born_index + 300]

#     # Regex pattern to find "[nationality] actor" or similar
#     match = re.search(r"\b([A-Za-z]+)\s(?:actor|actress|director|music producer|singer|rapper|film|journalist|gossip columnist|playwright|film director|acting|multi-award winning|radio personality|comedian|radio personality|professional wrestler|comedian|voice actor|former actor|screenwriter|artist|television personality|music executive)", text_after_born, re.IGNORECASE)
#     if match:
#         return match.group(1).capitalize()  # Return the nationality, capitalized

#     # Fallback: Look for standalone nationality (e.g., "American") near "actor"
#     fallback_match = re.search(r"\b(American|English|Australian|Canadian|British|French|Italian|German|Spanish)\b", text_after_born, re.IGNORECASE)
#     if fallback_match:
#         return fallback_match.group(1).capitalize()

#     # Final fallback: Use the last word before "actor" if no other nationality is found
#     pre_actor_match = re.search(r"([A-Za-z]+)\sactor", text_after_born, re.IGNORECASE)
#     if pre_actor_match:
#         return pre_actor_match.group(1).capitalize()

#     return None  # Return None if no match is found


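# # Update the processing loop to use the improved extraction
# # NOTE(review): iterating a DataFrame directly yields its column labels, which is
# # why the saved output below looks up "Actor name", "Cumulative Score" and
# # "Actor Score Index" instead of real actors. Loop over the actor-name column
# # (e.g. actors['Actor name']) before re-enabling this cell.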
# list_nationality = []
# not_found = 0 

# for actor in actors:
#     nationality_info = fetch_actor_summary(actor)  # Fetch the summary
#     if nationality_info:
#         nationality = extract_nationality_after_born(nationality_info)
#         if nationality: 
#             list_nationality.append(nationality)
#             print(f"Actor: {actor}\nExtracted Nationality: {nationality}\n")
#         else:
#             print(f"Actor: {actor}\nNationality Info: {nationality_info[:200]}\nExtracted Nationality: None\n")
#     else:
#         not_found += 1
#         print(f"Actor: {actor}\nNationality Info: Not found\nExtracted Nationality: Not found\n")

# print(f"Number of actors not found: {not_found}")

Actor: Actor name
Nationality Info: Not found
Extracted Nationality: Not found

Actor: Cumulative Score
Nationality Info: Not found
Extracted Nationality: Not found

Actor: Actor Score Index
Nationality Info: Not found
Extracted Nationality: Not found

Number of actors not found: 3
