# Creating the starpower metric for each film

In [20]:
import pandas as pd
import numpy as np

# Load the movie table written by an earlier step of the pipeline
# (file name suggests it already carries the sentiment / female-cast features).
movies_df = pd.read_csv(
    filepath_or_buffer="../data/output_data/clean_movies_sent_fem.csv",
)

In [12]:
# Read the three name files: A-list actors, A-list actresses and the B-list.
a_list_actors_df = pd.read_csv("../data/input_data/a_list_actors.csv")
a_list_actresses_df = pd.read_csv("../data/input_data/a_list_actresses.csv")
b_list_df = pd.read_csv("../data/input_data/b_list.csv")

# Reduce each file to a fresh one-column frame holding only the 'Name' values.
a_list_actors = pd.DataFrame(
    a_list_actors_df['Name'].to_list(), columns=['Name']
)
a_list_actresses = pd.DataFrame(
    a_list_actresses_df['Name'].to_list(), columns=['Name']
)
b_list = pd.DataFrame(
    b_list_df['Name'].to_list(), columns=['Name']
)

# Stack actors on top of actresses; ignore_index renumbers the rows 0..n-1.
a_list = pd.concat(
    [a_list_actors, a_list_actresses],
    ignore_index=True,
)

# Quick look at the first rows of each list.
print(a_list.head(5))
print(b_list.head(5))

# Row counts of the two tiers.
num_a_list = a_list.shape[0]
num_b_list = b_list.shape[0]

print(f'Number of A-listers: {num_a_list}')
print(f'Number of b-listers: {num_b_list}')


              Name
0        Al Pacino
1  Tommy Lee Jones
2    Harrison Ford
3   Robert De Niro
4    Kevin Costner
               Name
0      Ansel Elgort
1       Wes Bentley
2  Chiwetel Ejiofor
3        Karl Urban
4   Britt Robertson
Number of A-listers: 200
Number of b-listers: 376

In [27]:
# Creating the variable to calculate the starpower metric
#
# Each of the first N_LEADS names in the comma-separated 'Actors' string earns
# A_LIST_POINTS if the name is on the A-list and B_LIST_POINTS if it is on the
# B-list. The per-film total is divided by N_LEADS, so a film that lists fewer
# than N_LEADS actors is still averaged over N_LEADS slots (empty slots score 0).
#
# NOTE(review): a name present on BOTH lists earns A_LIST_POINTS + B_LIST_POINTS,
# exactly as before -- confirm the two lists are disjoint or that this is intended.

N_LEADS = 3        # number of billed actors scored per film
A_LIST_POINTS = 2  # points for an A-list name
B_LIST_POINTS = 1  # points for a B-list name

# This cell drops 'Actors' at the end, so running it twice on the same
# movies_df cannot work; fail with an actionable message instead of a bare key.
if 'Actors' not in movies_df.columns:
    raise KeyError(
        "'Actors' column not found in movies_df; this cell removes it when it "
        "finishes, so re-run the cell that loads movies_df before re-running "
        "this one."
    )

# Lookup names: drop missing entries (isin() would otherwise match a missing
# actor slot against a blank row in the list) and trim stray whitespace so the
# lists are compared on the same footing as the stripped actor names below.
a_names = a_list['Name'].dropna().astype(str).str.strip().unique()
b_names = b_list['Name'].dropna().astype(str).str.strip().unique()

# splitting the 'Actors' string so each billed actor gets a separate column
if movies_df['Actors'].notna().any():
    actors_split = movies_df['Actors'].str.split(',', expand=True)
else:
    print("The 'Actors' column is empty or missing.")
    # No names at all: keep an empty frame so every slot below is filled with
    # None and every film scores 0 instead of the cell crashing.
    actors_split = pd.DataFrame(index=movies_df.index)

# expand=True only creates as many columns as the longest split, so a slot may
# not exist; create it empty rather than indexing a missing column.
lead_cols = [f'actor{slot + 1}' for slot in range(N_LEADS)]
for slot, col in enumerate(lead_cols):
    if slot in actors_split.columns:
        movies_df[col] = actors_split[slot].str.strip()
    else:
        movies_df[col] = None

# creating the metric column: sum the tier points over the lead slots
starpower_total = pd.Series(0, index=movies_df.index)
for col in lead_cols:
    starpower_total = (
        starpower_total
        + A_LIST_POINTS * movies_df[col].isin(a_names)
        + B_LIST_POINTS * movies_df[col].isin(b_names)
    )

# making it an average value
movies_df['starpower'] = starpower_total / N_LEADS

# checking the new variable
print(movies_df[['actor1', 'actor2', 'actor3', 'starpower']].head(10))

# dropping the actor columns
movies_df = movies_df.drop(columns=["actor1", "actor2", "actor3", "Actors"])

           actor1             actor2           actor3  starpower
0   Mark Wahlberg      Tyrese Gibson       André 3000   0.666667
1      Jamie Bell        Andy Serkis     Daniel Craig   1.333333
2   Ryan Reynolds       Blake Lively  Peter Sarsgaard   0.333333
3     Marc Singer      Tanya Roberts         Rip Torn   0.000000
4  Tom Hiddleston  Samuel L. Jackson      Brie Larson   1.000000
5   Jeremy Renner           Ed Helms     Jake Johnson   0.000000
6   Frankie Muniz       Amanda Bynes    Paul Giamatti   1.000000
7      Ben Barnes     Skandar Keynes   Georgie Henley   0.000000
8   Jason Bateman        Charlie Day   Jason Sudeikis   1.000000
9      Jack Black  Ana de la Reguera   Héctor Jiménez   0.333333

In [28]:
# Write the table to CSV; index=False keeps the row index out of the file.
movies_df.to_csv(
    path_or_buf="../data/output_data/clean_movies_sent_fem_star.csv",
    index=False,
)