# Performing Sentiment Analysis on Description, Tagline, and Overview

In [9]:
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis", 
    model="distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True
    )

Device set to use cpu

In [10]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv("../data/output_data/clean_movies.csv")

#Description Sentiment Analysis
movies_df["Descr_Sentiment"] = movies_df["Description"].apply(lambda x: classifier(x)[0]["label"])

In [11]:
# Overview Sentiment Analysis
movies_df["Tagline_Sentiment"] = movies_df["Tagline"].apply(lambda x: classifier(x)[0]["label"])

In [13]:
# converting the neg/pos to label 0/1
movies_df["Descr_Sentiment"] = movies_df["Descr_Sentiment"].replace({"NEGATIVE": 0, "POSITIVE": 1})
movies_df["Tagline_Sentiment"] = movies_df["Tagline_Sentiment"].replace({"NEGATIVE": 0, "POSITIVE": 1})

  movies_df["Descr_Sentiment"] = movies_df["Descr_Sentiment"].replace({"NEGATIVE": 0, "POSITIVE": 1})
  movies_df["Tagline_Sentiment"] = movies_df["Tagline_Sentiment"].replace({"NEGATIVE": 0, "POSITIVE": 1})

In [14]:
# dropping the original columns
movies_df = movies_df.drop(columns=["Tagline", "Description"])

In [15]:
movies_df.to_csv("../data/output_data/clean_movies_sent.csv", index=False)

## Creating the Starpower Variable

In [None]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv("../data/output_data/clean_movies_sent_fem.csv")

In [None]:
# Loading in the list of A-list and B-list actors/actresses 
a_list_actors_df = pd.read_csv("../data/input_data/a_list_actors.csv")
a_list_actresses_df = pd.read_csv("../data/input_data/a_list_actresses.csv")
b_list_df = pd.read_csv("../data/input_data/b_list.csv")

# taking just the names from the dataframes
a_list_actors = a_list_actors_df['Name'].tolist()
a_list_actors = pd.DataFrame(a_list_actors, columns=['Name'])

a_list_actresses = a_list_actresses_df['Name'].tolist()
a_list_actresses = pd.DataFrame(a_list_actresses, columns=['Name'])

b_list = b_list_df['Name'].tolist()
b_list = pd.DataFrame(b_list, columns=['Name'])

# combining the A-list actors and actresses into one dataframe
a_list = pd.concat([a_list_actors, a_list_actresses], ignore_index=True)

print(a_list.head(5))
print(b_list.head(5))

num_a_list = len(a_list)
num_b_list = len(b_list)

print(f'Number of A-listers: {num_a_list}')
print(f'Number of b-listers: {num_b_list}')

In [None]:
# Creating the variable to calculate the starpower metric

# splitting to get all four actors/actresses into separate columns
if movies_df['Actors'].notna().any():
    actors_split = movies_df['Actors'].str.split(',', expand=True)
    movies_df['actor1'] = actors_split[0].str.strip()
    movies_df['actor2'] = actors_split[1].str.strip()
    movies_df['actor3'] = actors_split[2].str.strip()
else:
    print("The 'Actors' column is empty or missing.")

# creating the metric column
movies_df['starpower'] = 0

# Check if the first actor is in the a-list
movies_df.loc[movies_df['actor1'].isin(a_list['Name'].values), 'starpower'] += 2
movies_df.loc[movies_df['actor2'].isin(a_list['Name'].values), 'starpower'] += 2
movies_df.loc[movies_df['actor3'].isin(a_list['Name'].values), 'starpower'] += 2

movies_df.loc[movies_df['actor1'].isin(b_list['Name'].values), 'starpower'] += 1
movies_df.loc[movies_df['actor2'].isin(b_list['Name'].values), 'starpower'] += 1
movies_df.loc[movies_df['actor3'].isin(b_list['Name'].values), 'starpower'] += 1

# making it an average value
movies_df['starpower'] = movies_df['starpower'] / 3

# checking the new variable
print(movies_df[['actor1', 'actor2', 'actor3', 'starpower']].head(10))

# dropping the actor columns
movies_df = movies_df.drop(columns=["actor1", "actor2", "actor3", "Actors"])

In [None]:
movies_df.to_csv("../data/output_data/clean_movies_sent_fem_star.csv", index=False)

## Creating the Female Actress Lead Variable

In [None]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv("../data/output_data/clean_movies_sent.csv")

In [None]:
# checking how accurate the first listed actor/actress is in terms of being the star or main character
first_actors = movies_df['Actors'].str.split(',').str[0].str.strip()
print(first_actors.head(20))

print(movies_df['Title'].head(20))

In [None]:
# getting the female actresses names from excel file

# opening the file
female_df = pd.read_excel("../data/input_data/female_actresses.xlsx")
female_df = female_df.rename(columns={'A': 'Name'})

# splitting to get the names only
female_df['Name'] = female_df['Name'].str.split(' born').str[0].str.split(r'\d').str[0].str.strip()

# deleting the rows that are just the letter headers
female_df = female_df[~female_df['Name'].isin(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))]

num_actresses = len(female_df)
print(f'There are {num_actresses} actresses in the list')

In [None]:
# Creating the variable to indicate if first actor is female
first_actors = movies_df['Actors'].str.split(',').str[0].str.strip()
movies_df['First_Actor'] = first_actors

# Check if the first actor is in the female name list
movies_df['Female_Lead'] = movies_df['First_Actor'].apply(lambda x: int(any(name in x for name in female_df['Name'])))

# checking the new variable
print(movies_df[['Title', 'First_Actor', 'Female_Lead']].head(10))
print(movies_df[['Title', 'First_Actor', 'Female_Lead']].tail(10))

# dropping the first_Actor column
movies_df = movies_df.drop(columns=["First_Actor"])

In [None]:
# Count of the split
female_lead_count = movies_df['Female_Lead'].value_counts()
print(female_lead_count)

In [None]:
movies_df.to_csv("../data/output_data/clean_movies_sent_fem.csv", index=False)