# Project P3 - Feminism in movies

### Import the data

In [34]:
from helpers import *
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Root folder holding the datasets, relative to the notebook's working directory.
# A relative path keeps this cell consistent with the other reads in this
# notebook ('Data/imdb_movies.csv', ...) and avoids a per-user absolute path.
# NOTE(review): assumes the notebook is launched from the repository root, so
# that 'Data/' is the same folder the former absolute path pointed to - confirm.
# Keep the trailing slash: paths are built as file_source + 'MovieSummaries/...'.
file_source = 'Data/'

#### Movie metadata #### DF
# Tab-separated file read without a header row; column names are assigned afterwards.
MovieMetadata_df = pd.read_csv(file_source + 'MovieSummaries/movie.metadata.tsv', sep='\t', header=None)

#### Character metadata #### DF
# Same layout: tab-separated, no header row.
CharacterMetadata_df = pd.read_csv(file_source + 'MovieSummaries/character.metadata.tsv', sep='\t', header=None)

# Column names for the movie metadata table. They are assigned positionally,
# so the order below must follow the column order of the file.
MovieMetadata_df_headers = [
    'Wikipedia movie ID', 'Freebase movie ID',
    'Movie name', 'Movie release date',
    'Movie box office revenue', 'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
]
MovieMetadata_df.columns = MovieMetadata_df_headers

# Column names for the character metadata table. They are assigned
# positionally, so the order below must follow the column order of the file.
CharacterMetadata_df_headers = [
    # movie-level fields
    'Wikipedia movie ID', 'Freebase movie ID', 'Movie release date',
    # character / actor description
    'Character name', 'Actor date of birth', 'Actor gender',
    'Actor height (in meters)', 'Actor ethnicity (Freebase ID)',
    'Actor name', 'Actor age at movie release',
    # Freebase identifiers
    'Freebase character/actor map ID', 'Freebase character ID',
    'Freebase actor ID',
]
CharacterMetadata_df.columns = CharacterMetadata_df_headers

# Text data: each file is tab-separated and read without a header row,
# so every table gets its column names right after being loaded.

# Character name clusters
names_df = pd.read_csv(file_source+'MovieSummaries/name.clusters.txt', sep="\t", header=None)
names_df_headers = ['Character Names', 'Instances']
names_df.columns = names_df_headers

# Plot summaries, keyed by Wikipedia movie ID
plot_summaries_df = pd.read_csv(file_source+'MovieSummaries/plot_summaries.txt', sep="\t", header=None)
plot_summaries_df_headers = ['Wikipedia movie ID', 'Summaries']
plot_summaries_df.columns = plot_summaries_df_headers

# TV Tropes character-type clusters
tvTropes_df = pd.read_csv(file_source+'MovieSummaries/tvtropes.clusters.txt', sep="\t", header=None)
tvTropes_df_headers = ['Character Types', 'Instances']
tvTropes_df.columns = tvTropes_df_headers

# Create a dataframe with the bechdel data as well as plots.
# The path uses '/' as separator: it works on Windows, macOS and Linux, whereas
# a backslash is Windows-only and '\B' is not a valid string escape sequence.
bechdel_data2 = pd.read_csv('Data/Bechdel_IMDB_Merge0524.csv')

# Drop the movies whose plot summary is missing (NaN).
plot_summaries_df = plot_summaries_df.dropna(subset=['Summaries'])

# IMDB movies table.
imdb_df = pd.read_csv('Data/imdb_movies.csv')

# Last expression of the cell: preview of the movie metadata table.
MovieMetadata_df.head()

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


### Train an SVM on a made-up metric

**Compute the percentage of female actors in each movie (in the IMDB dataset)**

In [33]:
import pandas as pd
import gender_guesser.detector as gender

# Load the dataset
file_path = 'Data/imdb_movies.csv'  # Replace with the actual path to your file
imdb_movies = pd.read_csv(file_path)

# Detector from gender_guesser; it is queried below with the actors' first names
detector = gender.Detector()

# Function to extract only actor names (every other item in the list, starting with the first)
def extract_actor_names(crew_string):
    """Return the actor names contained in a comma-separated crew string.

    The fields at even positions (0, 2, 4, ...) are taken as actor names.
    NOTE(review): the fields in between are presumably character names -
    confirm against the dataset.

    Parameters
    ----------
    crew_string : str or missing value
        Raw crew value; NaN/None is treated as "no crew listed".

    Returns
    -------
    list of str
        Whitespace-stripped, non-empty actor names in order of appearance.
        Empty list when the value is missing.
    """
    if pd.isna(crew_string):
        return []
    # Split on commas and strip, but keep blank fields for now.
    fields = [field.strip() for field in crew_string.split(',')]
    # Pick the actor slots *before* discarding blanks: removing an empty field
    # first would shift every following field by one position, so the fields
    # meant to be skipped would be returned as actor names instead.
    return [name for name in fields[::2] if name]

# Parse the crew string of every movie into a list of actor names
imdb_movies['actor_names'] = imdb_movies['crew'].apply(extract_actor_names)

# One flat list with the actors of all movies (duplicates are kept)
all_actor_names = []
for cast in imdb_movies['actor_names']:
    all_actor_names.extend(cast)

# Guess a gender once per distinct actor, from the first word of the name
name_to_gender = {}
for actor in set(all_actor_names):
    first_name = actor.split()[0]
    name_to_gender[actor] = detector.get_gender(first_name)

# Look up the guessed gender of every actor in a list of names
def infer_genders(actor_names):
    """Return the gender label of each name in ``actor_names``, in order.

    Labels come from the ``name_to_gender`` lookup table; a name that is not
    in the table is labelled 'unknown'.
    """
    genders = []
    for actor_name in actor_names:
        genders.append(name_to_gender.get(actor_name, 'unknown'))
    return genders

# Add the per-movie list of gender labels
imdb_movies['genders'] = imdb_movies['actor_names'].apply(infer_genders)

# Share of female actors in one movie
def calculate_female_proportion(gender_list):
    """Return the fraction of labels in ``gender_list`` that count as female.

    A label counts as female when it is 'female' or 'mostly_female'. Every
    other label stays in the denominator. An empty list yields 0.
    """
    total = len(gender_list)
    if not total:
        return 0
    female_labels = ('female', 'mostly_female')
    n_female = 0
    for label in gender_list:
        if label in female_labels:
            n_female += 1
    return n_female / total

# Store, for every movie, the share of its listed actors labelled as female
imdb_movies['female_proportion'] = imdb_movies['genders'].apply(calculate_female_proportion)

In [36]:
# Create a metric in the merged_df that is bechdel_rating + female_proportion,
# where bechdel_rating is first binarized (1 if the Bechdel rating equals 3, else 0)

# Merge the bechdel columns that we need
bechdel_df = bechdel_data2[['title', 'bechdelRating']].copy()
bechdel_df.columns = ['Movie name', 'Bechdel rating']
# NOTE(review): the joins on 'Movie name' match on the title only; presumably
# distinct movies sharing a title get cross-matched - TODO confirm
merged_df = MovieMetadata_df.merge(bechdel_df, on='Movie name', how='inner')
merged_df = merged_df.merge(plot_summaries_df, on='Wikipedia movie ID', how='inner')
# Binarize the rating: 1 when the Bechdel rating is exactly 3, 0 otherwise
merged_df['Bechdel rating'] = (merged_df['Bechdel rating'] == 3).astype(int)

# Merge the imdb columns that we need
imdb_utility_df = imdb_movies[['names', 'female_proportion']].copy()
imdb_utility_df.columns = ['Movie name', 'Female proportion']
merged_df = merged_df.merge(imdb_utility_df, on='Movie name', how='inner')
print('Merged dataframe has', merged_df.shape[0], 'movies.')

# Create the metric: unweighted sum of the binarized rating and the female proportion
merged_df['Custom metric'] = merged_df['Bechdel rating'] + merged_df['Female proportion']
merged_df.head()

Merged dataframe has 4039 movies.


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples),Bechdel rating,Summaries,Female proportion,Custom metric
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",1,"Set in the second half of the 22nd century, th...",0.444444,1.444444
1,77856,/m/0kcn7,Mary Poppins,1964-08-27,102272727.0,139.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3myq"": ""Children's/Family"", ""/m/04t36""...",1,The film opens with Mary Poppins perched in a...,0.444444,1.444444
2,5664529,/m/0dyy_v,Vixen!,1968,,70.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01yldk"": ""Softcore Porn"", ""/m/06b0n3"": ""S...",1,"In the heart of the Canadian wilderness, sultr...",0.111111,1.111111
3,1765938,/m/05v5ws,Convoy,1978,45000000.0,111.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/02kdv5l...",0,"{{Plot}} Truck driver Martin ""Rubber Duck"" Pen...",0.222222,0.222222
4,196176,/m/01bwgr,Straw Dogs,1971-11-03,11148828.0,118.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/07s9rl0"": ""Drama""}",0,"David Sumner , a timid American mathematician,...",0.222222,0.222222


**Train an SVM model**

In [46]:
from sklearn.model_selection import train_test_split

# Binary label: 1 when the custom metric is above 1.5, 0 otherwise
is_feminist = merged_df['Custom metric'] > 1.5
binary_feminist_list = is_feminist.astype(int)

# Feature matrix (X) and target vector (y)
# NOTE(review): 'Custom metric' is both a feature and the column the label is
# thresholded from, so the classifier can recover the label directly; read the
# evaluation scores with that in mind.
feature_columns = ['Bechdel rating', 'Female proportion', 'Custom metric']
X = merged_df[feature_columns]
y = binary_feminist_list

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
display(X_train)

Unnamed: 0,Bechdel rating,Female proportion,Custom metric
1610,1,0.444444,1.444444
1941,1,0.111111,1.111111
3751,1,0.222222,1.222222
3189,0,0.111111,0.111111
1538,1,0.555556,1.555556
...,...,...,...
1130,1,0.555556,1.555556
1294,1,0.333333,1.333333
860,0,0.333333,0.333333
3507,0,0.222222,0.222222


In [47]:
from sklearn.svm import SVC

# Build a linear-kernel SVM and fit it on the training split
# (fit() returns the estimator itself, so both steps can be chained)
model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predicted labels for the test split
y_pred = model.predict(X_test)

In [48]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the predictions with the true test labels

# Confusion matrix, printed below its title
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", test_confusion, sep="\n")

# Per-class precision / recall / f1 report, printed below its title
test_report = classification_report(y_test, y_pred)
print("\nClassification Report:", test_report, sep="\n")

Confusion Matrix:
[[713   1]
 [  0  94]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       714
           1       0.99      1.00      0.99        94

    accuracy                           1.00       808
   macro avg       0.99      1.00      1.00       808
weighted avg       1.00      1.00      1.00       808



### Now use the previous classification to train a model on the plot summaries to apply to all the movies