In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Load Datasets

In [128]:
# Script dataset: one table of per-character word counts and one of per-movie metadata.
data_folder = 'Data_scripts/'
character_list5 = pd.read_csv(f'{data_folder}character_list5.csv', encoding='ISO-8859-1')
meta_data7 = pd.read_csv(f'{data_folder}meta_data7.csv', encoding='ISO-8859-1')

In [129]:
# Project-cleaned tables; note that data_folder is re-pointed from the script dataset folder.
data_folder = 'OurData/'
data_character = pd.read_csv(f'{data_folder}clean_data_character.csv')
data_success = pd.read_csv(f'{data_folder}clean_data_success.csv')

In [130]:
import requests

# Download the full Bechdel-test movie list as JSON and load it into a DataFrame.
url = 'http://bechdeltest.com/api/v1/getAllMovies'

# A timeout keeps the notebook from hanging forever if the API is unreachable.
response = requests.get(url, timeout=30)

if response.status_code != 200:
    print('Request failed')
    # Fail here with a clear message: every later cell needs df_bechdel, and
    # continuing would only surface as an unrelated NameError further down.
    raise RuntimeError(
        f'Bechdel API returned HTTP {response.status_code}; df_bechdel was not created'
    )

print('Request was succesful')
movie_data = response.json()
df_bechdel = pd.DataFrame(movie_data)
print(df_bechdel.sample())

Request was succesful
           title  rating    id   imdbid  year
8003  Effie Gray       3  6476  1605798  2014


### Datasets size:

Total bechdel = 10408

Total scripts = 2000

scripts + bechdel = 1525

scripts + bechdel + Success Metrics = 1077

In [131]:
# Preview the first rows of the per-character table.
character_list5.head(n=5)

Unnamed: 0,script_id,imdb_character_name,words,gender,age
0,280,betty,311,f,35.0
1,280,carolyn johnson,873,f,
2,280,eleanor,138,f,
3,280,francesca johns,2251,f,46.0
4,280,madge,190,f,46.0


In [132]:
# Preview the first rows of the per-movie metadata table.
meta_data7.head(n=5)

Unnamed: 0,script_id,imdb_id,title,year,gross,lines_data
0,1534,tt1022603,(500) Days of Summer,2009,37.0,7435445256774774443342577775657744434444564456...
1,1512,tt0147800,10 Things I Hate About You,1999,65.0,1777752320274533344457777722433777334443764677...
2,1514,tt0417385,12 and Holding,2005,,5461357777754212454544441367774433446547647753...
3,1517,tt2024544,12 Years a Slave,2013,60.0,4567334777777777777777447777756477777444777777...
4,1520,tt1542344,127 Hours,2010,20.0,453513352345765766777777773340


`meta_data7.csv`: This file contains metadata for movies, including unique IMDB IDs, release year, domestic box office gross (adjusted for inflation), and a column (lines_data) representing the distribution of dialogue lines by gender.

lines_data: A string in which each single digit gives the number of male dialogue lines in one half-minute segment of a movie (the code below assumes 7 lines in total per segment).

Example: lines_data = "120704..." means:

* 1 = 1st half-minute: 1 male line
* 2 = 2nd half-minute: 2 male lines
* 0 = 3rd half-minute: 0 male lines
 
and so on.

In [133]:
def calculate_gender_proportion(lines_data, lines_per_segment=7):
    """
    Calculate the proportion of male and female speech from a lines_data string.

    Each character of ``lines_data`` is read as the number of male dialogue
    lines in one segment; every segment is assumed to hold
    ``lines_per_segment`` lines in total, and whatever is not male is
    counted as female.

    Args:
        lines_data (str): A string of digits, one digit per half-minute segment.
        lines_per_segment (int): Total number of lines assumed per segment.
            Defaults to 7, the value this notebook has always used.

    Returns:
        dict: ``{"male_percentage": ..., "female_percentage": ...}`` as fractions
        in [0, 1] (despite the key names, not multiplied by 100). Both values are
        0 when the input is missing or empty.

    Raises:
        ValueError: If ``lines_data`` contains a character that is not a digit.
    """
    # pandas represents a missing cell as float NaN, which is truthy and would
    # otherwise slip past the emptiness check and crash when iterated.
    if isinstance(lines_data, float) or not lines_data:
        print('Invalid input')
        return {"male_percentage": 0, "female_percentage": 0}  # Handle invalid input

    # One digit per segment: sum the male lines, and scale the segment count
    # by the assumed number of lines per segment.
    total_male_lines = sum(int(char) for char in lines_data)
    total_lines = lines_per_segment * len(lines_data)

    male_percentage = total_male_lines / total_lines
    female_percentage = 1 - male_percentage

    return {"male_percentage": male_percentage, "female_percentage": female_percentage}


In [134]:
# Run the helper once per movie and reuse the resulting dict for both columns,
# instead of parsing every lines_data string twice.
gender_share = meta_data7['lines_data'].apply(calculate_gender_proportion)
meta_data7['male_script'] = gender_share.map(lambda share: share['male_percentage'])
meta_data7['female_script'] = gender_share.map(lambda share: share['female_percentage'])

meta_data7.head()

Unnamed: 0,script_id,imdb_id,title,year,gross,lines_data,male_script,female_script
0,1534,tt1022603,(500) Days of Summer,2009,37.0,7435445256774774443342577775657744434444564456...,0.680672,0.319328
1,1512,tt0147800,10 Things I Hate About You,1999,65.0,1777752320274533344457777722433777334443764677...,0.580392,0.419608
2,1514,tt0417385,12 and Holding,2005,,5461357777754212454544441367774433446547647753...,0.654646,0.345354
3,1517,tt2024544,12 Years a Slave,2013,60.0,4567334777777777777777447777756477777444777777...,0.821756,0.178244
4,1520,tt1542344,127 Hours,2010,20.0,453513352345765766777777773340,0.695238,0.304762


In [135]:
# Capitalise 'year' and give the rating column a descriptive name, in a single pass.
df_bechdel.rename(columns={'year': 'Year', 'rating': 'Bechdel_score'}, inplace=True)
df_bechdel.head(n=5)

Unnamed: 0,title,Bechdel_score,id,imdbid,Year
0,Passage de Venus,0,9602,3155794,1874
1,La Rosace Magique,0,9804,14495706,1877
2,Sallie Gardner at a Gallop,0,9603,2221420,1878
3,Le singe musicien,0,9806,12592084,1878
4,Athlete Swinging a Pick,0,9816,7816420,1881


In [136]:
# Put the ids in the 'tt'-prefixed form used by meta_data7['imdb_id'] so the two
# tables can be joined. Any existing prefix is stripped first, which makes the
# cell safe to re-run: it can no longer produce 'tttt...' ids.
df_bechdel['imdbid'] = 'tt' + df_bechdel['imdbid'].str.replace(r'^tt', '', regex=True)
df_bechdel.head()

Unnamed: 0,title,Bechdel_score,id,imdbid,Year
0,Passage de Venus,0,9602,tt3155794,1874
1,La Rosace Magique,0,9804,tt14495706,1877
2,Sallie Gardner at a Gallop,0,9603,tt2221420,1878
3,Le singe musicien,0,9806,tt12592084,1878
4,Athlete Swinging a Pick,0,9816,tt7816420,1881


In [137]:
# Re-display the metadata table, now including the male_script / female_script columns.
meta_data7.head(n=5)

Unnamed: 0,script_id,imdb_id,title,year,gross,lines_data,male_script,female_script
0,1534,tt1022603,(500) Days of Summer,2009,37.0,7435445256774774443342577775657744434444564456...,0.680672,0.319328
1,1512,tt0147800,10 Things I Hate About You,1999,65.0,1777752320274533344457777722433777334443764677...,0.580392,0.419608
2,1514,tt0417385,12 and Holding,2005,,5461357777754212454544441367774433446547647753...,0.654646,0.345354
3,1517,tt2024544,12 Years a Slave,2013,60.0,4567334777777777777777447777756477777444777777...,0.821756,0.178244
4,1520,tt1542344,127 Hours,2010,20.0,453513352345765766777777773340,0.695238,0.304762


In [138]:
# Keep only the scripts that also have a Bechdel rating, matched on IMDb id.
bechdel_script = meta_data7.merge(
    df_bechdel[['Bechdel_score', 'imdbid']],
    how='inner',
    left_on='imdb_id',
    right_on='imdbid',
)

In [139]:
# Rename in one call, then discard the raw line string, the gross column and
# the duplicated join key in one call.
bechdel_script.rename(columns={'year': 'Year', 'rating': 'Bechdel_score'}, inplace=True)
bechdel_script.drop(columns=['lines_data', 'gross', 'imdbid'], inplace=True)
bechdel_script.head(n=5)

Unnamed: 0,script_id,imdb_id,title,Year,male_script,female_script,Bechdel_score
0,1534,tt1022603,(500) Days of Summer,2009,0.680672,0.319328,1
1,1512,tt0147800,10 Things I Hate About You,1999,0.580392,0.419608,3
2,1514,tt0417385,12 and Holding,2005,0.654646,0.345354,3
3,1517,tt2024544,12 Years a Slave,2013,0.821756,0.178244,3
4,1520,tt1542344,127 Hours,2010,0.695238,0.304762,3


In [140]:

# Preview the merged script + Bechdel table.
bechdel_script.head(n=5)

Unnamed: 0,script_id,imdb_id,title,Year,male_script,female_script,Bechdel_score
0,1534,tt1022603,(500) Days of Summer,2009,0.680672,0.319328,1
1,1512,tt0147800,10 Things I Hate About You,1999,0.580392,0.419608,3
2,1514,tt0417385,12 and Holding,2005,0.654646,0.345354,3
3,1517,tt2024544,12 Years a Slave,2013,0.821756,0.178244,3
4,1520,tt1542344,127 Hours,2010,0.695238,0.304762,3


In [141]:
# Build a join key that ignores case and spaces in the titles.
data_success['Normalized_Title'] = data_success['title'].str.lower().str.replace(" ", "")
bechdel_script['Normalized_Title'] = bechdel_script['title'].str.lower().str.replace(" ", "")

# Match on normalised title and release year together.
success_bechdel_script = data_success.merge(
    bechdel_script,
    how='inner',
    on=['Normalized_Title', 'Year'],
)

In [142]:
# Keep the left-hand title under its plain name and drop the merge helper columns.
success_bechdel_script.rename(columns={'title_x': 'title'}, inplace=True)
success_bechdel_script.drop(
    columns=['title_y', 'Normalized_Title', 'BoxOfficeStandardize'],
    inplace=True,
)
success_bechdel_script.head(n=5)

Unnamed: 0.1,Unnamed: 0,Wiki_ID,Movie_ID,title,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres,...,Net_revenue,rating,BoxOfficeRank,RatingRank,SuccessMetric,script_id,imdb_id,male_script,female_script,Bechdel_score
0,233,103021,/m/0p_sc,Midnight Express,1978-08-31,35000000.0,121.0,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/0hn10"": ""LGBT"", ...",...,32700000.0,7.55,0.627565,0.919481,0.773523,1155,tt0077928,0.947479,0.052521,0
1,298,5549065,/m/0ds11z,Sweeney Todd: The Demon Barber of Fleet Street,2007-12-03,152523164.0,117.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/0fx2s"": ""Tra...",...,102523164.0,7.15,0.824737,0.820454,0.822595,5083,tt0408236,0.640693,0.359307,1
2,346,1033297,/m/0401sg,Resident Evil,2002-03-12,102441078.0,100.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,69441078.0,6.5,0.758458,0.573173,0.665816,3204,tt0120804,0.605042,0.394958,3
3,555,30548208,/m/0g9wdmc,The Iron Lady,2011-12-26,114943631.0,104.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0f8l9c"": ""France"", ""/m/07ssc"": ""United Ki...","{""/m/017fp"": ""Biography"", ""/m/07s9rl0"": ""Drama""}",...,101943631.0,6.3,0.823905,0.489887,0.656896,2634,tt1007029,0.356077,0.643923,3
4,562,1964091,/m/069dl1,Next,2007-04-25,76066841.0,95.0,"{""/m/064_8sq"": ""French Language"", ""/m/04306rv""...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,6066841.0,6.050048,0.410427,0.389811,0.400119,1190,tt0435705,0.556068,0.443932,2


In [143]:
# Distinct actors per movie, overall and restricted to actor_gender == 'F'.
actors_per_movie = data_character.groupby('Movie_ID')['Actor_ID'].nunique()
women_per_movie = (
    data_character[data_character['actor_gender'] == 'F']
    .groupby('Movie_ID')['Actor_ID']
    .nunique()
    # A movie that has a cast but no female actor is missing from the filtered
    # groupby. Count it as 0 women rather than NaN, otherwise the movie is
    # silently removed later when rows with a NaN incl_score are dropped.
    .reindex(actors_per_movie.index, fill_value=0)
)

# Movies absent from data_character altogether still map to NaN in both columns.
success_bechdel_script['num_actors'] = success_bechdel_script['Movie_ID'].map(actors_per_movie)
success_bechdel_script['num_women'] = success_bechdel_script['Movie_ID'].map(women_per_movie)

# NOTE: despite the column name, this is women / all actors (a share in [0, 1]),
# not a women-to-men ratio. The name is kept because later cells and the
# exported CSV use it.
success_bechdel_script['ratio_W/M'] = success_bechdel_script['num_women'] / success_bechdel_script['num_actors']

In [144]:
# Preview the table with the new cast-count columns.
success_bechdel_script.head(n=5)

Unnamed: 0.1,Unnamed: 0,Wiki_ID,Movie_ID,title,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres,...,RatingRank,SuccessMetric,script_id,imdb_id,male_script,female_script,Bechdel_score,num_actors,num_women,ratio_W/M
0,233,103021,/m/0p_sc,Midnight Express,1978-08-31,35000000.0,121.0,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/0hn10"": ""LGBT"", ...",...,0.919481,0.773523,1155,tt0077928,0.947479,0.052521,0,12.0,1.0,0.083333
1,298,5549065,/m/0ds11z,Sweeney Todd: The Demon Barber of Fleet Street,2007-12-03,152523164.0,117.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/0fx2s"": ""Tra...",...,0.820454,0.822595,5083,tt0408236,0.640693,0.359307,1,13.0,3.0,0.230769
2,346,1033297,/m/0401sg,Resident Evil,2002-03-12,102441078.0,100.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,0.573173,0.665816,3204,tt0120804,0.605042,0.394958,3,10.0,4.0,0.4
3,555,30548208,/m/0g9wdmc,The Iron Lady,2011-12-26,114943631.0,104.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0f8l9c"": ""France"", ""/m/07ssc"": ""United Ki...","{""/m/017fp"": ""Biography"", ""/m/07s9rl0"": ""Drama""}",...,0.489887,0.656896,2634,tt1007029,0.356077,0.643923,3,15.0,10.0,0.666667
4,562,1964091,/m/069dl1,Next,2007-04-25,76066841.0,95.0,"{""/m/064_8sq"": ""French Language"", ""/m/04306rv""...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",...,0.389811,0.400119,1190,tt0435705,0.556068,0.443932,2,12.0,2.0,0.166667


In [145]:
# Inclusivity score: unweighted mean of three components, each on a 0-1 scale --
# share of women in the cast, female share of script lines, and the Bechdel
# score divided by 3 (the scores shown above range from 0 to 3).
success_bechdel_script['incl_score'] = (
    success_bechdel_script['ratio_W/M']
    + success_bechdel_script['female_script']
    + success_bechdel_script['Bechdel_score'] / 3
) / 3

# Keep only the movies for which every component was available.
success_bechdel_script = success_bechdel_script[success_bechdel_script['incl_score'].notna()]
success_bechdel_script.shape

(1077, 26)

In [147]:
# Slimmed-down copy: remove the intermediate columns in a single drop,
# which returns a new frame and leaves success_bechdel_script untouched.
all_metrics = success_bechdel_script.drop(
    columns=[
        'release_date',
        'BoxOfficeRevenue',
        'num_women',
        'num_actors',
        'male_script',
        'female_script',
        'RatingRank',
        'BoxOfficeRank',
        'budget',
    ]
)

In [148]:
# Preview the reduced metrics table.
all_metrics.head(n=5)

Unnamed: 0.1,Unnamed: 0,Wiki_ID,Movie_ID,title,Runtime,Languages,Countries,Genres,Year,Net_revenue,rating,SuccessMetric,script_id,imdb_id,Bechdel_score,ratio_W/M,incl_score
0,233,103021,/m/0p_sc,Midnight Express,121.0,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/0hn10"": ""LGBT"", ...",1978,32700000.0,7.55,0.773523,1155,tt0077928,0,0.083333,0.045285
1,298,5549065,/m/0ds11z,Sweeney Todd: The Demon Barber of Fleet Street,117.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/0fx2s"": ""Tra...",2007,102523164.0,7.15,0.822595,5083,tt0408236,1,0.230769,0.307803
2,346,1033297,/m/0401sg,Resident Evil,100.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2002,69441078.0,6.5,0.665816,3204,tt0120804,3,0.4,0.598319
3,555,30548208,/m/0g9wdmc,The Iron Lady,104.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0f8l9c"": ""France"", ""/m/07ssc"": ""United Ki...","{""/m/017fp"": ""Biography"", ""/m/07s9rl0"": ""Drama""}",2011,101943631.0,6.3,0.656896,2634,tt1007029,3,0.666667,0.770197
4,562,1964091,/m/069dl1,Next,95.0,"{""/m/064_8sq"": ""French Language"", ""/m/04306rv""...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2007,6066841.0,6.050048,0.400119,1190,tt0435705,2,0.166667,0.425755


In [149]:
# Persist both tables without writing the row index.
success_bechdel_script.to_csv('OurData/success_bechdel_script.csv', index=False)
all_metrics.to_csv('OurData/all_metrics.csv', index=False)