In [9]:
#Library Imports
import ast
import re

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import stats
from scipy.stats import ttest_ind
from sklearn.metrics.pairwise import cosine_similarity
from statsmodels.stats import diagnostic

# Reading in the data.
# All three files are tab-separated and are read with header=None, so the
# column names are supplied explicitly for each table.

# Movie metadata.
metadata_headers = [
    "Wikipedia_ID",
    "Freebase_ID",
    "Movie_name",
    "Release_date",
    "box_office_revenue",
    "runtime",
    "languages",
    "countries",
    "genres",
]
dfmetadata = pd.read_csv('data/movie.metadata.tsv', sep='\t', header=None, names=metadata_headers)

# Plot summaries (one summary text per Wikipedia ID).
dfsummaries = pd.read_csv('data/plot_summaries.txt', sep='\t', header=None, names=["Wikipedia_ID", "Summary"])

# Character / actor metadata.
character_head = [
    "Wikipedia_ID",
    "Freebase_ID",
    "Release_date",
    "Character_name",
    "Actor_date_of_birth",
    "Gender",
    "Height",
    "Ethnicity",
    "Actor_name",
    "Age",
    "Freebase_character/actor_map_ID",
    "Freebase_character_ID",
    "Freebase_actor_ID",
]
dfcharacter = pd.read_csv('data/character.metadata.tsv', sep='\t', header=None, names=character_head)

## Statistics 1: Is the association between genre and the gender of the cast significant? Answer: Yes.

## <span style="color: blue;">This is all old stuff: reading in and transforming the data.</span>

In [10]:
# To study women across genres, the cast rows need their film's genres attached.
# Join the character table to the movie metadata on the shared Freebase_ID,
# keeping only the columns used further down.
meta_char_merged = pd.merge(
    dfcharacter[['Freebase_ID', 'Actor_name', 'Gender']],
    dfmetadata[['Freebase_ID', 'Movie_name', 'genres']],
    on="Freebase_ID",
)

In [11]:
# The `genres` column arrives as the string representation of a dictionary.
# Parse it entry by entry: values that are already parsed (e.g. when this cell
# is re-run) pass through unchanged, rather than deciding for the whole column
# from the type of one sample row.
meta_char_merged['genres'] = meta_char_merged['genres'].apply(
    lambda entry: ast.literal_eval(entry) if isinstance(entry, str) else entry
)

# Expand the dictionaries so that every genre key becomes its own column.
df_dict1 = pd.json_normalize(meta_char_merged['genres'])

# The expanded columns are labelled with genre IDs. The most frequent non-NaN
# value of a column is the genre's name, so use it as a more intuitive header.
df_dict1.columns = [df_dict1[column].mode()[0] for column in df_dict1.columns]

# Attach the per-genre columns to the remaining information (movie name,
# gender, ...). Both frames are aligned on their row index.
meta_char_merged2 = pd.concat([meta_char_merged.drop(columns='genres'), df_dict1], axis=1)

In [12]:
# Show the merged table: one row per cast entry, with one column per genre.
meta_char_merged2

Unnamed: 0,Freebase_ID,Actor_name,Gender,Movie_name,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,...,Comdedy,Children's Issues,Statutory rape,Breakdance,War effort,Revenge,Romantic thriller,Chick flick,Buddy Picture,Homoeroticism
0,/m/03vyhn,Wanda De Jesus,F,Ghosts of Mars,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,...,,,,,,,,,,
1,/m/03vyhn,Natasha Henstridge,F,Ghosts of Mars,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,...,,,,,,,,,,
2,/m/03vyhn,Ice Cube,M,Ghosts of Mars,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,...,,,,,,,,,,
3,/m/03vyhn,Jason Statham,M,Ghosts of Mars,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,...,,,,,,,,,,
4,/m/03vyhn,Clea DuVall,F,Ghosts of Mars,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,/m/03pcrp,Dorothy Elias-Fahn,F,The Super Dimension Fortress Macross II: Lover...,,Science Fiction,,Adventure,,,...,,,,,,,,,,
450665,/m/03pcrp,Jonathan Fahn,M,The Super Dimension Fortress Macross II: Lover...,,Science Fiction,,Adventure,,,...,,,,,,,,,,
450666,/m/0cp05t9,David Hemmings,M,Five Clues to Fortune,,,,,,,...,,,,,,,,,,
450667,/m/0cp05t9,Roberta Paterson,,Five Clues to Fortune,,,,,,,...,,,,,,,,,,


In [13]:
# How does gender representation vary between genres?
# With the identifier/name columns removed only the genre columns remain, and
# `count` tallies their non-null entries separately for each gender.
gender_counts_init = (
    meta_char_merged2
    .drop(columns=['Freebase_ID', 'Actor_name', 'Movie_name'])
    .groupby('Gender')
    .count()
)
gender_counts_init

Unnamed: 0_level_0,Thriller,Science Fiction,Horror,Adventure,Supernatural,Action,Space western,Mystery,Biographical film,Drama,...,Comdedy,Children's Issues,Statutory rape,Breakdance,War effort,Revenge,Romantic thriller,Chick flick,Buddy Picture,Homoeroticism
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F,21270,6626,11659,9717,1951,16203,36,8042,2578,71760,...,2,1,3,5,2,2,1,8,7,0
M,48159,16351,21129,27074,3299,49184,118,16027,5526,133722,...,7,2,5,5,1,6,0,10,29,7


In [14]:
# Flip the table so that genres become rows and the genders become columns.
gender_counts = gender_counts_init.T

In [15]:
# Add two derived columns per genre.
# Combined number of female and male cast entries, used later to find the most common genres.
gender_counts['total_count'] = gender_counts['F'] + gender_counts['M']
# Share of women among those entries, in percent.
gender_counts['F_percentage'] = (gender_counts['F'] / gender_counts['total_count']) * 100

# Move the genre labels out of the index into a regular column named 'Genre'.
gender_counts_2 = gender_counts.reset_index().rename(columns={'index': 'Genre'})

gender_counts_2

Gender,Genre,F,M,total_count,F_percentage
0,Thriller,21270,48159,69429,30.635613
1,Science Fiction,6626,16351,22977,28.837533
2,Horror,11659,21129,32788,35.558741
3,Adventure,9717,27074,36791,26.411351
4,Supernatural,1951,3299,5250,37.161905
...,...,...,...,...,...
356,Revenge,2,6,8,25.000000
357,Romantic thriller,1,0,1,100.000000
358,Chick flick,8,10,18,44.444444
359,Buddy Picture,7,29,36,19.444444


## <span style="color: blue;">New stuff starts here:</span>


In [26]:
# Keep only the genre label and the raw per-gender counts.
df_simplified = gender_counts_2[['Genre', 'F', 'M']]

# One row per genre. aggfunc='sum' is passed explicitly because pivot_table
# aggregates with the mean by default, which would average the counts of any
# rows that share a genre label instead of adding them up.
df_pivoted = df_simplified.pivot_table(index='Genre', values=['F', 'M'], aggfunc='sum')

# Transpose the DataFrame to have genders as rows and genres as columns
df_contingency = df_pivoted.T
df_contingency

Genre,Absurdism,Acid western,Action,Action Comedy,Action Thrillers,Action/Adventure,Addiction Drama,Adult,Adventure,Adventure Comedy,...,Werewolf fiction,Western,Whodunit,Women in prison films,Workplace Comedy,World History,World cinema,Wuxia,Z movie,Zombie Film
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F,342,12,16203,399,1086,9744,173,449,9717,394,...,12,1263,195,64,386,4,16599,176,5,560
M,895,78,49184,1224,3496,31768,273,433,27074,1213,...,33,5011,449,39,632,15,31152,445,20,1219


In [27]:
# Keep only the genres whose female AND male counts are both non-zero.
# NOTE(review): this removes genres that were observed for a single gender only;
# confirm that excluding them is intended before interpreting the test below.
df_contingency = df_contingency.loc[:, (df_contingency != 0).all(axis=0)]

In [29]:
# Chi-square test of independence on the gender-by-genre contingency table.
chi2, p, dof, expected = stats.chi2_contingency(df_contingency)

print("Chi-square statistic:", chi2)
print("P-value:", p)
print("Degrees of freedom:", dof)

# Report the outcome at the 5% significance level.
print(
    "There is a significant association between genre and gender."
    if p < 0.05
    else "There is no significant association between genre and gender."
)

Chi-square statistic: 18855.181474866225
P-value: 0.0
Degrees of freedom: 354
There is a significant association between genre and gender.


## Statistics 2: Is the association between character type and the gender of the actor significant? Answer: Yes.

## <span style="color: blue;">Again, old stuff:</span>

In [33]:
# In the raw file the information about character, movie, id and actor is
# packed into a single column (column 1); column 0 holds the character type.
df_clusters_initial = pd.read_table('data/tvtropes.clusters.txt', header=None)

# Turn the string representations of dictionaries into actual dictionaries.
# Checking each entry (rather than one sample row) leaves anything that is
# not a string untouched.
df_clusters_initial[1] = df_clusters_initial[1].apply(
    lambda entry: ast.literal_eval(entry) if isinstance(entry, str) else entry
)

# Separate the values in the dictionaries into different columns.
df_dict = pd.json_normalize(df_clusters_initial[1])

# Replace the packed column with the separated dictionary values.
dfclusters = pd.concat([df_clusters_initial.drop(columns=1), df_dict], axis=1)

# Use the same column names as the character table so the two can be merged.
dfclusters = dfclusters.rename(
    columns={0: "type", 'actor': 'Actor_name', 'id': 'Freebase_character/actor_map_ID'}
)

In [34]:
# Only the gender is needed from the character table, so just the join key and
# 'Gender' are merged in; this avoids dropping many columns afterwards.
clusters_merged = pd.merge(
    dfclusters,
    dfcharacter[['Freebase_character/actor_map_ID', 'Gender']],
    on="Freebase_character/actor_map_ID",
)

# Number of characters per (type, gender), spread into one column per gender;
# combinations that never occur are filled with 0.
types = (
    clusters_merged
    .groupby(['type', 'Gender'])
    .size()
    .unstack(fill_value=0)
)

# Total M+F per type, to see which types are most common.
types['total_count'] = types['F'] + types['M']

# Percentage of women per type.
types['F_percentage'] = (types['F'] / types['total_count']) * 100

# Make 'type' a regular column again.
types = types.reset_index()

In [35]:
# Show the character-type clusters together with the merged-in gender column.
clusters_merged

Unnamed: 0,type,char,movie,Freebase_character/actor_map_ID,Actor_name,Gender
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams,M
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane,M
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen,M
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn,M
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader,M
...,...,...,...,...,...,...
496,young_gun,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton,M
497,young_gun,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson,M
498,young_gun,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West,M
499,young_gun,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez,M


In [36]:
# The 15 character types with the highest percentage of women, highest first.
types.sort_values(by='F_percentage',ascending=False).head(15)

Gender,type,F,M,total_count,F_percentage
69,valley_girl,6,0,6,100.0
53,prima_donna,6,0,6,100.0
51,ophelia,5,0,5,100.0
44,junkie_prophet,2,0,2,100.0
12,chanteuse,6,0,6,100.0
15,classy_cat_burglar,1,0,1,100.0
34,final_girl,6,0,6,100.0
27,dumb_blonde,10,1,11,90.909091
6,broken_bird,4,1,5,80.0
5,brainless_beauty,7,5,12,58.333333


## <span style="color: blue;">New stuff:</span>

In [37]:
# Chi-squared test of independence between gender and character type.

# Gender-by-type table of observed counts.
contingency_table = pd.crosstab(clusters_merged['Gender'], clusters_merged['type'])

# Only the statistic and the p-value are used. They are sliced out instead of
# unpacking the remaining values into `_`, because assigning to `_` in a
# notebook shadows IPython's "last output" variable.
# NOTE(review): many cells of this table are 0 or single digits, so the
# chi-squared approximation may be unreliable here - check the expected
# frequencies before relying on the p-value.
chi2, p_value = stats.chi2_contingency(contingency_table)[:2]

print(f"Chi-squared statistic: {chi2}")
print(f"P-value: {p_value}")

Chi-squared statistic: 369.8513527525936
P-value: 5.592160386860096e-42


In [38]:
# Show the gender-by-type counts that were passed to the chi-squared test.
contingency_table

type,absent_minded_professor,adventurer_archaeologist,arrogant_kungfu_guy,big_man_on_campus,bounty_hunter,brainless_beauty,broken_bird,bromantic_foil,bruiser_with_a_soft_center,bully,...,storyteller,stupid_crooks,surfer_dude,the_chief,the_editor,tranquil_fury,trickster,valley_girl,warrior_poet,young_gun
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F,0,1,1,0,0,7,4,0,0,1,...,1,0,0,0,0,0,0,6,0,0
M,5,3,8,7,10,5,1,5,3,2,...,3,5,9,3,2,7,5,0,9,6
