# Project Milestone 2

#### Useful libraries

In [114]:
import pandas as pd
import json

#### Building dataframes from files

In [115]:
# Character metadata
# Column names are supplied explicitly through `names`, so every row of the
# file (including the first) is read as data.
char_md_cols = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
]
char_md = pd.read_csv(
    'data/character.metadata.tsv',
    sep='\t',
    header=None,
    names=char_md_cols,
)
char_md.head(2)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4


In [116]:
# Movie metadata
# Column names are supplied explicitly through `names`, so every row of the
# file (including the first) is read as data.
movie_md_cols = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
]
movie_md = pd.read_csv(
    'data/movie.metadata.tsv',
    sep='\t',
    header=None,
    names=movie_md_cols,
)
movie_md.head(2)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."


In [117]:
# Plot summaries
# Two tab-separated fields per row: the Wikipedia movie ID and the summary text.
summaries_cols = ['Wikipedia movie ID', 'Summary']
summaries_df = pd.read_csv(
    'data/plot_summaries.txt',
    sep='\t',
    header=None,
    names=summaries_cols,
)
summaries_df.head(2)

Unnamed: 0,Wikipedia movie ID,Summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...


In [118]:
# Stereotypes
# Each row holds a stereotype label and a 'Character Info' field that is
# decoded with `json.loads` and expanded into one column per JSON key.
stereotypes_cols = [
    'Stereotype',
    'Character Info',
]
stereotypes_df = pd.read_csv('data/tvtropes.clusters.txt', sep='\t', names=stereotypes_cols)

# Build the expanded columns in a single DataFrame construction instead of
# `.apply(pd.Series)`, which instantiates one Series per row and is very slow.
# Passing the original index keeps the rows aligned for the concat below.
character_info = pd.DataFrame(
    stereotypes_df['Character Info'].map(json.loads).tolist(),
    index=stereotypes_df.index,
)
df_stereotypes = pd.concat(
    [stereotypes_df.drop(columns=['Character Info']), character_info],
    axis=1,
)
df_stereotypes.head()

Unnamed: 0,Stereotype,char,movie,id,actor
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader


### Stereotypes from TV Tropes: which are the most prominent ones?

In [119]:
# How many stereotypes are there?
# Inner join: only stereotype rows whose `id` matches a
# 'Freebase character/actor map ID' in the character metadata are kept,
# so the count below is taken over the matched rows.
df = df_stereotypes.merge(
    char_md,
    how="inner",
    left_on="id",
    right_on="Freebase character/actor map ID",
)
df['Stereotype'].nunique()

72

In [120]:
# How many males and females are there?
# Count once, then print the male total followed by the female total.
gender_counts = df['Actor gender'].value_counts()
for gender in ('M', 'F'):
    print(gender_counts[gender])

431
69


In [125]:
# What are the top n most common stereotypes?
n = 10
# value_counts() sorts by descending frequency, so the first n index labels
# are the n most frequent stereotypes.
stereotype_counts = df['Stereotype'].value_counts()
top_n_stereotypes = stereotype_counts.index[:n].tolist()
top_n_stereotypes

['crazy_jealous_guy',
 'corrupt_corporate_executive',
 'byronic_hero',
 'psycho_for_hire',
 'father_to_his_men',
 'stoner',
 'master_swordsman',
 'brainless_beauty',
 'slacker',
 'dumb_blonde']

In [122]:
# What are the top n most common stereotypes among male actors?
is_male = df['Actor gender'].eq('M')
df_male = df.loc[is_male]
df_male['Stereotype'].value_counts().index[:n].tolist()

['crazy_jealous_guy',
 'corrupt_corporate_executive',
 'byronic_hero',
 'psycho_for_hire',
 'father_to_his_men',
 'stoner',
 'master_swordsman',
 'bounty_hunter',
 'hitman_with_a_heart',
 'slacker']

In [123]:
# What are the top n most common stereotypes among female actors?
is_female = df['Actor gender'].eq('F')
df_female = df.loc[is_female]
df_female['Stereotype'].value_counts().index[:n].tolist()

['dumb_blonde',
 'brainless_beauty',
 'valley_girl',
 'chanteuse',
 'prima_donna',
 'final_girl',
 'ophelia',
 'granola_person',
 'broken_bird',
 'junkie_prophet']

In [124]:
# How many males and females are there in each of the top n stereotypes?
# Prints one line per stereotype, then the totals across all of them.
males_n_stereotypes = 0
females_n_stereotypes = 0
for stereotype in top_n_stereotypes:
    df_temp = df[df['Stereotype'] == stereotype]
    gender_counts = df_temp['Actor gender'].value_counts()
    # A gender absent from this stereotype has no entry in the counts;
    # `.get` defaults it to 0 without a bare `except:` hiding unrelated errors.
    males = gender_counts.get('M', 0)
    females = gender_counts.get('F', 0)
    males_n_stereotypes += males
    females_n_stereotypes += females
    print(stereotype + ': ' + str(males) + ' M and ' + str(females) + ' F.')

print(str(males_n_stereotypes) + ' M and ' + str(females_n_stereotypes) + 'F in the top ' + str(n) + ' stereotypes')

crazy_jealous_guy: 25 M and 0 F.
corrupt_corporate_executive: 23 M and 0 F.
byronic_hero: 17 M and 0 F.
psycho_for_hire: 16 M and 0 F.
father_to_his_men: 15 M and 0 F.
stoner: 13 M and 0 F.
master_swordsman: 12 M and 0 F.
brainless_beauty: 5 M and 7 F.
slacker: 9 M and 2 F.
dumb_blonde: 1 M and 10 F.
136 M and 19F in the top 10 stereotypes


### Representation of men vs women in the different movie genres.