# The New Bechdel test!
Analysis Character metadata in this notebook

In [241]:
import csv
import pandas as pd
import re
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from unidecode import unidecode
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from plotly import tools

In [3]:
mcm_df = pd.read_csv("mcm_csv.csv")

In [4]:
mcm_df.describe()

Unnamed: 0,Character_Id,Character_Name,Movie_Number,Movie_Title,Gender,Position_in_Credits
count,9033,9033,9033,9033,9033,9033
unique,9033,5355,617,617,4,57
top,u1493,MAN,m289,casino,m,?
freq,1,44,44,44,6014,6354


In [5]:
mcm_df.Gender.unique()

array(['f', 'm', 'M', 'F'], dtype=object)

In [12]:
# Since, there are 4 types of same gender characters let's condense them
mcm_df['Gender'] = mcm_df['Gender'].apply(lambda x: chr(ord(x)+32) if ord(x) > 65 and ord(x) < 90 else x)

In [13]:
mcm_df.Gender.unique()
# Now we have only two characters

array(['f', 'm'], dtype=object)

In [15]:
mcm_df.Gender.describe()

count     9033
unique       2
top          m
freq      6164
Name: Gender, dtype: object

In [18]:
mcm_df.Gender.value_counts()

m    6164
f    2869
Name: Gender, dtype: int64

# Meaning,
There are a total of : <br>
6164 Male Characters <br>
2869 Female Characters in the entire dataset. <br>

So, out of 617 movies with a total character count of 9033 in all of those movies we get : <br>
Female % = 2869 / 9033 * 100 = 31.76132% <br>
Male % = 6164 / 9033 * 100 = 63.23868% <br>
Altough the female % is quite grim, this doesn't represent us with a good background of for speaking about the diversity. <br>
Let's dig deeper <br>
# Consider the conversations metadata for Nurse Betty (m152)

In [116]:
# Lets, import the dataframe
mc_df = pd.read_csv("mc_csv.csv")

In [117]:
mc_df.describe()

Unnamed: 0,Character_Id1,Character_Id2,Movie_Number,List_of_Utterance
count,83097,83097,83097,83097
unique,5420,5608,617,83097
top,u4331,u1475,m289,"['L503448', 'L503449']"
freq,193,187,338,1


In [118]:
# So, the columns here in this dataset mention that characterId1 
# is the charcter speking to characterId2 in each movies
m_df = mc_df.loc[mc_df['Movie_Number'] == 'm152']

In [119]:
m_df.describe()

Unnamed: 0,Character_Id1,Character_Id2,Movie_Number,List_of_Utterance
count,223,223,223,223
unique,12,17,1,223
top,u2369,u2389,m152,"['L423498', 'L423499']"
freq,101,60,223,1


In [120]:
m_df.Character_Id1.unique()

array(['u2368', 'u2369', 'u2370', 'u2371', 'u2374', 'u2379', 'u2380',
       'u2381', 'u2382', 'u2385', 'u2387', 'u2388'], dtype=object)

In [121]:
m_df.Character_Id2.unique()

array(['u2375', 'u2369', 'u2387', 'u2382', 'u2371', 'u2372', 'u2388',
       'u2379', 'u2374', 'u2378', 'u2384', 'u2386', 'u2383', 'u2376',
       'u2389', 'u2377', 'u2373'], dtype=object)

In [122]:
m1_df = mcm_df.loc[mcm_df['Movie_Number'] == 'm152']
# Which has 22 characters

In [123]:
print("Number of characters:", m1_df.shape[0])
m1_df.head()

('Number of characters:', 22)


Unnamed: 0,Character_Id,Character_Name,Movie_Number,Movie_Title,Gender,Position_in_Credits
2368,u2368,BALLARD,m152,nurse betty,m,8
2369,u2369,BETTY,m152,nurse betty,f,2
2370,u2370,BLAKE,m152,nurse betty,m,?
2371,u2371,CHARLIE,m152,nurse betty,m,1
2372,u2372,CHIEF NURSE,m152,nurse betty,f,?


In [124]:
print("Number of conversations:", m_df.shape[0])
m_df.head()

('Number of conversations:', 223)


Unnamed: 0,Character_Id1,Character_Id2,Movie_Number,List_of_Utterance
21699,u2368,u2375,m152,"['L423645', 'L423646', 'L423647', 'L423648', '..."
21700,u2368,u2375,m152,"['L423656', 'L423657', 'L423658', 'L423659']"
21701,u2368,u2369,m152,"['L423341', 'L423342', 'L423343', 'L423344']"
21702,u2368,u2369,m152,"['L423714', 'L423715', 'L423716', 'L423717', '..."
21703,u2368,u2369,m152,"['L424468', 'L424469']"


In [210]:
new_df = pd.DataFrame(columns = ['Character_Id1', 'Character_Id2', 'Gender1', 
                                 'Gender2', 'List_of_Utterance', 'Movie_Number',
                                 'Movie_Title', 'Name1', 'Name2'])

In [211]:
new_df = new_df.append(m_df, ignore_index = True)

In [212]:
for index, rows in new_df.iterrows():
    rows[2] = m1_df.loc[m1_df.Character_Id == rows[0], 'Gender'].iloc[0]
    rows[3] = m1_df.loc[m1_df.Character_Id == rows[1], 'Gender'].iloc[0]
    rows[6] = m1_df.loc[m1_df.Movie_Number == rows[5], 'Movie_Title'].iloc[0]
    rows[7] = m1_df.loc[m1_df.Character_Id == rows[0], 'Character_Name'].iloc[0]
    rows[8] = m1_df.loc[m1_df.Character_Id == rows[1], 'Character_Name'].iloc[0]

In [213]:
# Our final DF for the movie
new_df

Unnamed: 0,Character_Id1,Character_Id2,Gender1,Gender2,List_of_Utterance,Movie_Number,Movie_Title,Name1,Name2
0,u2368,u2375,m,m,"['L423645', 'L423646', 'L423647', 'L423648', '...",m152,nurse betty,BALLARD,DOCTOR
1,u2368,u2375,m,m,"['L423656', 'L423657', 'L423658', 'L423659']",m152,nurse betty,BALLARD,DOCTOR
2,u2368,u2369,m,f,"['L423341', 'L423342', 'L423343', 'L423344']",m152,nurse betty,BALLARD,BETTY
3,u2368,u2369,m,f,"['L423714', 'L423715', 'L423716', 'L423717', '...",m152,nurse betty,BALLARD,BETTY
4,u2368,u2369,m,f,"['L424468', 'L424469']",m152,nurse betty,BALLARD,BETTY
5,u2368,u2387,m,m,"['L423338', 'L423339', 'L423340']",m152,nurse betty,BALLARD,ROY
6,u2368,u2387,m,m,"['L423345', 'L423346', 'L423347', 'L423348']",m152,nurse betty,BALLARD,ROY
7,u2368,u2387,m,m,"['L423350', 'L423351', 'L423352', 'L423353', '...",m152,nurse betty,BALLARD,ROY
8,u2368,u2387,m,m,"['L423592', 'L423593', 'L423594', 'L423595', '...",m152,nurse betty,BALLARD,ROY
9,u2368,u2387,m,m,"['L423605', 'L423606']",m152,nurse betty,BALLARD,ROY


In [215]:
new_df.describe()

Unnamed: 0,Character_Id1,Character_Id2,Gender1,Gender2,List_of_Utterance,Movie_Number,Movie_Title,Name1,Name2
count,223,223,223,223,223,223,223,223,223
unique,12,17,2,2,223,1,1,12,17
top,u2369,u2389,m,m,"['L423498', 'L423499']",m152,nurse betty,BETTY,WESLEY
freq,101,60,112,150,1,223,223,101,60


In [236]:
# Let's look deeper into the data for this movie
print(new_df.Gender1.value_counts())
print(new_df.Gender2.value_counts())

m    112
f    111
Name: Gender1, dtype: int64
m    150
f     73
Name: Gender2, dtype: int64


In [246]:
# Visualization on the year of realese to get the idea of movie years we are working with
trace1 = [go.Histogram(
            x=new_df['Name1'],
            marker = dict(color = 'rgb(17, 157, 100)'),
            hoverlabel = dict(bordercolor = 'rgb(0, 0, 0)')
        )]
trace2 = [go.Histogram(
            x=new_df['Name2'],
            marker = dict(color = 'rgb(17, 157, 100)'),
            hoverlabel = dict(bordercolor = 'rgb(0, 0, 0)')
        )]
data = [trace1, trace2]
layout = go.Layout(
    xaxis=dict(
        title='Years',
        domain=[0, 0.5]
    ),
    yaxis=dict(
        title='Counts of Movie Releases',
        domain=[0, 0.5]
    ),
    xaxis2=dict(
        title='Years',
        domain=[0.5, 1]
    ),
    yaxis2=dict(
        title='Counts of Movie Releases',
        domain=[0.5, 1]
    ),
    bargap=0.2
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-histogram')

ValueError: 
    Invalid element(s) received for the 'data' property of 
        Invalid elements include: [[Histogram({
    'hoverlabel': {'bordercolor': 'rgb(0, 0, 0)'},
    'marker': {'color': 'rgb(17, 157, 100)'},
    'x': array(['BALLARD', 'BALLARD', 'BALLARD', ..., 'SUE ANN', 'SUE ANN', 'SUE ANN'],
               dtype=object)
})], [Histogram({
    'hoverlabel': {'bordercolor': 'rgb(0, 0, 0)'},
    'marker': {'color': 'rgb(17, 157, 100)'},
    'x': array(['DOCTOR', 'DOCTOR', 'BETTY', ..., 'WESLEY', 'WESLEY', 'WESLEY'],
               dtype=object)
})]]

    The 'data' property is a tuple of trace instances
    that may be specified as:
      - A list or tuple of trace instances
        (e.g. [Scatter(...), Bar(...)])
      - A list or tuple of dicts of string/value properties where:
        - The 'type' property specifies the trace type
            One of: ['mesh3d', 'splom', 'scattercarpet',
                     'scattergl', 'scatterternary', 'pie',
                     'surface', 'histogram', 'ohlc', 'heatmapgl',
                     'cone', 'scatterpolar', 'table',
                     'scatterpolargl', 'histogram2d', 'contour',
                     'carpet', 'box', 'violin', 'bar',
                     'contourcarpet', 'area', 'choropleth',
                     'candlestick', 'streamtube', 'parcoords',
                     'heatmap', 'barpolar', 'scattermapbox',
                     'scatter3d', 'pointcloud',
                     'histogram2dcontour', 'scatter', 'scattergeo',
                     'sankey']

        - All remaining properties are passed to the constructor of
          the specified trace type

        (e.g. [{'type': 'scatter', ...}, {'type': 'bar, ...}])