# Simple gender analysis

- ## Imports

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import unicodedata

import os, sys, json, re, argparse, urllib2, html5lib
from bs4 import BeautifulSoup, Tag, UnicodeDammit
import pandas as pd
from pandas.io.json import json_normalize
import time
import math
from parse_script import *
from crawler import *




- ## Crawl

- ### Extract functions

In [2]:
# Read the curated list of each year from 2000 to 2016 and stack them.
# NOTE(review): get_curated is not defined in this notebook (presumably it comes
# from one of the star imports) -- it is assumed to return one DataFrame per year.
path='../data/top100 per year/curated/'
col=['classement','titre','annee','annee_','categorie','realisateur','url','url_script']
# The empty frame comes first so that every expected column exists in the
# result even if the yearly frames lack some of them.
yearly_frames=[pd.DataFrame(columns=col)]
for year in range(2000,2017):
    df=get_curated(year,path)
    df['annee_']=year  # keep track of the yearly list each row came from
    yearly_frames.append(df)
# Concatenate once: calling pd.concat inside the loop copies all previously
# loaded rows again on every iteration.
titles = pd.concat(yearly_frames)

In [3]:
# Clean up the category column.
titles = titles.reset_index(drop=True)
# Remove newlines and spaces. The .str accessor propagates missing values
# instead of raising on a non-string entry.
titles['categorie'] = titles['categorie'].str.replace('\n', '').str.replace(' ', '')
# Split into at most three genres. Each part is fetched by position, so a film
# with fewer than three genres gets NaN in the remaining columns, and the
# assignment no longer depends on iterating the .str accessor (which fails when
# no row has exactly three parts and is not supported by recent pandas).
category_parts = titles['categorie'].str.split(',', n=2)
for position, column in enumerate(['categorie1', 'categorie2', 'categorie3']):
    titles[column] = category_parts.str.get(position)

In [4]:
# Keep only the rows that have a script link, then renumber the rows.
titles = titles.dropna(subset=['url_script']).reset_index(drop=True)
titles

Unnamed: 0,annee,annee_,categorie,classement,durée,realisateur,titre,url,url_script,categorie1,categorie2,categorie3
0,(2000),2000.0,"Adventure,Drama,Romance",2.0,143 min,Robert Zemeckis,Cast Away,http://www.imsdb.com/Movie Scripts/Cast Away S...,http://www.imsdb.com/scripts/Cast-Away.html,Adventure,Drama,Romance
1,(2000),2000.0,"Action,Adventure,Drama",4.0,155 min,Ridley Scott,Gladiator,http://www.imsdb.com/Movie Scripts/Gladiator S...,http://www.imsdb.com/scripts/Gladiator.html,Action,Adventure,Drama
2,(2000),2000.0,"Action,Adventure,Sci-Fi",8.0,104 min,Bryan Singer,X-Men,http://www.imsdb.com/Movie Scripts/X-Men Scrip...,http://www.imsdb.com/scripts/X-Men.html,Action,Adventure,Sci-Fi
3,(2000),2000.0,"Drama,Fantasy,Horror",10.0,130 min,Robert Zemeckis,What Lies Beneath,http://www.imsdb.com/Movie Scripts/What Lies B...,http://www.imsdb.com/scripts/What-Lies-Beneath...,Drama,Fantasy,Horror
4,(2000),2000.0,"Action,Adventure,Fantasy",12.0,120 min,Ang Lee,"Crouching Tiger, Hidden Dragon",http://www.imsdb.com/Movie Scripts/Crouching T...,"http://www.imsdb.com/scripts/Crouching-Tiger,-...",Action,Adventure,Fantasy
5,(2000),2000.0,"Biography,Drama",13.0,131 min,Steven Soderbergh,Erin Brockovich,http://www.imsdb.com/Movie Scripts/Erin Brocko...,http://www.imsdb.com/scripts/Erin-Brockovich.html,Biography,Drama,
6,(2000),2000.0,"Action,Adventure,Comedy",14.0,98 min,McG,Charlie's Angels,http://www.imsdb.com/Movie Scripts/Charlie's A...,http://www.imsdb.com/scripts/Charlie's-Angels....,Action,Adventure,Comedy
7,(2000),2000.0,"Crime,Drama,Thriller",15.0,147 min,Steven Soderbergh,Traffic,http://www.imsdb.com/Movie Scripts/Traffic Scr...,http://www.imsdb.com/scripts/Traffic.html,Crime,Drama,Thriller
8,(2000),2000.0,"Drama,Mystery,Sci-Fi",23.0,106 min,M. Night Shyamalan,Unbreakable,http://www.imsdb.com/Movie Scripts/Unbreakable...,http://www.imsdb.com/scripts/Unbreakable.html,Drama,Mystery,Sci-Fi
9,(2000),2000.0,"Horror,Mystery",27.0,116 min,Wes Craven,Scream 3,http://www.imsdb.com/Movie Scripts/Scream 3 Sc...,http://www.imsdb.com/scripts/Scream-3.html,Horror,Mystery,


In [5]:
# The distinct categories found in the three genre columns:
l = []
for genre_column in (titles.categorie1, titles.categorie2, titles.categorie3):
    l.extend(genre_column)
set(l)

{nan,
 u'Action',
 u'Adventure',
 u'Animation',
 u'Biography',
 u'Comedy',
 u'Crime',
 u'Drama',
 u'Family',
 u'Fantasy',
 u'History',
 u'Horror',
 u'Music',
 u'Musical',
 u'Mystery',
 u'Romance',
 u'Sci-Fi',
 u'Sport',
 u'Thriller',
 u'War',
 u'Western'}

- ### Analysis Functions

In [None]:
try:
    _TEXT_TYPES = (str, unicode)  # Python 2: byte strings and unicode strings
except NameError:
    _TEXT_TYPES = (str,)  # Python 3: the `unicode` builtin no longer exists

# Same pattern as before, written as a raw string so the backslashes are not
# treated as (invalid) string escapes; compiled once instead of on every call.
_PARENTHETICAL = re.compile(r".*?\((.*?)\)")


def clean_character(char):
    """Normalize a raw character cue into a plain character name.

    Steps, in order:
      * non-string input gives None;
      * newlines, '*', 'O/S' and 'V.O.' are removed and runs of spaces are
        collapsed to a single space;
      * any cue containing 'CONTINUED' gives the marker u'CONTINUED';
      * every '(...)' group is removed; a cue that still contains ')' afterwards
        (unbalanced parenthesis) gives None;
      * trailing whitespace is stripped.

    Returns the cleaned name, u'CONTINUED', or None when nothing usable is left.
    A name that is empty after cleaning always gives None (previously a
    whitespace-only remainder such as ' (beat)' came back as '').
    """
    if not isinstance(char, _TEXT_TYPES):
        return None
    char = char.replace('\n','').replace('*','').replace('O/S','').replace('V.O.','')
    char = re.sub(' +', ' ', char)
    if 'CONTINUED' in char:
        return u'CONTINUED'
    if '(' in char or ')' in char:
        for inner in _PARENTHETICAL.findall(char):
            char = char.replace('(%s)' % inner, '')
        if ')' in char:
            # a closing parenthesis without its opening one: cue is not a name
            return None
    char = char.rstrip()  # drop trailing white space
    return char if char else None
    
def continued_speech(char_vector):
    """Replace every 'CONTINUED' marker by the speaker it continues.

    char_vector: iterable of cleaned character names (strings, None, ...).

    Each 'CONTINUED' entry takes the value of the closest earlier entry that is
    truthy and different from 'unknown'. When there is no such entry the marker
    becomes 'unknown'. All other entries are returned unchanged.

    The vector is walked once from the start while remembering the last usable
    speaker. The earlier backward slice `[idx-1::-1]` wrapped around for a
    marker at position 0, so it picked a speaker from the end of the script
    (and never terminated when that entry was itself 'CONTINUED').

    Returns a new list of the same length.
    """
    resolved = []
    last_speaker = None
    for element in char_vector:
        if element == 'CONTINUED':
            element = 'unknown' if last_speaker is None else last_speaker
        elif element and element != 'unknown':
            last_speaker = element
        resolved.append(element)
    return resolved


# to remove outliers
def principal_characters(char_dic):
    """Drop the characters whose count lies 3 standard deviations or more from the mean.

    char_dic: dict mapping a character name to a numeric count.

    The mean and the (population) standard deviation are computed in floating
    point, so integer counts are not truncated by Python 2 integer division.
    Both statistics are printed, as before.

    Returns a new dict with the entries for which |count - mean| < 3 * stddev.
    An empty input gives an empty dict; when every count is identical
    (stddev == 0) nothing is an outlier and all entries are kept.
    """
    if not char_dic:
        return {}
    counts = list(char_dic.values())
    nb_values = float(len(counts))
    mean = sum(counts) / nb_values
    stddev = math.sqrt(sum((count - mean) * (count - mean) for count in counts) / nb_values)
    print(stddev,mean)
    if stddev == 0:
        # strict "<" against 3 * 0 would otherwise reject every entry
        return dict(char_dic)
    return {name: count for name, count in char_dic.items()
            if math.fabs(count - mean) < 3*stddev}


def char_analysis(df):
    """Count the entries per character and guess each character's gender.

    df: mapping/DataFrame with
        'character_clean' -- one cleaned character name (or a falsy value) per entry,
        'text'            -- one string per entry, possibly spanning several lines.

    Gender heuristic: every text line that mentions the character's name
    (case-insensitive) in a form that is not all upper-case opens a window made
    of that line and the next two. Pronoun hits are summed over the window
    (the window stops early after an all upper-case line):
        m = hits of "\\s+he"  + hits of "\\s+him"
        f = hits of "\\s+she" + hits of "\\s+her"
    The label is "?" when m + f == 0, "F" when 2*f - m > 0, otherwise "M".

    NOTE(review): "\\s+he" has no word boundary, so it also fires on " her",
    " here", " help", ...; the 2*f weighting may be compensating for that --
    confirm before tightening the patterns.

    Changes compared with the previous version:
      * the name is escaped before being used as a regular expression, so a
        name containing '(' or '.' no longer raises or matches loosely;
      * Counter is imported locally instead of relying on a star import;
      * the extra "first pronoun" tally was removed: its value never influenced
        the returned labels.

    Returns (d, gender): d maps each truthy character name to its number of
    entries, gender maps the same names to "M", "F" or "?".
    """
    from collections import Counter

    male_res = (re.compile(r"\s+he", re.I), re.compile(r"\s+him", re.I))
    female_res = (re.compile(r"\s+she", re.I), re.compile(r"\s+her", re.I))

    # entries without a usable name (None, '') are ignored
    d = {key: value for key, value in Counter(df['character_clean']).items() if key}
    lines = [line for element in df['text'] for line in element.split('\n')]

    gender = dict()
    for char in sorted(d):
        char_re = re.compile(re.escape(char), re.I)
        m, f = 0, 0
        for i, line in enumerate(lines):
            srch = char_re.search(line)
            # all upper-case mentions are skipped
            if not srch or srch.group().isupper():
                continue
            for nearby in lines[i:i + 3]:
                m += sum(len(rx.findall(nearby)) for rx in male_res)
                f += sum(len(rx.findall(nearby)) for rx in female_res)
                if nearby.isupper():
                    break
        if m + f == 0:
            gender[char] = "?"
        elif 2*f - m > 0:
            gender[char] = "F"
        else:
            gender[char] = "M"

    return d,gender

- ### Script loader and parser

In [None]:
%%time
actions_url = list(titles[(titles['categorie1']=='Action') | (titles['categorie2']=='Action')]['url_script'])
output_actions=[]
nb_of_errors=0
for url in actions_url:
    path='../data/script/actions'
    name=url.replace(':','').replace('/','')
    try:
        test=parse(url,path,name)
        output_actions.append(test)
    except Exception as e:
        print(e)
        nb_of_errors=nb_of_errors+1

Getting script @ http://www.imsdb.com/scripts/Gladiator.html.
      Done parsing script at http://www.imsdb.com/scripts/Gladiator.html in 1.08648705482
-----------------
Found a <pre> inside the <pre>
Getting script @ http://www.imsdb.com/scripts/X-Men.html.
list index out of range
Getting script @ http://www.imsdb.com/scripts/Crouching-Tiger,-Hidden-Dragon.html.

from os import listdir
from os.path import isfile, join
# Names of the regular files (sub-directories excluded) found in `path`.
onlyfiles = []
for f in listdir(path):
    if isfile(join(path, f)):
        onlyfiles.append(f)

In [None]:
# Failures counted so far, against the number of action scripts requested.
error_report = (nb_of_errors, len(actions_url))
print('The number of errors is %s out of a total %s.' % error_report)

In [None]:
#construct character info and add it to dataframe
output_actions_=[]
male_nb_speech=[]
female_nb_speech=[]
# The parsed action scripts live in `output_actions`; the name `output` used
# here before is not defined anywhere in this notebook.
for df in output_actions:
    try:
        test=df['character'].apply(clean_character)
        df['character_clean']=continued_speech(test)
        char,gender=char_analysis(df)
        output_actions_.append(char)
        #create gender column in dataframe (names without a label are kept as is):
        df['gender']=[gender.get(item,item)  for item in df['character_clean']]
        male_nb_speech.append(len(df[df['gender']=='M']))
        female_nb_speech.append(len(df[df['gender']=='F']))
    except Exception as e:
        # a script that cannot be analysed is reported and counted, not fatal
        print('------Caught exception : %s' %e)
        nb_of_errors=nb_of_errors+1

In [None]:
# Failures counted so far (parsing and analysis), against the number of action scripts.
error_report = (nb_of_errors, len(actions_url))
print('The number of errors is %s out of a total %s.' % error_report)

In [None]:
# Rows 105-179 of the second parsed action script (`output` is not defined in
# this notebook; the parsed frames are in `output_actions`).
output_actions[1][105:180]

In [None]:
# Totals over all analysed films of the entries labelled 'M' and 'F'.
m=sum(male_nb_speech)
f=sum(female_nb_speech)

# Data to plot
labels = ["M","F"]
sizes = [m, f]
colors = ['lightskyblue', 'pink']
fig, ax = plt.subplots(figsize=(8,8))
ax.pie(sizes, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=140)
ax.set_title("Repartition des repliques dans les films d'actions")
ax.axis('equal')
plt.show()

In [None]:
i=1
# The parsed action scripts are in `output_actions` (`output` is not defined in
# this notebook).
film = output_actions[i]
# Dedicated names: assigning these ints to male_nb_speech / female_nb_speech
# would replace the per-film lists that the summary cell sums.
male_count=len(film[film['gender']=='M'])
female_count=len(film[film['gender']=='F'])

# Data to plot
labels = ["M","F"]
sizes = [male_count, female_count]
colors = ['lightskyblue', 'pink']
plt.figure(figsize=(8,8))
plt.pie(sizes, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
# the first line of the first text entry is used as the film's name
plt.title('Repartition des repliques dans %s' %film['text'][0].split('\n')[0])
plt.axis('equal')
plt.show()

In [None]:
i=10
# The parsed action scripts are in `output_actions` (`output` is not defined in
# this notebook).
film = output_actions[i]
# Dedicated names: assigning these ints to male_nb_speech / female_nb_speech
# would replace the per-film lists that the summary cell sums.
male_count=len(film[film['gender']=='M'])
female_count=len(film[film['gender']=='F'])

# Data to plot
labels = ["M","F"]
sizes = [male_count, female_count]
colors = ['lightskyblue', 'pink']
plt.figure(figsize=(8,8))
plt.pie(sizes, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
# the first line of the first text entry is used as the film's name
plt.title('Repartition des repliques dans %s' %film['text'][0].split('\n')[0])
plt.axis('equal')
plt.show()

In [None]:
%%time
romances_url = list(titles[(titles['categorie1']=='Romance') | (titles['categorie2']=='Romance')]['url_script'])
output_romances=[]
nb_of_errors=0
for url in romances_url:
    path='../data/script/romances'
    name=url.replace(':','').replace('/','')
    try:
        test=parse(url,path,name)
        output_romances.append(test)
    except Exception as e:
        print(e)
        nb_of_errors=nb_of_errors+1

In [None]:
# nb_of_errors was reset by the romance loader, so the total must be the number
# of romance scripts (it was compared with the number of action scripts).
print('The number of errors is %s out of a total %s.' %(nb_of_errors,len(romances_url)))

In [None]:
#construct character info and add it to dataframe
output_romances_=[]
male_nb_speech=[]
female_nb_speech=[]
# The parsed romance scripts live in `output_romances`; the name `output` used
# here before is not defined anywhere in this notebook.
for df in output_romances:
    try:
        test=df['character'].apply(clean_character)
        df['character_clean']=continued_speech(test)
        char,gender=char_analysis(df)
        output_romances_.append(char)
        #create gender column in dataframe (names without a label are kept as is):
        df['gender']=[gender.get(item,item)  for item in df['character_clean']]
        male_nb_speech.append(len(df[df['gender']=='M']))
        female_nb_speech.append(len(df[df['gender']=='F']))
    except Exception as e:
        # a script that cannot be analysed is reported and counted, not fatal
        print('------Caught exception : %s' %e)
        nb_of_errors=nb_of_errors+1

In [None]:
# Romance section: the total is the number of romance scripts (it was compared
# with the number of action scripts).
print('The number of errors is %s out of a total %s.' %(nb_of_errors,len(romances_url)))

In [None]:
# Totals over all analysed films of the entries labelled 'M' and 'F'.
m=sum(male_nb_speech)
f=sum(female_nb_speech)

# Data to plot
labels = ["M","F"]
sizes = [m, f]
colors = ['lightskyblue', 'pink']
fig, ax = plt.subplots(figsize=(8,8))
ax.pie(sizes, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=140)
ax.set_title("Repartition des repliques dans les films romantiques")
ax.axis('equal')
plt.show()

In [None]:
def find(df,to_find):
    """Return how many entries of df['text'] contain `to_find` once normalized.

    Each entry is passed through normalize_text before the substring test;
    `to_find` itself is used as given.
    """
    return sum(1 for element in df['text'] if to_find in normalize_text(element))

def find_female_in(df,to_find):
    """Count the text entries mentioning `to_find`, and those that also mention a woman.

    df: mapping/DataFrame with a 'text' column and, when available, the
        'gender' and 'character_clean' columns.
    to_find: substring looked up in the normalized text of each entry.

    An entry counts as "female" when its normalized text contains, as a whole
    word, "she", "her" or the normalized name of a character labelled 'F'.
    Matching is done on the normalized text: the lower-cased names were
    previously compared with the raw text and could therefore never match a
    capitalised name, while the plain substring test made "there" or "washed"
    count as "her" / "she".

    When the female names cannot be built (e.g. the 'gender' column is missing)
    only the pronouns are used; in that best-effort mode an error while scanning
    is reported with a warning and (0, 0) is returned. Otherwise errors
    propagate to the caller.

    Returns (count_female, count_total).
    """
    try:
        female_names = set(normalize_text(name)
                           for name in df[df['gender']=='F']['character_clean'])
        best_effort = False
    except Exception:
        female_names = set()
        best_effort = True

    indicators = ['she', 'her'] + sorted(name for name in female_names if name)
    # (?<!\w) / (?!\w) act as word boundaries that also work for names that
    # start or end with punctuation.
    female_re = re.compile(r'(?<!\w)(?:%s)(?!\w)'
                           % '|'.join(re.escape(indicator) for indicator in indicators))

    count_female = 0
    count_total = 0
    try:
        for element in df['text']:
            text = normalize_text(element)
            if to_find in text:
                count_total += 1
                if female_re.search(text):
                    count_female += 1
    except Exception as ex:
        if not best_effort:
            raise
        print('Warning : %s.' %(ex))
        return 0,0
    return count_female,count_total

def find_any(df,l):
    """Print "Found <room>" for every entry of `l` contained in an entry of df['text'].

    The test is made on the normalized text of each entry; one line is printed
    per (entry, room) hit. Nothing is returned.
    """
    for element in df['text']:
        hits = [room for room in l if room in normalize_text(element)]
        for room in hits:
            print("Found %s" %room)
    return

def normalize_text(text):
    """Return `text` lower-cased and reduced to plain ASCII, as a native string.

    The text is lower-cased and decomposed with NFKD; every character that
    cannot be encoded as ASCII (e.g. the combining accents produced by the
    decomposition) is dropped.

    Byte strings are decoded as UTF-8 first, because unicodedata.normalize only
    accepts unicode text. The result is always the interpreter's native `str`:
    on Python 3 the encoded bytes are decoded back, so that callers can keep
    writing `'kitchen' in normalize_text(...)`.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    normal = unicodedata.normalize('NFKD', text.lower()).encode('ASCII', 'ignore')
    if not isinstance(normal, str):
        # Python 3: bytes -> str (Python 2 byte strings already are `str`)
        normal = normal.decode('ASCII')
    return normal

In [None]:
# Room keywords, each spelled in several case variants.
l=[
    'KITCHEN','Kitchen','kitchen',
    'bed','BED','BEDROOM','Bed','Bedroom','bedroom',
    'BATH','bath','Bath','BATHROOM','Bathroom','bathroom',
    'toilet','TOILET','Toilet',
]
# The same keywords in lower case only.
l_norm=[
    'kitchen',
    'bed',
    'bedroom',
    'bath',
    'bathroom',
    'toilet',
]

In [None]:
# Number of text entries mentioning 'kitchen', summed over the films of each genre.
kitchen_count_action = sum(find(df,'kitchen') for df in output_actions)
kitchen_count_romance = sum(find(df,'kitchen') for df in output_romances)

In [None]:
# Data to plot: kitchen mentions in action films vs romance films
labels = ["in actions","in romance"]
sizes = [kitchen_count_action, kitchen_count_romance]
colors = ['lightskyblue', 'pink']
fig, ax = plt.subplots(figsize=(8,8))
ax.pie(sizes, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=140)
ax.set_title("Repartition de la presence des cuisines dans les films actions-romances")
ax.axis('equal')
plt.show()

# the raw counts behind the chart
print('Actions: %s de présence cuisines' %(kitchen_count_action))
print('Romances: %s  de présence cuisines' %(kitchen_count_romance))

In [None]:
# Per genre: entries mentioning 'kitchen' (total) and those that also mention a
# woman (_f).
kitchen_count_action_f=0
kitchen_count_action =0
kitchen_count_romance_f=0
kitchen_count_romance = 0
for df in output_actions:
    # one scan per film: find_female_in returns (female count, total count)
    female_hits, total_hits = find_female_in(df,'kitchen')
    kitchen_count_action_f += female_hits
    kitchen_count_action += total_hits
for df in output_romances:
    female_hits, total_hits = find_female_in(df,'kitchen')
    kitchen_count_romance_f += female_hits
    kitchen_count_romance += total_hits

In [None]:
# Data to plot: kitchen entries that also mention a woman, per genre
labels = ["in actions","in romance"]
sizes = [kitchen_count_action_f, kitchen_count_romance_f]
colors = ['lightskyblue', 'pink']
fig, ax = plt.subplots(figsize=(8,8))
ax.pie(sizes, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=140)
ax.set_title("Repartition de la presence des cuisines dans les films actions-romances")
ax.axis('equal')
plt.show()

# the raw counts behind the chart
print('Actions: %s femmes & cuisines' %(kitchen_count_action_f))
print('Romances: %s femmes & cuisines' %(kitchen_count_romance_f))

In [None]:
# Data to plot: in action films, kitchen entries mentioning a woman vs the others
labels = ["other","women"]
sizes = [kitchen_count_action - kitchen_count_action_f, kitchen_count_action_f]
colors = ['lightskyblue', 'pink']
plt.figure(figsize=(8,8))
plt.pie(sizes, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
# title typo fixed: "cuisne" -> "cuisine"
plt.title("Repartition H/F & cuisine dans les films d'actions")
plt.axis('equal')
plt.show()

print(' %s femmes & cuisines' %(kitchen_count_action_f))
print('%s autres & cuisines' %(kitchen_count_action- kitchen_count_action_f))

In [None]:
# Data to plot: in romance films, kitchen entries mentioning a woman vs the others
labels = ["other","women"]
sizes = [kitchen_count_romance - kitchen_count_romance_f, kitchen_count_romance_f]
colors = ['lightskyblue', 'pink']
plt.figure(figsize=(8,8))
plt.pie(sizes, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
# This chart shows the romance counts: the title and the printed figures used
# to refer to the action films.
plt.title("Repartition H/F & cuisine dans les films romantiques")
plt.axis('equal')
plt.show()

print(' %s femmes & cuisines' %(kitchen_count_romance_f))
print('%s autres & cuisines' %(kitchen_count_romance- kitchen_count_romance_f))