In [160]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from pathlib import Path
import math

In [161]:
pd.set_option("display.max_rows", 150, "display.max_columns", 100)

In [162]:
# dialogue data for each character
dialogpath = Path('project_data/movie_dialogue.tsv')
dialogue = pd.read_csv(dialogpath, sep = '\t')

dialogue.nunique()

mid           600
cid          2969
cname        1925
mname         600
gender          4
wordcount    1428
year           74
genres        283
comedy          2
thriller        2
drama           2
romance         2
lines        2969
dtype: int64

In [163]:
# movie writer gender data
genderpath = Path('project_data/dialogue_creators_gender_with_counts.csv')
genderdf = pd.read_csv(genderpath, encoding = 'utf-8')
genderdf

Unnamed: 0.1,Unnamed: 0,mid,imdb_id,writer_ids,writer_names,writer_gender,num_of_female_writers,total_num_of_writers
0,0,m536,tt0057012,"['nm0000040', 'nm0816143', 'nm0313570']","['Stanley Kubrick', 'Terry Southern', 'Peter G...","['M', 'M', 'M']",0,3
1,1,m373,tt0119217,"['nm0000354', 'nm0000255']","['Matt Damon', 'Ben Affleck']","['M', 'M']",0,2
2,2,m126,tt0181689,"['nm0001140', 'nm0291082', 'nm0169509']","['Philip K. Dick', 'Scott Frank', 'Jon Cohen']","['M', 'M', 'M']",0,3
3,3,m415,tt0067309,"['nm0506920', 'nm5557134']","['Andy Lewis', 'David E. Lewis']","['M', 'M']",0,2
4,4,m140,tt0027996,"['nm0728307', 'nm0445502']","['Robert Riskin', 'Clarence Budington Kelland']","['M', 'M']",0,2
...,...,...,...,...,...,...,...,...
590,590,m138,tt0177023,['nm0681914'],['Rex Pickett'],['M'],0,1
591,591,m242,tt0118617,"['nm0310319', 'nm0333949', 'nm0879318', 'nm092...","['Susan Gauthier', 'Bruce Graham', 'Bob Tzudik...","['F', 'M', 'M', 'F', 'M']",2,5
592,592,m177,tt0083067,"['nm0639782', 'nm0788940']","[""Richard O'Brien"", 'Jim Sharman']","['M', 'M']",0,2
593,593,m153,tt0190590,"['nm0392955', 'nm0001053', 'nm0001054']","['Homer', 'Ethan Coen', 'Joel Coen']","['M', 'M', 'M']",0,3


In [164]:
len(dialogue)

2969

In [165]:
# drop characters from movies with no imdb id -- 
mids_no_imdb_id = ['m605', 'm457', 'm430', 'm310', 'm488']
# dialogue_no_missing_ids = dialogue.loc[mids_no_imdb_id.count(dialogue['mid']) == 0]
dialogue_no_missing_ids = dialogue[dialogue['mid'] != 'm605']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm457']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm430']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm310']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm488']

print(len(dialogue_no_missing_ids)) # 2953
dialogue_no_missing_ids.nunique() # should be 595 unique mids

2953


mid           595
cid          2953
cname        1919
mname         595
gender          4
wordcount    1424
year           74
genres        280
comedy          2
thriller        2
drama           2
romance         2
lines        2953
dtype: int64

In [166]:
dialogue_no_missing_ids = dialogue_no_missing_ids.reset_index()

In [167]:
dialogue_no_missing_ids.gender.unique()

array(['f', 'm', 'M', 'F'], dtype=object)

In [168]:
# make all gender labels lower case
dialogue_no_missing_ids.gender = dialogue_no_missing_ids.gender.str.lower()

In [169]:
dialogue_no_missing_ids.head(5)

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines
0,0,m0,u0,BIANCA,10 things i hate about you,f,959,1999,"['comedy', 'romance']",True,False,False,True,They do not! / I hope so. / Let's go. / Okay -...
1,1,m0,u2,CAMERON,10 things i hate about you,m,527,1999,"['comedy', 'romance']",True,False,False,True,"They do to! / She okay? / Wow / No / The ""real..."
2,2,m0,u4,JOEY,10 things i hate about you,m,278,1999,"['comedy', 'romance']",True,False,False,True,"Listen, I want to talk to you about the prom. ..."
3,3,m0,u5,KAT,10 things i hate about you,f,1217,1999,"['comedy', 'romance']",True,False,False,True,Perm? / It's just you. / What? To completely d...
4,4,m0,u6,MANDELLA,10 things i hate about you,f,157,1999,"['comedy', 'romance']",True,False,False,True,William - he asked me to meet him here. / Have...


In [170]:
# add cols for writer data
dialogue_no_missing_ids['num_of_female_writers'] = ""
dialogue_no_missing_ids['total_writers'] = ""

In [171]:
# map movie id for each char to num of female writers and total num of writers 
for i, row in dialogue_no_missing_ids.iterrows():
    mid = dialogue_no_missing_ids.at[i, 'mid']
    match = genderdf[genderdf['mid'] == mid]
    f_num = match.iloc[0, 6] # grabs the num_of_female_writers col value
    total = match.iloc[0, 7] # grabs the total_num_of_writers col value
    dialogue_no_missing_ids.at[i, 'num_of_female_writers'] = f_num
    dialogue_no_missing_ids.at[i, 'total_writers'] = total

In [172]:
# create doc term matrix for each char dialogue
vectorizer = CountVectorizer(max_features = 5000, stop_words = ['the', 'of', 'and'])

In [173]:
sparse_wordcounts = vectorizer.fit_transform(dialogue_no_missing_ids.lines)
wordcounts = sparse_wordcounts.toarray()
dialoguewords = pd.DataFrame(wordcounts, columns = vectorizer.get_feature_names())
dialoguewords.head()
len(dialoguewords)

2953

In [174]:
dialoguewords

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wound,wounded,wounds,wow,wrap,wrapped,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,50,0,0,4,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,3,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,69,0,0,8,0,3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,17,0,0,2,1,0,0,0,0,0,0,0,0
2949,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,0,0,0,32,0,0,6,0,0,0,0,0,0,0,0,0
2950,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,7,0,0,0,0,0,4,0,0,1,0,0,55,0,0,8,0,0,0,0,0,0,0,0,0
2951,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,0,5,0,0,0,0,0,53,0,0,10,0,0,0,0,0,0,0,0,0


In [175]:
# calculate word freq from word counts and factor out dialogue length
dwordcount = dialogue_no_missing_ids['wordcount']

In [176]:
# gives us relative word freq for each char
wordfreqs = dialoguewords.divide(dwordcount, axis = 'rows')
len(wordfreqs)

2953

In [177]:
wordfreqs['#dwordcount'] = dwordcount
wordfreqs.head(20)

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wounded,wounds,wow,wrap,wrapped,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo,#dwordcount
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001043,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052138,0.0,0.0,0.004171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,959
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045541,0.0,0.0,0.005693,0.0,0.001898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,527
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003597,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003597,0.0,0.0,0.0,0.0,0.0,0.0,0.007194,0.003597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057554,0.0,0.0,0.007194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002465,0.000822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056697,0.0,0.0,0.006574,0.0,0.002465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1217
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012739,0.006369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,157
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050955,0.0,0.0,0.007962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,628
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003895,0.000974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000974,0.0,0.0,0.0,0.0,0.0,0.0,0.003895,0.001947,0.0,0.0,0.0,0.0,0.0,0.0,0.000974,0.0,0.0,0.0,0.0,0.0,0.064265,0.0,0.0,0.009737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1027
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002732,0.0,0.0,0.008197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065574,0.0,0.0,0.013661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,366
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000789,0.0,0.003155,0.0,0.0,0.0,0.0,0.0,0.0,0.001577,0.0,0.0,0.0,0.001577,0.0,0.0,0.0,0.0,0.0,0.0,0.000789,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001577,0.0,0.0,0.0,0.0,0.0,0.005521,0.0,0.0,0.000789,0.0,0.0,0.039432,0.0,0.0,0.009464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1268
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101942,0.0,0.0,0.009709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206


In [178]:
# shuffle chars -- in order to draw random sample from
wordfreqs = wordfreqs.sample(frac = 1)
wordfreqs.head()

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wounded,wounds,wow,wrap,wrapped,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo,#dwordcount
999,0.0,0.0,0.0,0.001767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001767,0.0,0.0,0.001767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001767,0.0,0.0,0.0,0.0,0.0,0.0,0.001767,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001767,0.0,0.0,0.001767,0.0,0.0,0.047703,0.001767,0.0,0.012367,0.0,0.003534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,566
337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002217,0.0,0.0,0.0,0.0,0.0,0.022173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,451
1148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001139,0.003417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001139,0.0,0.0,0.0,0.035308,0.0,0.0,0.004556,0.0,0.001139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,878
1202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003185,0.0,0.0,0.003185,0.0,0.0,0.076433,0.0,0.0,0.009554,0.0,0.003185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,314
137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002404,0.0,0.0,0.0,0.038462,0.0,0.0,0.002404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,416


In [179]:
# so we can map from the tweet in the word matrix to the tweet in the original tweets df
reorderedchars = dialogue_no_missing_ids.loc[wordfreqs.index, : ]
reorderedchars.head()

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
999,999,m28,u456,CAPTAIN GALGENSTEIN,barry lyndon,m,566,1975,"['drama', 'romance', 'war']",False,False,True,True,Oh! That will be sent after him. I have a fa...,0,2
337,337,m158,u2460,ELLIE,pet sematary,f,451,1989,"['drama', 'fantasy', 'horror', 'mystery', 'thr...",False,True,True,False,"He can if He wants to. He can do anything, jus...",0,1
1148,1148,m304,u4601,DRUMLIN,contact,m,878,1997,"['drama', 'mystery', 'sci-fi', 'thriller']",False,True,True,False,-- It's extremely unlikely that they had any i...,1,4
1202,1205,m315,u4746,SCHREBER,dark city,m,314,1998,"['mystery', 'sci-fi', 'thriller']",False,True,False,False,I'm afraid so. / You wouldn't believe me if I ...,0,3
137,137,m123,u1868,ALICE,lost highway,f,416,1997,"['drama', 'horror', 'mystery', 'thriller']",False,True,True,False,Look at all this shit... I know a fence... he'...,0,2


In [180]:
# separate test and training data -- about 70 / 30 split

# test set
testfreqs = wordfreqs.iloc[0: 1476, : ]
test_y = (reorderedchars['gender'][0: 1476] == 'm').astype(int)  
test_y[0:10]

999     1
337     0
1148    1
1202    1
137     0
2117    0
1946    1
1445    0
2365    1
1476    1
Name: gender, dtype: int64

In [181]:
# all remaining rows as the training set  
trainfreqs = wordfreqs.iloc[1476 : , : ]
train_y = (reorderedchars['gender'][1476 : ] == 'm').astype(int) 
train_y[0:10]

2002    1
2838    1
254     1
48      1
1210    0
2587    0
2636    1
1014    1
670     1
616     0
Name: gender, dtype: int64

In [182]:
# scale X matrices for training data
# normalizes cols -- col = (col-col.mean) / col.std (like a z score)
trainscaler = StandardScaler()
trainXscaled = trainscaler.fit_transform(trainfreqs)
trainXscaled = pd.DataFrame(trainXscaled, columns = trainfreqs.columns)
trainXscaled.head()

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wounded,wounds,wow,wrap,wrapped,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo,#dwordcount
0,-0.067186,-0.145051,-0.107521,-0.115804,-0.08866,-0.100633,-0.070565,-0.085162,-0.071346,-0.077905,-0.103954,-0.083975,-0.065362,-0.083657,-0.067803,-0.121914,-0.070995,-0.071034,-0.084484,-0.069119,-0.071037,-0.074425,-0.048546,-0.07169,-0.063665,-0.096765,-0.282432,-0.107817,-0.065581,-1.181957,-0.148609,-0.08537,-0.215782,-0.075351,22.866284,-0.08222,-0.159933,-0.076319,-0.070463,-0.206491,-0.129747,-0.140887,-0.0809,-0.087949,-0.049009,-0.060196,-0.200943,-0.266918,-0.072474,-0.116308,...,-0.064489,-0.066821,-0.127321,-0.078419,-0.085432,-0.087264,-0.2481,-0.125052,-0.088688,-0.141629,-0.134221,-0.46429,-0.183826,-0.044303,-0.036821,-0.177494,-0.042703,-0.044925,-0.067886,-0.10797,-0.097354,-0.048243,4.050938,-0.331438,-0.529053,-0.067749,-0.070906,-0.069311,-0.08271,-0.116485,-0.548912,-0.046767,-0.210956,-0.358032,-0.093586,-0.191631,-1.105324,-0.258056,-0.093391,-1.118794,-0.320188,-0.4587,-0.077016,-0.09241,-0.042415,-0.081721,-0.100903,-0.060288,-0.082563,-0.797045
1,-0.067186,-0.145051,-0.107521,-0.115804,-0.08866,-0.100633,-0.070565,-0.085162,-0.071346,-0.077905,-0.103954,-0.083975,-0.065362,-0.083657,-0.067803,-0.121914,-0.070995,-0.071034,-0.084484,-0.069119,-0.071037,-0.074425,-0.048546,-0.07169,-0.063665,-0.096765,-0.282432,-0.107817,-0.065581,-1.181957,-0.148609,-0.08537,-0.215782,-0.075351,-0.07461,-0.08222,-0.159933,-0.076319,-0.070463,-0.206491,-0.129747,-0.140887,-0.0809,-0.087949,-0.049009,-0.060196,-0.200943,-0.266918,-0.072474,-0.116308,...,-0.064489,-0.066821,-0.127321,-0.078419,-0.085432,-0.087264,-0.2481,-0.125052,-0.088688,-0.141629,-0.134221,-0.46429,-0.183826,-0.044303,-0.036821,-0.177494,-0.042703,-0.044925,-0.067886,-0.10797,-0.097354,-0.048243,-0.5424,-0.331438,-0.529053,-0.067749,-0.070906,-0.069311,-0.08271,-0.116485,-0.548912,-0.046767,-0.210956,-0.358032,-0.093586,-0.191631,1.02279,-0.258056,-0.093391,0.284505,-0.320188,-0.4587,-0.077016,-0.09241,-0.042415,-0.081721,-0.100903,-0.060288,-0.082563,-0.914995
2,-0.067186,-0.145051,-0.107521,-0.115804,-0.08866,-0.100633,-0.070565,-0.085162,-0.071346,18.419323,-0.103954,-0.083975,-0.065362,-0.083657,-0.067803,-0.121914,-0.070995,-0.071034,-0.084484,-0.069119,-0.071037,-0.074425,-0.048546,-0.07169,-0.063665,-0.096765,-0.282432,-0.107817,-0.065581,0.562753,-0.148609,-0.08537,-0.215782,-0.075351,-0.07461,-0.08222,-0.159933,-0.076319,-0.070463,-0.206491,-0.129747,-0.140887,-0.0809,-0.087949,-0.049009,-0.060196,-0.200943,-0.266918,-0.072474,-0.116308,...,-0.064489,-0.066821,-0.127321,-0.078419,-0.085432,-0.087264,-0.2481,-0.125052,-0.088688,-0.141629,-0.134221,1.945017,-0.183826,-0.044303,-0.036821,-0.177494,-0.042703,-0.044925,-0.067886,-0.10797,-0.097354,-0.048243,-0.5424,-0.331438,1.763314,-0.067749,-0.070906,-0.069311,-0.08271,-0.116485,-0.548912,-0.046767,-0.210956,-0.358032,19.122446,-0.191631,-0.868802,-0.258056,-0.093391,2.656857,-0.320188,-0.4587,-0.077016,-0.09241,-0.042415,-0.081721,-0.100903,-0.060288,-0.082563,-0.680337
3,-0.067186,-0.145051,-0.107521,-0.115804,-0.08866,-0.100633,-0.070565,-0.085162,-0.071346,-0.077905,-0.103954,-0.083975,-0.065362,-0.083657,-0.067803,-0.121914,-0.070995,-0.071034,-0.084484,-0.069119,-0.071037,-0.074425,-0.048546,-0.07169,-0.063665,-0.096765,-0.282432,-0.107817,-0.065581,-1.181957,-0.148609,-0.08537,-0.215782,-0.075351,-0.07461,-0.08222,-0.159933,-0.076319,-0.070463,-0.206491,-0.129747,-0.140887,-0.0809,-0.087949,-0.049009,-0.060196,-0.200943,-0.266918,-0.072474,-0.116308,...,-0.064489,-0.066821,-0.127321,-0.078419,-0.085432,-0.087264,-0.2481,-0.125052,-0.088688,-0.141629,-0.134221,-0.46429,-0.183826,-0.044303,-0.036821,1.547816,-0.042703,-0.044925,-0.067886,-0.10797,-0.097354,-0.048243,-0.5424,-0.331438,2.790554,-0.067749,-0.070906,-0.069311,-0.08271,-0.116485,-0.548912,-0.046767,-0.210956,-0.358032,-0.093586,-0.191631,0.168388,-0.258056,-0.093391,-1.118794,-0.320188,-0.4587,-0.077016,-0.09241,-0.042415,-0.081721,-0.100903,-0.060288,-0.082563,-0.798286
4,-0.067186,-0.145051,-0.107521,-0.115804,-0.08866,-0.100633,-0.070565,-0.085162,-0.071346,-0.077905,-0.103954,-0.083975,-0.065362,-0.083657,-0.067803,-0.121914,-0.070995,-0.071034,-0.084484,-0.069119,-0.071037,-0.074425,-0.048546,-0.07169,-0.063665,-0.096765,-0.282432,-0.107817,-0.065581,-1.181957,-0.148609,-0.08537,-0.215782,-0.075351,-0.07461,-0.08222,-0.159933,-0.076319,-0.070463,-0.206491,-0.129747,-0.140887,-0.0809,-0.087949,-0.049009,-0.060196,-0.200943,-0.266918,-0.072474,-0.116308,...,-0.064489,-0.066821,-0.127321,-0.078419,-0.085432,-0.087264,-0.2481,-0.125052,-0.088688,-0.141629,-0.134221,-0.46429,-0.183826,-0.044303,-0.036821,-0.177494,-0.042703,-0.044925,-0.067886,-0.10797,-0.097354,-0.048243,-0.5424,-0.331438,1.565461,-0.067749,-0.070906,-0.069311,-0.08271,-0.116485,-0.548912,-0.046767,-0.210956,2.488111,-0.093586,-0.191631,-2.298421,-0.258056,-0.093391,-0.625968,-0.320188,-0.4587,-0.077016,-0.09241,-0.042415,-0.081721,-0.100903,-0.060288,-0.082563,-0.644331


In [183]:
# scale X matrices for testing data
testscaler = StandardScaler()
testXscaled = testscaler.fit_transform(testfreqs)
testXscaled = pd.DataFrame(testXscaled, columns = testfreqs.columns)

In [184]:
# simple model
# train model
logist = LogisticRegression(C = .1, max_iter = 1000) 
logist.fit(trainXscaled, train_y)

# test model
predictions = logist.predict(testXscaled)
sum(predictions == test_y) / len(predictions)

0.7222222222222222

In [185]:
# more accurate refelction of performance of model
f1_score(test_y, predictions)

0.8082319925163705

In [186]:
# balance of classes -- m vs. f
dialogue_no_missing_ids.gender.value_counts()

m    2011
f     942
Name: gender, dtype: int64

In [187]:
# handling imbalanced classes
# weight classes inversely to frequency -- female char error counts more

# train
logist = LogisticRegression(C = .1, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
f1_score(test_y, predictions)

0.7982750359367513

In [188]:
# cross validation -- select best C parameter using training data
for c_param in [.0001, .001, .01, .1, 1, 10, 100, 1000]:
    logist = LogisticRegression(C = c_param, max_iter = 1000, class_weight = 'balanced') 
    results = cross_validate(logist, trainXscaled, train_y, cv = 5, scoring = 'f1')
    print('C parameter:', c_param)
    print('Mean f1:', np.mean(results['test_score']))
    print()

C parameter: 0.0001
Mean f1: 0.8166109139759941

C parameter: 0.001
Mean f1: 0.8148892257121627

C parameter: 0.01
Mean f1: 0.8192397871428533

C parameter: 0.1
Mean f1: 0.815443299246926

C parameter: 1
Mean f1: 0.8106603892036659

C parameter: 10
Mean f1: 0.8114512832159209

C parameter: 100
Mean f1: 0.8119875181779944

C parameter: 1000
Mean f1: 0.8078501378962833



In [189]:
# use best C param to train whole training set
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
accuracy = f1_score(test_y, predictions)
print("Accuracy for entire training set is: ", accuracy)

Accuracy for entire training set is:  0.8038461538461539


In [190]:
# need to split test data (first 2000 rows) -- into groups by writer gender (at least 1 female, no female writers)

reorderedchars_first2000 = reorderedchars.iloc[0:1476, :] # test data rows

reorderedchars_first2000_f = reorderedchars_first2000[reorderedchars_first2000['num_of_female_writers'] >= 1]

reorderedchars_first2000_no_f = reorderedchars_first2000[reorderedchars_first2000['num_of_female_writers'] == 0]
reorderedchars_first2000_f

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
1148,1148,m304,u4601,DRUMLIN,contact,m,878,1997,"['drama', 'mystery', 'sci-fi', 'thriller']",False,True,True,False,-- It's extremely unlikely that they had any i...,1,4
1932,1937,m451,u6770,SHELLY,my girl,f,1274,1991,"['comedy', 'drama', 'family', 'romance']",True,False,True,True,"Ex...ex, ex... / This is Harry, Phil, Grammoo ...",1,1
1151,1151,m304,u4605,JOSS,contact,m,1082,1997,"['drama', 'mystery', 'sci-fi', 'thriller']",False,True,True,False,Meaning... / Does that mean you think it doesn...,1,4
2454,2462,m554,u8162,D'AGOSTA,the relic,m,696,1997,"['horror', 'mystery', 'thriller']",False,True,False,False,There's some kind of animal loose in the museu...,2,6
1628,1631,m399,u6010,ARMAND,interview with the vampire: the vampire chroni...,m,569,1994,"['drama', 'fantasy']",False,False,True,False,"We are a pair, and that's what counts. / Your ...",1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064,1064,m290,u4383,LANE,catwoman,m,690,2004,"['action', 'crime', 'fantasy']",False,False,False,False,What little boy doesn't? / But then we knew th...,1,5
2363,2371,m538,u7952,LUCY,sugar & spice,f,266,2001,"['comedy', 'crime']",True,False,False,False,Better enjoy it now. 'Cause tomorrow some lit...,1,1
1316,1319,m336,u5074,ED,erin brockovich,m,1909,2000,"['biography', 'drama', 'romance']",False,False,True,True,"Tell you what, why don't you go on over to rec...",1,1
1527,1530,m379,u5722,ANNIE,halloween,f,505,1978,"['horror', 'thriller']",False,True,False,False,"He shouts, too. / Huh? / You blame everything ...",1,3


In [191]:
# test accuracy on chars from movies w/ at least 1 female writer

# separate out test set rows w/ at least 1 female writer
testfreqs_female = wordfreqs.loc[reorderedchars_first2000_f.index, :]
test_y_f = (reorderedchars_first2000_f['gender'][:] == 'm').astype(int)

# scale test data w/ atleast 1 female writer
testscaler = StandardScaler()
testXscaled_f = testscaler.fit_transform(testfreqs_female)
testXscaled_f = pd.DataFrame(testXscaled_f, columns = testfreqs_female.columns)

In [192]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_f)
accuracy = f1_score(test_y_f, predictions)
print("Accuracy for female writers is: ", accuracy)

Accuracy for female writers is:  0.7290969899665551


In [193]:
# test accuracy on chars from movies w/ no female writers

# separate out test set rows w/ at least 1 female writer
testfreqs_no_female = wordfreqs.loc[reorderedchars_first2000_no_f.index, :]
test_y_no_f = (reorderedchars_first2000_no_f['gender'][:] == 'm').astype(int)

# scale test data w/ atleast 1 female writer
testscaler = StandardScaler()
testXscaled_no_f = testscaler.fit_transform(testfreqs_no_female)
testXscaled_no_f = pd.DataFrame(testfreqs_no_female, columns = testfreqs_no_female.columns)

In [194]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_no_f)
accuracy = f1_score(test_y_no_f, predictions)
print("Accuracy for no female writers is: ", accuracy)

Accuracy for no female writers is:  0.8287864534336783


In [195]:
# classifier model predictive features
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)
coefficients = [x for x in zip(logist.coef_[0], vectorizer.get_feature_names())]

In [196]:
coefficients.sort()

In [199]:
coefficients[0:25] # female

[(-0.1162240776320299, 'he'),
 (-0.10188375101684477, 'oh'),
 (-0.09513809813696171, 'silly'),
 (-0.08173087801310798, 'jus'),
 (-0.0790178799057757, 'wonderful'),
 (-0.07627812111060284, 'so'),
 (-0.07515343676926366, 'peter'),
 (-0.07295995619597638, 'answering'),
 (-0.07225278635620691, 'god'),
 (-0.06642558050346796, 'never'),
 (-0.06343065708824705, 'are'),
 (-0.06200063302595427, 'mmm'),
 (-0.06117247822164393, 'him'),
 (-0.06081040838952122, 'bruce'),
 (-0.06077450073779504, 'love'),
 (-0.06055047147943528, 'pregnant'),
 (-0.06036293357376934, 'darling'),
 (-0.06020661490593351, 'someone'),
 (-0.05949973382469136, 'cute'),
 (-0.05943628794360858, 'bishop'),
 (-0.059063892380907486, 'because'),
 (-0.05825666245193036, 'simone'),
 (-0.05796282993856054, 'said'),
 (-0.05769109285970823, 'pg'),
 (-0.057425320237477054, 'you')]

In [200]:
coefficients[-25:] # male

[(0.05179489184334232, 'attacks'),
 (0.052642216281434215, 'running'),
 (0.05286478701053098, 'listen'),
 (0.05288771414238553, 'minor'),
 (0.052996652798326, 'shake'),
 (0.05327006331853449, 'honor'),
 (0.053698559279291264, 'buildings'),
 (0.05518711016011466, 'lost'),
 (0.05564814499208171, 'wife'),
 (0.05574404858636495, 'fought'),
 (0.05645008020752846, 'drives'),
 (0.0565081229837025, 'uh'),
 (0.056702661685025794, 'machine'),
 (0.057093239462933855, 'opened'),
 (0.05752292264751577, 'shit'),
 (0.0580003230442742, 'pro'),
 (0.05877266706369764, 'fuckin'),
 (0.05959931003120983, 'important'),
 (0.05976739543691951, 'permit'),
 (0.06483317161207322, 'hey'),
 (0.06606662135170593, 'her'),
 (0.06814535192139708, 'understand'),
 (0.07181754352548028, 'she'),
 (0.0723476009213711, 'hell'),
 (0.07990659224648222, 'man')]

In [206]:
female_writer_words = dialoguewords.loc[dialogue_no_missing_ids.num_of_female_writers >= 1, :]
no_female_writer_words = dialoguewords.loc[dialogue_no_missing_ids.num_of_female_writers == 0, :]

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wound,wounded,wounds,wow,wrap,wrapped,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo
10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,7,0,0,2,0,0,0,0,0,0,0,0,0
11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,4,0,0,0,0,0,0,1,0,0,0,1,0,0,21,0,0,1,0,1,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,0,0,0,0,0,16,0,0,0,1,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,32,0,0,4,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,0,0,2,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2946,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,15,0,0,5,0,0,0,0,0,0,0,0,0
2947,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,15,0,0,5,0,1,0,1,0,0,0,0,0
2948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,17,0,0,2,1,0,0,0,0,0,0,0,0
2949,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,0,0,0,32,0,0,6,0,0,0,0,0,0,0,0,0


In [204]:
# dunnings function
def get_dunnings(word, series1, series2):
    observed = pd.DataFrame({'series1': [series1[word], sum(series1) - series1[word]],
                          'series2': [series2[word], sum(series2) - series2[word]]},
                        index = ['word', 'all_others'])
    total_words = observed.to_numpy().sum()
    observed['word_totals'] = observed.sum(axis = 1)
    observed = observed.append(observed.sum(axis = 0).rename(index = 'group_totals'))
    observed.iat[2,2] = 0
    observed['word_totals'] = observed['word_totals'] / sum(observed['word_totals'])
    observed.loc['group_totals', : ] = observed.loc['group_totals', : ] / sum(observed.loc['group_totals', : ])
    expected = np.outer(observed['word_totals'][0:2], observed.loc['group_totals', : ][0:2])
    expected = pd.DataFrame(expected, index = ['word', 'all_others'], columns = ['lessbiased', 'malebiased'])
    expected = expected * total_words
    
    G = 0
    for i in range(2):
        for j in range(2):
            O = observed.iat[i, j] + .000001
            E = expected.iat[i, j] + .000001
            G = G + O * math.log(O / E)
    
    if (observed.iat[0, 0] / sum(observed.iloc[0: 2, 0])) < (observed.iat[0, 1] / sum(observed.iloc[0 : 2, 1])):
        G = -G    # we provide a signed version of the statistic to distinguish
                  # overrepresentation in the two categories
    
    return 2 * G

In [209]:
features = vectorizer.get_feature_names()

In [210]:
# use dunning's to get the words most likely to predict male / female character for female writer's
female_writer_f_chars = female_writer_words.loc[dialogue_no_missing_ids.gender == 'f', :].sum(axis = 'rows')
female_writer_m_chars = female_writer_words.loc[dialogue_no_missing_ids.gender == 'm', :].sum(axis = 'rows')

In [212]:
dunnings_fw = []
for word in features:
    dunnings_fw.append(get_dunnings(word, female_writer_f_chars, female_writer_m_chars))

dunnings = pd.Series(data = dunnings_fw, index = features)

In [213]:
# top 25 for female chars
dunnings.sort_values()[-25: ]

annabelle    22.078840
darryl       22.526079
ben          22.868194
school       23.137794
james        23.274833
mean         23.289686
eve          23.396890
peter        23.997343
home         25.144157
larry        25.178280
brad         26.358894
louise       26.634343
patrick      27.497478
channing     28.563641
he           38.318581
so           38.732261
please       39.271451
really       39.367556
god          40.665684
daddy        46.555678
lombard      50.515589
caesar       53.341002
thelma       55.158229
mitch        69.038046
oh           99.472156
dtype: float64

In [214]:
dunnings.sort_values()[:25 ]

mary         -45.681366
marge        -36.525468
ellie        -32.387360
man          -32.089982
on           -29.013287
baron        -27.567974
claudia      -27.060131
terrance     -27.060131
fucking      -26.831529
ellen        -25.932570
casablanca   -25.027621
stone        -24.836340
phillip      -23.677464
yeah         -23.495619
fuck         -21.569190
moraes       -20.346498
manchester   -19.302301
lois         -18.261736
chrissake    -18.039779
laszlo       -17.698935
hell         -17.376275
attorney     -17.225131
corporate    -16.912256
wendy        -16.192865
pal          -16.192865
dtype: float64

In [None]:
# use dunning's to get the words most likely to predict male / female character for no female writer's
no_female_writer_f_chars = no_female_writer_words.loc[dialogue_no_missing_ids.gender == 'f', :].sum(axis = 'rows')
no_female_writer_m_chars = no_female_writer_words.loc[dialogue_no_missing_ids.gender == 'm', :].sum(axis = 'rows')