In [176]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from pathlib import Path
import math

In [177]:
pd.set_option("display.max_rows", 150, "display.max_columns", 100)

In [178]:
# dialogue data for each character
dialogpath = Path('../project_data/movie_dialogue.tsv')
dialogue = pd.read_csv(dialogpath, sep = '\t')

dialogue.nunique()

mid           600
cid          2969
cname        1925
mname         600
gender          4
wordcount    1428
year           74
genres        283
comedy          2
thriller        2
drama           2
romance         2
lines        2969
dtype: int64

In [179]:
len(dialogue)

2969

In [180]:
# movie writer gender data
genderpath = Path('../data_processing_code/dialogue_writers_gender_with_counts.csv')
genderdf = pd.read_csv(genderpath, encoding = 'utf-8')
genderdf

Unnamed: 0.1,Unnamed: 0,mid,imdb_id,writer_ids,writer_names,writer_gender,num_of_female_writers,total_num_of_writers,all_female_writers
0,0,m0,tt0147800,"['nm0527581', 'nm0809006', 'nm0000636']","['Karen McCullah', 'Kirsten Smith', 'William S...","['F', 'F', 'M']",2,3,False
1,1,m1,tt0103594,['nm0097785'],['Rose Bosch'],['F'],1,1,True
2,2,m2,tt0179626,['nm0381273'],['John Herzfeld'],['M'],0,1,False
3,3,m3,tt0062622,"['nm0000040', 'nm0002009']","['Stanley Kubrick', 'Arthur C. Clarke']","['M', 'M']",0,2,False
4,4,m4,tt0083511,"['nm0006854', 'nm0001353', 'nm0343419', 'nm021...","['Roger Spottiswoode', 'Walter Hill', 'Larry G...","['M', 'M', 'M', 'M']",0,4,False
...,...,...,...,...,...,...,...,...,...
589,589,m610,tt0032138,"['nm0486538', 'nm0753249', 'nm0941138', 'nm000...","['Noel Langley', 'Florence Ryerson', 'Edgar Al...","['M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', ...",1,19,False
590,590,m611,tt0143145,"['nm0701031', 'nm0905498', 'nm0270761', 'nm000...","['Neal Purvis', 'Robert Wade', 'Bruce Feirstei...","['M', 'M', 'M', 'M']",0,4,False
591,591,m612,tt0409459,"['nm1733301', 'nm0371684', 'nm0874844']","['Dave Gibbons', 'David Hayter', 'Alex Tse']","['M', 'M', 'M']",0,3,False
592,592,m613,tt0295701,['nm0929186'],['Rich Wilkes'],['M'],0,1,False


In [181]:
# drop characters from movies with no imdb id / writer info
mids_no_imdb_id = ['m449', 'm310', 'm457', 'm488', 'm430', 'm605']


dialogue_no_missing_ids = dialogue[dialogue['mid'] != 'm449']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm310']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm457']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm488']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm430']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm605']

print(len(dialogue_no_missing_ids)) # 2946
dialogue_no_missing_ids.nunique() # should be 594 unique mids

2947


mid           594
cid          2947
cname        1915
mname         594
gender          4
wordcount    1421
year           74
genres        279
comedy          2
thriller        2
drama           2
romance         2
lines        2947
dtype: int64

In [182]:
dialogue_no_missing_ids = dialogue_no_missing_ids.reset_index()

In [183]:
dialogue_no_missing_ids.gender.unique()

array(['f', 'm', 'M', 'F'], dtype=object)

In [184]:
# make all gender labels lower case
dialogue_no_missing_ids.gender = dialogue_no_missing_ids.gender.str.lower()

In [185]:
dialogue_no_missing_ids.head(5)

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines
0,0,m0,u0,BIANCA,10 things i hate about you,f,959,1999,"['comedy', 'romance']",True,False,False,True,They do not! / I hope so. / Let's go. / Okay -...
1,1,m0,u2,CAMERON,10 things i hate about you,m,527,1999,"['comedy', 'romance']",True,False,False,True,"They do to! / She okay? / Wow / No / The ""real..."
2,2,m0,u4,JOEY,10 things i hate about you,m,278,1999,"['comedy', 'romance']",True,False,False,True,"Listen, I want to talk to you about the prom. ..."
3,3,m0,u5,KAT,10 things i hate about you,f,1217,1999,"['comedy', 'romance']",True,False,False,True,Perm? / It's just you. / What? To completely d...
4,4,m0,u6,MANDELLA,10 things i hate about you,f,157,1999,"['comedy', 'romance']",True,False,False,True,William - he asked me to meet him here. / Have...


In [186]:
# add cols for writer data -- add writer data to each char
dialogue_no_missing_ids['num_of_female_writers'] = ""
dialogue_no_missing_ids['total_writers'] = ""

In [187]:
# map movie id for each char to num of female writers and total num of writers 
for i, row in dialogue_no_missing_ids.iterrows():
    mid = dialogue_no_missing_ids.at[i, 'mid']
    match = genderdf[genderdf['mid'] == mid]
    f_num = match.iloc[0, 6] # grabs the num_of_female_writers col value
    total = match.iloc[0, 7] # grabs the total_num_of_writers col value
    dialogue_no_missing_ids.at[i, 'num_of_female_writers'] = f_num
    dialogue_no_missing_ids.at[i, 'total_writers'] = total

In [188]:
# create doc term matrix for each char dialogue
vectorizer = CountVectorizer(max_features = 5000, stop_words = ['the', 'of', 'and', 'she', 'her', 'he', 'him'])

In [189]:
sparse_wordcounts = vectorizer.fit_transform(dialogue_no_missing_ids.lines)
wordcounts = sparse_wordcounts.toarray()
dialoguewords = pd.DataFrame(wordcounts, columns = vectorizer.get_feature_names())
dialoguewords.head()
len(dialoguewords)

2947

In [190]:
dialoguewords

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wouldn,wound,wounded,wounds,wow,wrap,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,50,0,0,4,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,3,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,69,0,0,8,0,3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,17,0,0,2,1,0,0,0,0,0,0,0,0
2943,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,0,0,0,32,0,0,6,0,0,0,0,0,0,0,0,0
2944,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,7,0,0,0,0,0,4,0,0,1,0,0,55,0,0,8,0,0,0,0,0,0,0,0,0
2945,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,0,5,0,0,0,0,0,53,0,0,10,0,0,0,0,0,0,0,0,0


In [191]:
# calculate word freq from word counts and factor out dialogue length
dwordcount = dialogue_no_missing_ids['wordcount']

In [192]:
# gives us relative word freq for each char
wordfreqs = dialoguewords.divide(dwordcount, axis = 'rows')
len(wordfreqs)

2947

In [193]:
wordfreqs['#dwordcount'] = dwordcount
wordfreqs.head(20)

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wound,wounded,wounds,wow,wrap,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo,#dwordcount
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001043,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052138,0.0,0.0,0.004171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,959
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045541,0.0,0.0,0.005693,0.0,0.001898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,527
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003597,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003597,0.0,0.0,0.0,0.0,0.0,0.0,0.007194,0.003597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057554,0.0,0.0,0.007194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002465,0.000822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056697,0.0,0.0,0.006574,0.0,0.002465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1217
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012739,0.006369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,157
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050955,0.0,0.0,0.007962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,628
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003895,0.000974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000974,0.0,0.0,0.0,0.0,0.0,0.0,0.003895,0.001947,0.0,0.0,0.0,0.0,0.0,0.0,0.000974,0.0,0.0,0.0,0.0,0.0,0.064265,0.0,0.0,0.009737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1027
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002732,0.0,0.0,0.008197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065574,0.0,0.0,0.013661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,366
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000789,0.0,0.003155,0.0,0.0,0.0,0.0,0.0,0.0,0.001577,0.0,0.0,0.0,0.001577,0.0,0.0,0.0,0.0,0.0,0.0,0.000789,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001577,0.0,0.0,0.0,0.0,0.0,0.005521,0.0,0.0,0.000789,0.0,0.0,0.039432,0.0,0.0,0.009464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1268
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101942,0.0,0.0,0.009709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206


In [194]:
# shuffle chars -- in order to draw random sample from
wordfreqs = wordfreqs.sample(frac = 1)
wordfreqs.head()

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wound,wounded,wounds,wow,wrap,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo,#dwordcount
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003289,0.0,0.0,0.0,0.0,0.0,0.016447,0.003289,0.0,0.003289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,304
865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000912,0.0,0.001823,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000912,0.0,0.0,0.001823,0.001823,0.0,0.0,0.0,0.0,0.0,0.0,0.004558,0.0,0.0,0.0,0.0,0.0,0.050137,0.0,0.0,0.000912,0.0,0.000912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1097
407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007634,0.0,0.0,0.0,0.0,0.0,0.0,0.000954,0.0,0.001908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048664,0.0,0.0,0.006679,0.0,0.000954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1048
1651,0.0,0.000649,0.0,0.000649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000649,0.0,0.0,0.0,0.0,0.00584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000649,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000649,0.0,0.0,0.0,0.0,0.000649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002596,0.000649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000649,0.0,0.000649,0.031798,0.0,0.0,0.005191,0.0,0.000649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1541
455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004532,0.0,0.0,0.0,0.0,0.0,0.057402,0.001511,0.0,0.006042,0.0,0.001511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,662


In [195]:
# so we can map from the tweet in the word matrix to the tweet in the original tweets df
reorderedchars = dialogue_no_missing_ids.loc[wordfreqs.index, : ]
reorderedchars.head()

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
1141,1141,m302,u4577,DANIEL,collateral,m,304,2004,"['crime', 'drama', 'thriller']",False,True,True,False,I know the answer. I know all there is to kno...,0,1
865,865,m252,u3819,JAMIE,a walk to remember,f,1097,2002,"['drama', 'romance']",False,False,True,True,Yes. / Landon. I can't even do for myself. / C...,1,3
407,407,m170,u2609,EDDIE,reservoir dogs,m,1048,1992,"['crime', 'mystery', 'thriller']",False,True,False,False,You don't hafta lift shit. You don't really wo...,0,2
1651,1654,m403,u6068,BRODY,jaws 2,m,1541,1978,['thriller'],False,True,False,False,I need a hand here.... / What is it? What's t...,0,3
455,455,m18,u299,MRS. DICKSON,american madness,f,662,1932,['drama'],False,False,True,False,"Why, Matt! / I'm not afraid of you. You haven'...",0,1


In [196]:
# separate test and training data -- about 50 / 50 split

# test set
testfreqs = wordfreqs.iloc[0: 1473, : ]
test_y = (reorderedchars['gender'][0: 1473] == 'm').astype(int)  
test_y[0:10]

1141    1
865     0
407     1
1651    1
455     0
646     1
671     1
2810    1
2677    1
2112    1
Name: gender, dtype: int64

In [197]:
# all remaining rows as the training set  
trainfreqs = wordfreqs.iloc[1473 : , : ]
train_y = (reorderedchars['gender'][1473 : ] == 'm').astype(int) 
train_y[0:10]

1077    1
2054    1
1019    1
2379    1
412     1
1234    1
2549    1
2887    1
1307    0
2664    1
Name: gender, dtype: int64

In [198]:
# scale X matrices for training data
# normalizes cols -- col = (col-col.mean) / col.std (like a z score)
trainscaler = StandardScaler()
trainXscaled = trainscaler.fit_transform(trainfreqs)
trainXscaled = pd.DataFrame(trainXscaled, columns = trainfreqs.columns)
trainXscaled.head()

Unnamed: 0,00,000,10,100,11,12,14,15,17,18,20,200,22,24,25,30,40,45,50,500,60,80,99,abandon,abandoned,ability,able,aboard,abortion,about,above,absolute,absolutely,absurd,abuse,academy,accept,accepted,access,accident,according,account,accounts,accurate,ace,acid,across,act,acted,acting,...,wound,wounded,wounds,wow,wrap,wreck,write,writer,writes,writing,written,wrong,wrote,wyatt,wynant,ya,yacht,yah,yankee,yard,yards,ye,yeah,year,years,yeh,yell,yelling,yellow,yep,yes,yessir,yesterday,yet,yo,york,you,young,younger,your,yours,yourself,yourselves,youth,yuh,yup,zero,zone,zoo,#dwordcount
0,-0.079108,-0.135211,-0.098071,-0.098722,-0.101068,-0.097952,-0.089105,-0.077517,-0.084041,-0.102209,-0.114528,-0.074067,-0.057261,-0.072402,-0.08271,-0.126149,-0.06665,-0.07943,-0.099322,-0.089248,-0.073243,-0.079678,-0.072402,-0.095635,-0.078967,-0.123363,-0.295871,-0.094657,-0.054735,-0.05238,-0.115215,-0.089142,-0.235493,-0.078649,-0.074477,-0.074769,-0.167045,-0.070324,-0.074182,-0.180827,-0.130175,-0.135737,-0.068257,-0.057711,-0.066678,-0.06842,-0.1769,-0.258651,-0.076395,-0.119971,...,-0.111026,-0.071797,-0.063838,-0.132881,-0.080487,-0.08075,-0.263682,-0.121246,-0.08692,-0.162911,4.153461,1.776965,-0.191552,-0.051591,-0.052059,-0.166606,-0.048995,-0.049968,-0.0519,-0.107753,-0.092259,-0.077784,-0.568877,-0.302114,-0.52739,-0.078441,-0.092422,-0.086884,-0.120014,-0.091877,0.168157,-0.031992,-0.217649,-0.405855,-0.081045,-0.182374,-0.052493,-0.269733,-0.083521,-0.425771,-0.293491,-0.468412,-0.086693,-0.088078,-0.027331,-0.097397,-0.096889,-0.085793,-0.090494,-0.223266
1,-0.079108,-0.135211,-0.098071,-0.098722,-0.101068,-0.097952,-0.089105,-0.077517,-0.084041,5.364412,-0.114528,-0.074067,-0.057261,-0.072402,-0.08271,-0.126149,-0.06665,-0.07943,-0.099322,-0.089248,-0.073243,-0.079678,-0.072402,-0.095635,-0.078967,-0.123363,1.251602,-0.094657,-0.054735,-0.421328,-0.115215,-0.089142,-0.235493,-0.078649,-0.074477,-0.074769,2.843477,-0.070324,-0.074182,-0.180827,-0.130175,-0.135737,-0.068257,-0.057711,-0.066678,-0.06842,-0.1769,-0.258651,-0.076395,-0.119971,...,-0.111026,-0.071797,-0.063838,-0.132881,-0.080487,-0.08075,-0.263682,-0.121246,-0.08692,-0.162911,-0.136006,-0.473775,-0.191552,-0.051591,-0.052059,-0.166606,-0.048995,-0.049968,-0.0519,-0.107753,-0.092259,-0.077784,-0.301547,-0.302114,0.159969,-0.078441,-0.092422,-0.086884,-0.120014,-0.091877,-0.346731,-0.031992,-0.217649,-0.405855,-0.081045,-0.182374,-0.181092,-0.269733,-0.083521,0.129809,-0.293491,-0.468412,-0.086693,-0.088078,-0.027331,-0.097397,-0.096889,-0.085793,-0.090494,0.209298
2,-0.079108,-0.135211,-0.098071,-0.098722,-0.101068,-0.097952,-0.089105,-0.077517,-0.084041,-0.102209,-0.114528,-0.074067,-0.057261,-0.072402,-0.08271,-0.126149,-0.06665,-0.07943,-0.099322,-0.089248,-0.073243,-0.079678,-0.072402,-0.095635,-0.078967,-0.123363,-0.295871,-0.094657,-0.054735,-0.562703,1.436222,-0.089142,0.600929,-0.078649,-0.074477,-0.074769,2.756419,-0.070324,-0.074182,-0.180827,-0.130175,2.43735,-0.068257,-0.057711,-0.066678,-0.06842,-0.1769,0.63206,-0.076395,-0.119971,...,-0.111026,-0.071797,-0.063838,-0.132881,-0.080487,-0.08075,-0.263682,-0.121246,-0.08692,-0.162911,-0.136006,0.620626,-0.191552,-0.051591,-0.052059,-0.166606,-0.048995,-0.049968,-0.0519,-0.107753,-0.092259,-0.077784,-0.568877,-0.302114,-0.52739,-0.078441,-0.092422,-0.086884,-0.120014,-0.091877,1.399588,-0.031992,-0.217649,-0.405855,-0.081045,-0.182374,0.15828,-0.269733,-0.083521,-0.445364,0.309084,-0.129785,-0.086693,-0.088078,-0.027331,-0.097397,-0.096889,-0.085793,-0.090494,1.588253
3,-0.079108,-0.135211,-0.098071,-0.098722,-0.101068,-0.097952,-0.089105,-0.077517,-0.084041,-0.102209,-0.114528,-0.074067,-0.057261,-0.072402,-0.08271,-0.126149,-0.06665,-0.07943,-0.099322,-0.089248,-0.073243,-0.079678,-0.072402,-0.095635,-0.078967,-0.123363,-0.295871,-0.094657,-0.054735,-0.699905,-0.115215,-0.089142,-0.235493,-0.078649,-0.074477,-0.074769,-0.167045,-0.070324,-0.074182,-0.180827,-0.130175,-0.135737,-0.068257,-0.057711,-0.066678,-0.06842,-0.1769,-0.258651,-0.076395,-0.119971,...,-0.111026,-0.071797,-0.063838,-0.132881,-0.080487,-0.08075,-0.263682,-0.121246,-0.08692,-0.162911,-0.136006,-0.473775,-0.191552,-0.051591,-0.052059,-0.166606,-0.048995,-0.049968,-0.0519,-0.107753,-0.092259,-0.077784,-0.568877,-0.302114,-0.52739,-0.078441,-0.092422,-0.086884,-0.120014,-0.091877,-0.121676,-0.031992,-0.217649,-0.405855,-0.081045,-0.182374,0.304392,-0.269733,-0.083521,-0.247778,-0.293491,3.448938,-0.086693,-0.088078,-0.027331,-0.097397,-0.096889,26.098094,-0.090494,-0.397042
4,-0.079108,-0.135211,-0.098071,-0.098722,-0.101068,-0.097952,-0.089105,-0.077517,-0.084041,-0.102209,-0.114528,-0.074067,-0.057261,-0.072402,-0.08271,-0.126149,-0.06665,-0.07943,-0.099322,-0.089248,-0.073243,-0.079678,-0.072402,-0.095635,-0.078967,-0.123363,-0.295871,-0.094657,-0.054735,0.786628,-0.115215,-0.089142,-0.235493,-0.078649,-0.074477,-0.074769,-0.167045,-0.070324,-0.074182,-0.180827,-0.130175,-0.135737,-0.068257,-0.057711,-0.066678,-0.06842,-0.1769,0.592746,-0.076395,0.556256,...,-0.111026,-0.071797,-0.063838,-0.132881,-0.080487,-0.08075,-0.263682,-0.121246,-0.08692,-0.162911,-0.136006,-0.473775,-0.191552,-0.051591,-0.052059,1.404895,-0.048995,-0.049968,-0.0519,-0.107753,-0.092259,-0.077784,0.051477,0.136414,-0.20838,-0.078441,-0.092422,-0.086884,-0.120014,-0.091877,-0.48499,-0.031992,-0.217649,-0.405855,-0.081045,-0.182374,-1.072356,-0.269733,-0.083521,-0.402388,-0.293491,-0.144731,-0.086693,-0.088078,-0.027331,-0.097397,-0.096889,-0.085793,-0.090494,1.712022


In [199]:
# scale X matrices for testing data
testscaler = StandardScaler()
testXscaled = testscaler.fit_transform(testfreqs)
testXscaled = pd.DataFrame(testXscaled, columns = testfreqs.columns)

In [200]:
# simple model
# train model
logist = LogisticRegression(C = .1, max_iter = 1000) 
logist.fit(trainXscaled, train_y)

# test model
predictions = logist.predict(testXscaled)
sum(predictions == test_y) / len(predictions)

0.7372708757637475

In [201]:
# more accurate refelction of performance of model
f1_score(test_y, predictions)

0.818565400843882

In [202]:
# balance of classes -- m vs. f
dialogue_no_missing_ids.gender.value_counts()

m    2006
f     941
Name: gender, dtype: int64

In [203]:
# handling imbalanced classes
# weight classes inversely to frequency -- female char error counts more

# train
logist = LogisticRegression(C = .1, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
f1_score(test_y, predictions)

0.8084901109503135

In [204]:
# cross validation -- select best C parameter using training data
for c_param in [.0001, .001, .01, .1, 1, 10, 100, 1000]:
    logist = LogisticRegression(C = c_param, max_iter = 1000, class_weight = 'balanced') 
    results = cross_validate(logist, trainXscaled, train_y, cv = 5, scoring = 'f1')
    print('C parameter:', c_param)
    print('Mean f1:', np.mean(results['test_score']))
    print()

C parameter: 0.0001
Mean f1: 0.8199803011926845

C parameter: 0.001
Mean f1: 0.825040016318462

C parameter: 0.01
Mean f1: 0.8231468531038226

C parameter: 0.1
Mean f1: 0.8186199686527853

C parameter: 1
Mean f1: 0.8178046311407339

C parameter: 10
Mean f1: 0.8177362174184113

C parameter: 100
Mean f1: 0.8179250994919499

C parameter: 1000
Mean f1: 0.8149312829659167



In [205]:
# use best C param to train whole training set
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
accuracy = f1_score(test_y, predictions)
print("Accuracy for entire training set is: ", accuracy)

Accuracy for entire training set is:  0.8113844669561023


In [206]:
# need to split test data -- into groups by writer gender (at least 1 female, no female writers)

reorderedchars_test = reorderedchars.iloc[0:1473, :] # test data rows

reorderedchars_test_f = reorderedchars_test[reorderedchars_test['num_of_female_writers'] >= 1]

reorderedchars_test_no_f = reorderedchars_test[reorderedchars_test['num_of_female_writers'] == 0]
reorderedchars_test_f

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
865,865,m252,u3819,JAMIE,a walk to remember,f,1097,2002,"['drama', 'romance']",False,False,True,True,Yes. / Landon. I can't even do for myself. / C...,1,3
1149,1149,m304,u4602,ELLIE,contact,f,3325,1997,"['drama', 'mystery', 'sci-fi', 'thriller']",False,True,True,False,I don't know. Ultimately their motives may be...,1,4
2287,2301,m526,u7784,PILOT,spacejacked,m,167,1997,"['action', 'sci-fi']",False,False,False,False,Sheesh! Aye aye cap'n! / That's clear. / This...,1,2
1009,1009,m281,u4210,CORKY,bound,f,1090,1996,"['crime', 'drama', 'thriller']",False,True,True,False,I could be lying. / You can't kill me yet. / N...,2,2
2449,2463,m554,u8164,FROCK,the relic,m,970,1997,"['horror', 'mystery', 'thriller']",False,True,False,False,The virus itself might have positive applicati...,2,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2419,2433,m549,u8096,SARAH,the terminator,f,806,1984,"['action', 'sci-fi', 'thriller']",False,True,False,False,I won't let the fat kid down. / It's not brain...,1,3
1318,1321,m336,u5078,GEORGE,erin brockovich,m,818,2000,"['biography', 'drama', 'romance']",False,False,True,True,... or lemme guess -- toxic waste? / How does ...,1,1
2006,2019,m469,u7016,CAROL,peggy sue got married,f,300,1986,"['comedy', 'drama', 'fantasy', 'romance']",True,False,True,True,What a I going to do for the rest of my life? ...,1,2
1238,1241,m322,u4831,MARY,the devil and daniel webster,f,1047,2004,"['comedy', 'drama', 'fantasy']",True,False,True,False,"That's my problem, Belle. / I know -- and you ...",1,5


In [207]:
# test accuracy on chars from movies w/ at least 1 female writer

# separate out test set rows w/ at least 1 female writer
testfreqs_female = wordfreqs.loc[reorderedchars_test_f.index, :]
test_y_f = (reorderedchars_test_f['gender'][:] == 'm').astype(int)

# scale test data w/ atleast 1 female writer
testscaler = StandardScaler()
testXscaled_f = testscaler.fit_transform(testfreqs_female)
testXscaled_f = pd.DataFrame(testXscaled_f, columns = testfreqs_female.columns)

In [208]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_f)
accuracy = f1_score(test_y_f, predictions)
print("Accuracy for female writers is: ", accuracy)

Accuracy for female writers is:  0.7318611987381703


In [209]:
# test accuracy on chars from movies w/ no female writers

# separate out test set rows w/ at least 1 female writer
testfreqs_no_female = wordfreqs.loc[reorderedchars_test_no_f.index, :]
test_y_no_f = (reorderedchars_test_no_f['gender'][:] == 'm').astype(int)

# scale test data w/ atleast 1 female writer
testscaler = StandardScaler()
testXscaled_no_f = testscaler.fit_transform(testfreqs_no_female)
testXscaled_no_f = pd.DataFrame(testfreqs_no_female, columns = testfreqs_no_female.columns)

In [210]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_no_f)
accuracy = f1_score(test_y_no_f, predictions)
print("Accuracy for no female writers is: ", accuracy)

Accuracy for no female writers is:  0.8258562469850459


In [211]:
# classifier model predictive features
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)
coefficients = [x for x in zip(logist.coef_[0], vectorizer.get_feature_names())]

In [212]:
coefficients.sort()

In [213]:
coefficients[0:25] # female

[(-0.12025108107374957, 'oh'),
 (-0.10206068042067275, 'love'),
 (-0.09301040842264872, 'silly'),
 (-0.07151746317611529, 'loved'),
 (-0.0694035663825797, 'god'),
 (-0.06783866240018117, 'isn'),
 (-0.06771222962107103, 'wonderful'),
 (-0.0671356820910886, 'ordell'),
 (-0.06613482735134382, 'stop'),
 (-0.06563955345918705, 'annabelle'),
 (-0.06551725808660977, 'called'),
 (-0.06549096232826705, 'shoes'),
 (-0.06541184864196375, 'mmm'),
 (-0.06524072979807186, 'jus'),
 (-0.06515153338420827, 'so'),
 (-0.06342777856876482, 'hi'),
 (-0.062004426326044006, 'thinks'),
 (-0.061936688343876334, 'husband'),
 (-0.060997721205642855, 'sweet'),
 (-0.0600798582323229, 'completely'),
 (-0.05863973134482574, 'maude'),
 (-0.05753016988742734, 'adorable'),
 (-0.05634260620912123, 'wished'),
 (-0.05595891662472332, 'spoiled'),
 (-0.0555826110816018, 'fun')]

In [214]:
coefficients[-25:] # male

[(0.04780741302692985, 'kid'),
 (0.04788388963142966, 'truth'),
 (0.04797976312764822, 'blow'),
 (0.04841732347838566, 'figured'),
 (0.04853012239773524, 'honor'),
 (0.04899748260771768, 'sentence'),
 (0.04964946683332958, 'yeah'),
 (0.051853722302004475, 'problem'),
 (0.05224311916532458, 'ordinary'),
 (0.05237693419672871, 'waste'),
 (0.05267867593878816, 'lost'),
 (0.05333812124720196, 'hear'),
 (0.05357918864731696, 'running'),
 (0.05394988720891496, 'herself'),
 (0.05465661875153486, 'listen'),
 (0.0548011959373847, 'granted'),
 (0.0586562925823254, 'name'),
 (0.05915039556953916, 'claudia'),
 (0.059637219947885144, 'owns'),
 (0.06052638083978345, 'got'),
 (0.061929870263628777, 'sport'),
 (0.06441711172992791, 'business'),
 (0.08082829976755015, 'hey'),
 (0.08228974253476898, 'gotta'),
 (0.09540270041520424, 'man')]