In [154]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from pathlib import Path
import math
from collections import Counter
import re

In [155]:
pd.set_option("display.max_rows", 50, "display.max_columns", 100)

In [156]:
# dialogue data for each character
dialogpath = Path('../project_data/movie_dialogue.tsv')
dialogue = pd.read_csv(dialogpath, sep = '\t')

dialogue.nunique()

mid           600
cid          2969
cname        1925
mname         600
gender          4
wordcount    1428
year           74
genres        283
comedy          2
thriller        2
drama           2
romance         2
lines        2969
dtype: int64

In [157]:
len(dialogue)

2969

In [158]:
# movie writer gender data
genderpath = Path('../data_processing_code/dialogue_writers_gender_with_counts.csv')
genderdf = pd.read_csv(genderpath, encoding = 'utf-8')
genderdf

Unnamed: 0.1,Unnamed: 0,mid,imdb_id,writer_ids,writer_names,writer_gender,num_of_female_writers,total_num_of_writers,all_female_writers
0,0,m0,tt0147800,"['nm0527581', 'nm0809006', 'nm0000636']","['Karen McCullah', 'Kirsten Smith', 'William S...","['F', 'F', 'M']",2,3,False
1,1,m1,tt0103594,['nm0097785'],['Rose Bosch'],['F'],1,1,True
2,2,m2,tt0179626,['nm0381273'],['John Herzfeld'],['M'],0,1,False
3,3,m3,tt0062622,"['nm0000040', 'nm0002009']","['Stanley Kubrick', 'Arthur C. Clarke']","['M', 'M']",0,2,False
4,4,m4,tt0083511,"['nm0006854', 'nm0001353', 'nm0343419', 'nm021...","['Roger Spottiswoode', 'Walter Hill', 'Larry G...","['M', 'M', 'M', 'M']",0,4,False
...,...,...,...,...,...,...,...,...,...
589,589,m610,tt0032138,"['nm0486538', 'nm0753249', 'nm0941138', 'nm000...","['Noel Langley', 'Florence Ryerson', 'Edgar Al...","['M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', ...",1,19,False
590,590,m611,tt0143145,"['nm0701031', 'nm0905498', 'nm0270761', 'nm000...","['Neal Purvis', 'Robert Wade', 'Bruce Feirstei...","['M', 'M', 'M', 'M']",0,4,False
591,591,m612,tt0409459,"['nm1733301', 'nm0371684', 'nm0874844']","['Dave Gibbons', 'David Hayter', 'Alex Tse']","['M', 'M', 'M']",0,3,False
592,592,m613,tt0295701,['nm0929186'],['Rich Wilkes'],['M'],0,1,False


In [159]:
# drop characters from movies with no imdb id / writer info
mids_no_imdb_id = ['m449', 'm310', 'm457', 'm488', 'm430', 'm605']


dialogue_no_missing_ids = dialogue[dialogue['mid'] != 'm449']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm310']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm457']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm488']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm430']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm605']

print(len(dialogue_no_missing_ids)) # 2946
dialogue_no_missing_ids.nunique() # should be 594 unique mids

2947


mid           594
cid          2947
cname        1915
mname         594
gender          4
wordcount    1421
year           74
genres        279
comedy          2
thriller        2
drama           2
romance         2
lines        2947
dtype: int64

In [160]:
dialogue_no_missing_ids = dialogue_no_missing_ids.reset_index()

In [161]:
dialogue_no_missing_ids.gender.unique()

array(['f', 'm', 'M', 'F'], dtype=object)

In [162]:
# make all gender labels lower case
dialogue_no_missing_ids.gender = dialogue_no_missing_ids.gender.str.lower()

In [163]:
dialogue_no_missing_ids.head(5)

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines
0,0,m0,u0,BIANCA,10 things i hate about you,f,959,1999,"['comedy', 'romance']",True,False,False,True,They do not! / I hope so. / Let's go. / Okay -...
1,1,m0,u2,CAMERON,10 things i hate about you,m,527,1999,"['comedy', 'romance']",True,False,False,True,"They do to! / She okay? / Wow / No / The ""real..."
2,2,m0,u4,JOEY,10 things i hate about you,m,278,1999,"['comedy', 'romance']",True,False,False,True,"Listen, I want to talk to you about the prom. ..."
3,3,m0,u5,KAT,10 things i hate about you,f,1217,1999,"['comedy', 'romance']",True,False,False,True,Perm? / It's just you. / What? To completely d...
4,4,m0,u6,MANDELLA,10 things i hate about you,f,157,1999,"['comedy', 'romance']",True,False,False,True,William - he asked me to meet him here. / Have...


In [164]:
# add cols for writer data -- add writer data to each char
dialogue_no_missing_ids['num_of_female_writers'] = ""
dialogue_no_missing_ids['total_writers'] = ""

In [165]:
# map movie id for each char to num of female writers and total num of writers 
for i, row in dialogue_no_missing_ids.iterrows():
    mid = dialogue_no_missing_ids.at[i, 'mid']
    match = genderdf[genderdf['mid'] == mid]
    f_num = match.iloc[0, 6] # grabs the num_of_female_writers col value
    total = match.iloc[0, 7] # grabs the total_num_of_writers col value
    dialogue_no_missing_ids.at[i, 'num_of_female_writers'] = f_num
    dialogue_no_missing_ids.at[i, 'total_writers'] = total

In [166]:
# build feature vocabulary -- using doc frequency (count of how many docs / char lines a word appears in)
vocab = Counter()

stopwords = ['the', 'of', 'and', 'she', 'her', 'he','him']

for char in dialogue_no_missing_ids['lines']:
    words = re.split('\W', char)
    lowercase = [w.lower() for w in words]
    unique_words = set([w for w in lowercase if len(w) > 1])  # get rid of one-letter words
    for w in unique_words:
        if w not in stopwords and not w.isdigit(): # don't include numbers and stopwords
            vocab[w.lower()] += 1
        
vocab = vocab.most_common(5000)   # This is a Counter() method that returns paired
                                  # keys and counts for the keys with highest counts.

In [167]:
wordfeatures = [x[0] for x in vocab]   
docfreqs = [x[1] for x in vocab]

In [168]:
vectorizer = CountVectorizer(vocabulary = wordfeatures)

sparse_counts = vectorizer.fit_transform(dialogue_no_missing_ids['lines'])
                                                            
charcounts = pd.DataFrame(sparse_counts.toarray(),
                            columns = vectorizer.get_feature_names())
charcounts.head()

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,have,know,not,be,on,we,your,re,no,with,but,can,was,are,my,all,like,just,so,there,about,they,get,here,if,ll,out,up,how,at,one,think,now,want,go,right,got,ve,...,moscow,articles,critical,tempted,employed,backup,lamp,backed,glorious,toes,mainly,cursed,rum,tide,cape,yankee,skipper,sounding,buster,blessing,roman,phase,dork,lewis,hardest,prayers,approval,cept,brian,protective,reserve,morphine,hardware,represents,preparing,candidate,burial,punishment,rises,happiest,touches,depression,stool,landlord,ing,victoria,hose,trailer,sunlight,splendid
0,50,30,9,14,6,6,10,11,3,9,3,7,5,15,11,6,4,9,4,12,8,4,12,9,3,6,12,2,11,11,5,3,1,1,3,2,7,1,7,1,6,2,2,6,2,6,15,1,2,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,24,16,6,16,6,4,4,6,3,2,7,0,4,1,5,3,6,5,3,3,3,6,2,2,0,0,3,1,7,3,2,3,2,3,7,1,2,1,3,1,4,1,3,3,0,0,2,1,3,3,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,16,5,8,4,2,0,1,1,2,3,4,1,1,1,3,2,1,2,2,4,0,3,2,1,0,1,2,1,2,1,2,0,1,1,1,1,1,1,3,3,1,2,1,1,0,3,2,0,3,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,69,32,20,22,17,7,10,21,8,13,13,11,4,10,8,5,7,6,8,11,8,6,4,7,12,6,15,4,11,8,9,2,3,3,6,1,3,0,6,5,3,7,6,7,5,10,7,4,3,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8,4,2,3,2,0,0,2,2,1,0,1,6,0,2,0,0,4,0,1,0,2,1,0,2,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,0,0,1,2,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [169]:
# calculate word freq from word counts and factor out dialogue length
dwordcount = dialogue_no_missing_ids['wordcount']

In [170]:
# gives us relative word freq for each char
wordfreqs = charcounts.divide(dwordcount, axis = 'rows')
len(wordfreqs)

2947

In [171]:
wordfreqs['#dwordcount'] = dwordcount
wordfreqs.head(20)

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,have,know,not,be,on,we,your,re,no,with,but,can,was,are,my,all,like,just,so,there,about,they,get,here,if,ll,out,up,how,at,one,think,now,want,go,right,got,ve,...,articles,critical,tempted,employed,backup,lamp,backed,glorious,toes,mainly,cursed,rum,tide,cape,yankee,skipper,sounding,buster,blessing,roman,phase,dork,lewis,hardest,prayers,approval,cept,brian,protective,reserve,morphine,hardware,represents,preparing,candidate,burial,punishment,rises,happiest,touches,depression,stool,landlord,ing,victoria,hose,trailer,sunlight,splendid,#dwordcount
0,0.052138,0.031283,0.009385,0.014599,0.006257,0.006257,0.010428,0.01147,0.003128,0.009385,0.003128,0.007299,0.005214,0.015641,0.01147,0.006257,0.004171,0.009385,0.004171,0.012513,0.008342,0.004171,0.012513,0.009385,0.003128,0.006257,0.012513,0.002086,0.01147,0.01147,0.005214,0.003128,0.001043,0.001043,0.003128,0.002086,0.007299,0.001043,0.007299,0.001043,0.006257,0.002086,0.002086,0.006257,0.002086,0.006257,0.015641,0.001043,0.002086,0.002086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,959
1,0.045541,0.030361,0.011385,0.030361,0.011385,0.00759,0.00759,0.011385,0.005693,0.003795,0.013283,0.0,0.00759,0.001898,0.009488,0.005693,0.011385,0.009488,0.005693,0.005693,0.005693,0.011385,0.003795,0.003795,0.0,0.0,0.005693,0.001898,0.013283,0.005693,0.003795,0.005693,0.003795,0.005693,0.013283,0.001898,0.003795,0.001898,0.005693,0.001898,0.00759,0.001898,0.005693,0.005693,0.0,0.0,0.003795,0.001898,0.005693,0.005693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,527
2,0.057554,0.017986,0.028777,0.014388,0.007194,0.0,0.003597,0.003597,0.007194,0.010791,0.014388,0.003597,0.003597,0.003597,0.010791,0.007194,0.003597,0.007194,0.007194,0.014388,0.0,0.010791,0.007194,0.003597,0.0,0.003597,0.007194,0.003597,0.007194,0.003597,0.007194,0.0,0.003597,0.003597,0.003597,0.003597,0.003597,0.003597,0.010791,0.010791,0.003597,0.007194,0.003597,0.003597,0.0,0.010791,0.007194,0.0,0.010791,0.007194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278
3,0.056697,0.026294,0.016434,0.018077,0.013969,0.005752,0.008217,0.017256,0.006574,0.010682,0.010682,0.009039,0.003287,0.008217,0.006574,0.004108,0.005752,0.00493,0.006574,0.009039,0.006574,0.00493,0.003287,0.005752,0.00986,0.00493,0.012325,0.003287,0.009039,0.006574,0.007395,0.001643,0.002465,0.002465,0.00493,0.000822,0.002465,0.0,0.00493,0.004108,0.002465,0.005752,0.00493,0.005752,0.004108,0.008217,0.005752,0.003287,0.002465,0.001643,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1217
4,0.050955,0.025478,0.012739,0.019108,0.012739,0.0,0.0,0.012739,0.012739,0.006369,0.0,0.006369,0.038217,0.0,0.012739,0.0,0.0,0.025478,0.0,0.006369,0.0,0.012739,0.006369,0.0,0.012739,0.0,0.0,0.0,0.006369,0.006369,0.006369,0.0,0.0,0.0,0.0,0.006369,0.0,0.006369,0.0,0.0,0.0,0.0,0.006369,0.012739,0.0,0.0,0.012739,0.006369,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,157
5,0.050955,0.022293,0.011146,0.009554,0.009554,0.006369,0.007962,0.003185,0.011146,0.003185,0.004777,0.001592,0.006369,0.004777,0.003185,0.001592,0.004777,0.014331,0.007962,0.012739,0.003185,0.022293,0.011146,0.006369,0.001592,0.001592,0.003185,0.001592,0.003185,0.012739,0.011146,0.003185,0.001592,0.003185,0.004777,0.001592,0.003185,0.006369,0.017516,0.006369,0.001592,0.003185,0.004777,0.007962,0.001592,0.0,0.011146,0.004777,0.003185,0.001592,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,628
6,0.064265,0.031159,0.022395,0.011685,0.017527,0.009737,0.003895,0.019474,0.00779,0.003895,0.00779,0.009737,0.008763,0.008763,0.004869,0.002921,0.006816,0.000974,0.009737,0.010711,0.004869,0.009737,0.003895,0.004869,0.005842,0.003895,0.006816,0.001947,0.006816,0.004869,0.00779,0.006816,0.003895,0.000974,0.00779,0.0,0.001947,0.004869,0.004869,0.009737,0.002921,0.001947,0.002921,0.0,0.0,0.0,0.002921,0.002921,0.005842,0.005842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1027
7,0.065574,0.027322,0.016393,0.019126,0.021858,0.002732,0.013661,0.021858,0.008197,0.010929,0.002732,0.013661,0.005464,0.008197,0.016393,0.005464,0.008197,0.010929,0.013661,0.019126,0.008197,0.0,0.002732,0.005464,0.005464,0.002732,0.013661,0.002732,0.002732,0.005464,0.002732,0.0,0.008197,0.0,0.002732,0.002732,0.002732,0.005464,0.0,0.002732,0.0,0.005464,0.0,0.002732,0.0,0.005464,0.005464,0.005464,0.002732,0.010929,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,366
8,0.039432,0.037855,0.008675,0.009464,0.014196,0.007098,0.012618,0.01183,0.008675,0.001577,0.008675,0.006309,0.013407,0.006309,0.006309,0.011041,0.003155,0.014196,0.009464,0.002366,0.007098,0.007886,0.000789,0.008675,0.001577,0.012618,0.006309,0.009464,0.001577,0.000789,0.003155,0.005521,0.003155,0.007098,0.0,0.003155,0.004732,0.003943,0.002366,0.001577,0.004732,0.000789,0.003943,0.003943,0.002366,0.005521,0.002366,0.002366,0.0,0.001577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1268
9,0.101942,0.033981,0.004854,0.004854,0.009709,0.009709,0.014563,0.0,0.0,0.0,0.004854,0.014563,0.0,0.0,0.004854,0.019417,0.0,0.004854,0.009709,0.009709,0.0,0.004854,0.004854,0.009709,0.0,0.019417,0.019417,0.0,0.0,0.0,0.024272,0.0,0.0,0.0,0.004854,0.0,0.009709,0.0,0.0,0.0,0.014563,0.0,0.0,0.0,0.004854,0.0,0.009709,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206


In [172]:
# shuffle chars -- in order to draw random sample from
wordfreqs = wordfreqs.sample(frac = 1)
wordfreqs.head()

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,have,know,not,be,on,we,your,re,no,with,but,can,was,are,my,all,like,just,so,there,about,they,get,here,if,ll,out,up,how,at,one,think,now,want,go,right,got,ve,...,articles,critical,tempted,employed,backup,lamp,backed,glorious,toes,mainly,cursed,rum,tide,cape,yankee,skipper,sounding,buster,blessing,roman,phase,dork,lewis,hardest,prayers,approval,cept,brian,protective,reserve,morphine,hardware,represents,preparing,candidate,burial,punishment,rises,happiest,touches,depression,stool,landlord,ing,victoria,hose,trailer,sunlight,splendid,#dwordcount
2356,0.047692,0.023077,0.007692,0.012308,0.004615,0.007692,0.004615,0.012308,0.012308,0.009231,0.009231,0.006154,0.009231,0.007692,0.006154,0.001538,0.004615,0.013846,0.009231,0.007692,0.009231,0.007692,0.003077,0.004615,0.001538,0.003077,0.009231,0.006154,0.003077,0.003077,0.003077,0.001538,0.001538,0.003077,0.009231,0.006154,0.001538,0.001538,0.003077,0.010769,0.007692,0.003077,0.007692,0.0,0.003077,0.004615,0.001538,0.0,0.001538,0.003077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,650
924,0.039695,0.035115,0.01374,0.01374,0.010687,0.015267,0.018321,0.010687,0.006107,0.018321,0.00916,0.007634,0.003053,0.00458,0.010687,0.003053,0.007634,0.010687,0.003053,0.003053,0.00458,0.007634,0.001527,0.007634,0.0,0.007634,0.010687,0.00458,0.00458,0.003053,0.006107,0.00458,0.00458,0.006107,0.007634,0.007634,0.0,0.00916,0.00458,0.003053,0.003053,0.00458,0.00458,0.006107,0.001527,0.001527,0.001527,0.003053,0.003053,0.00458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,655
1,0.045541,0.030361,0.011385,0.030361,0.011385,0.00759,0.00759,0.011385,0.005693,0.003795,0.013283,0.0,0.00759,0.001898,0.009488,0.005693,0.011385,0.009488,0.005693,0.005693,0.005693,0.011385,0.003795,0.003795,0.0,0.0,0.005693,0.001898,0.013283,0.005693,0.003795,0.005693,0.003795,0.005693,0.013283,0.001898,0.003795,0.001898,0.005693,0.001898,0.00759,0.001898,0.005693,0.005693,0.0,0.0,0.003795,0.001898,0.005693,0.005693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,527
2793,0.061375,0.023779,0.019923,0.019602,0.013496,0.017031,0.004499,0.008676,0.006748,0.003856,0.005141,0.008997,0.005784,0.007391,0.008355,0.004177,0.006427,0.004177,0.006427,0.00964,0.002892,0.004499,0.005141,0.004177,0.006748,0.001285,0.002249,0.003535,0.007391,0.003856,0.004499,0.002892,0.002892,0.00482,0.006748,0.007069,0.003535,0.004499,0.004177,0.005784,0.006105,0.003856,0.004499,0.002892,0.003213,0.005784,0.003535,0.00482,0.004177,0.002249,...,0.000321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3112
696,0.036934,0.018815,0.025784,0.014634,0.007666,0.007666,0.008362,0.003484,0.012544,0.006969,0.002787,0.009756,0.004878,0.007666,0.006272,0.006969,0.004181,0.028571,0.004181,0.013937,0.006969,0.004181,0.005575,0.006272,0.001394,0.004878,0.003484,0.002787,0.008362,0.006969,0.002091,0.002787,0.002787,0.01115,0.009756,0.009756,0.003484,0.006969,0.005575,0.010453,0.002091,0.002787,0.004181,0.004181,0.003484,0.002787,0.005575,0.005575,0.005575,0.002091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000697,0.0,0.0,1435


In [173]:
# so we can map from the tweet in the word matrix to the tweet in the original tweets df
reorderedchars = dialogue_no_missing_ids.loc[wordfreqs.index, : ]
reorderedchars.head()

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
2356,2370,m538,u7950,KANSAS,sugar & spice,f,650,2001,"['comedy', 'crime']",True,False,False,False,"Well, use your head, ""Stalker Betty."" / NO-GAH...",1,1
924,924,m265,u3983,DELIA,beetle juice,f,655,1988,"['comedy', 'fantasy']",True,False,False,False,Not these people! They are ruthless! / I thin...,0,3
1,1,m0,u2,CAMERON,10 things i hate about you,m,527,1999,"['comedy', 'romance']",True,False,False,True,"They do to! / She okay? / Wow / No / The ""real...",2,3
2793,2815,m70,u1071,CHILI,get shorty,m,3112,1995,"['comedy', 'crime', 'thriller']",True,True,False,False,"You're kidding me, right? / I reserved a Cadil...",0,2
696,696,m222,u3345,EARL,tremors,m,1435,1990,"['action', 'comedy', 'horror', 'thriller']",True,True,False,False,What do <u>you</u> use it for? / What the hell...,0,3


In [174]:
# separate test and training data -- about 50 / 50 split

# test set
testfreqs = wordfreqs.iloc[0: 1473, : ]
test_y = (reorderedchars['gender'][0: 1473] == 'm').astype(int)  
test_y[0:10]

2356    0
924     0
1       1
2793    1
696     1
2136    0
170     1
1970    1
1742    0
2796    1
Name: gender, dtype: int64

In [175]:
# all remaining rows as the training set  
trainfreqs = wordfreqs.iloc[1473 : , : ]
train_y = (reorderedchars['gender'][1473 : ] == 'm').astype(int) 
train_y[0:10]

7       1
1968    0
2833    1
2717    1
1352    1
787     1
2868    1
482     0
2558    0
437     1
Name: gender, dtype: int64

In [176]:
# scale X matrices for training data
# normalizes cols -- col = (col-col.mean) / col.std (like a z score)
trainscaler = StandardScaler()
trainXscaled = trainscaler.fit_transform(trainfreqs)
trainXscaled = pd.DataFrame(trainXscaled, columns = trainfreqs.columns)
trainXscaled.head()

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,have,know,not,be,on,we,your,re,no,with,but,can,was,are,my,all,like,just,so,there,about,they,get,here,if,ll,out,up,how,at,one,think,now,want,go,right,got,ve,...,articles,critical,tempted,employed,backup,lamp,backed,glorious,toes,mainly,cursed,rum,tide,cape,yankee,skipper,sounding,buster,blessing,roman,phase,dork,lewis,hardest,prayers,approval,cept,brian,protective,reserve,morphine,hardware,represents,preparing,candidate,burial,punishment,rises,happiest,touches,depression,stool,landlord,ing,victoria,hose,trailer,sunlight,splendid,#dwordcount
0,0.983898,0.278752,-0.476618,0.680675,1.500901,-1.409882,0.764868,1.710217,0.190187,0.664045,-0.892373,1.044562,-0.283499,0.253442,2.170865,-0.150268,0.484485,0.244426,1.146733,2.229419,0.388632,-1.435264,-0.671866,-0.041651,-0.14517,-0.623616,1.322793,-0.534537,-0.510182,0.126167,-0.376509,-1.197084,1.037861,-0.984528,-0.449397,-0.383944,-0.391308,0.255788,-1.205038,-0.255446,-1.160095,0.71484,-1.04375,-0.193886,-1.051513,0.598236,0.718538,0.771487,-0.228916,2.317013,...,-0.058729,-0.064739,-0.065412,-0.050194,-0.063351,-0.065157,-0.046367,-0.063902,-0.039,-0.05315,-0.061715,-0.063716,-0.05369,-0.064971,-0.059642,-0.059906,-0.066128,-0.057137,-0.074044,-0.056131,-0.069532,-0.072485,-0.067309,-0.066624,-0.066824,-0.071028,-0.05508,-0.047644,-0.063554,-0.051452,-0.067294,-0.06215,-0.053735,-0.059514,-0.063794,-0.065918,-0.07431,-0.06402,-0.074777,-0.076231,-0.06159,-0.077366,-0.054953,-0.065396,-0.064172,-0.079674,-0.077766,-0.063117,-0.066083,-0.63424
1,-0.495297,0.104123,0.305611,0.342668,-0.574063,1.423448,0.232497,-0.308317,-0.117741,-0.248831,-0.187604,-0.272133,-0.214321,-0.020564,-0.437624,1.101025,-0.914593,-0.654905,-0.351742,-0.456036,1.595773,1.124985,0.872972,-0.576142,0.62423,-0.525501,-0.209499,-0.118118,-0.274096,0.718439,0.380818,-0.043305,0.955209,0.362224,-0.196353,-0.552668,-0.103254,0.204273,-0.751261,-0.127092,0.199634,-0.239193,-0.051765,-0.057287,0.054899,-0.2059,0.169411,0.201124,0.420595,0.904656,...,-0.058729,-0.064739,-0.065412,-0.050194,-0.063351,-0.065157,-0.046367,-0.063902,-0.039,4.485563,-0.061715,-0.063716,-0.05369,-0.064971,-0.059642,-0.059906,-0.066128,-0.057137,-0.074044,-0.056131,-0.069532,-0.072485,-0.067309,-0.066624,-0.066824,-0.071028,-0.05508,-0.047644,-0.063554,-0.051452,-0.067294,-0.06215,-0.053735,-0.059514,-0.063794,-0.065918,-0.07431,-0.06402,-0.074777,-0.076231,-0.06159,-0.077366,-0.054953,-0.065396,-0.064172,-0.079674,-0.077766,-0.063117,-0.066083,1.339571
2,1.637452,0.649342,-0.500485,0.778142,0.123654,-0.604064,-0.333361,0.110423,-0.446538,-0.074998,2.814915,0.207733,1.61608,0.41447,-0.56896,-0.167275,-0.985036,-0.442451,-0.241117,-0.668747,-1.256682,0.50897,-0.45689,1.192715,-0.503532,0.404892,0.471006,-1.304182,0.150344,-0.328923,-0.607797,-0.704238,-0.713277,-0.984528,-0.219502,-0.151197,0.414614,-0.682243,-1.205038,0.547708,1.744001,0.085972,-0.478771,5.855536,0.838939,1.599773,-0.41222,-0.403001,-0.930092,1.724244,...,-0.058729,-0.064739,-0.065412,-0.050194,-0.063351,-0.065157,-0.046367,-0.063902,-0.039,-0.05315,-0.061715,-0.063716,-0.05369,-0.064971,-0.059642,-0.059906,-0.066128,-0.057137,-0.074044,-0.056131,-0.069532,-0.072485,-0.067309,-0.066624,-0.066824,-0.071028,-0.05508,-0.047644,-0.063554,-0.051452,-0.067294,-0.06215,-0.053735,-0.059514,-0.063794,-0.065918,-0.07431,-0.06402,-0.074777,-0.076231,-0.06159,-0.077366,-0.054953,-0.065396,-0.064172,-0.079674,-0.077766,-0.063117,-0.066083,-0.389765
3,0.993352,-0.074502,-0.771798,0.410598,-0.204654,-0.653516,0.795373,-0.449167,-1.673072,3.979149,0.611819,-0.784008,-1.446677,-0.004724,5.328655,0.209641,-0.603391,-0.253177,0.595896,-1.340095,1.521564,3.240316,-0.490922,4.304785,-0.529807,0.341774,-1.169804,-1.304182,0.528474,-1.205569,-0.196179,-0.248909,1.635581,0.912775,-1.175655,-1.11921,0.875978,-0.257063,-1.205038,-0.03593,2.192176,0.039863,0.0432,-1.070705,0.160819,-0.952891,2.234929,-0.979225,-0.930092,-1.028969,...,-0.058729,-0.064739,-0.065412,-0.050194,-0.063351,-0.065157,-0.046367,-0.063902,-0.039,-0.05315,-0.061715,-0.063716,-0.05369,-0.064971,-0.059642,-0.059906,-0.066128,-0.057137,-0.074044,-0.056131,-0.069532,-0.072485,-0.067309,-0.066624,-0.066824,-0.071028,-0.05508,-0.047644,-0.063554,-0.051452,-0.067294,-0.06215,-0.053735,-0.059514,-0.063794,-0.065918,-0.07431,-0.06402,-0.074777,-0.076231,-0.06159,-0.077366,-0.054953,-0.065396,-0.064172,-0.079674,-0.077766,-0.063117,-0.066083,-0.733317
4,-0.008104,-0.721997,0.485879,0.695252,0.344315,0.266562,0.193836,0.336954,0.043711,0.539318,0.25813,0.195669,-0.642875,-0.224093,0.130892,-0.058451,0.336487,0.208798,-0.397764,0.646061,0.328196,-0.693078,0.184913,-0.035131,-0.27153,0.091791,0.396091,0.049633,0.344654,1.220632,0.137206,-0.350464,0.767544,-0.85904,0.101841,0.266513,-0.387051,-0.088373,0.176182,-0.044168,0.059366,0.493098,0.034617,0.691963,0.03097,0.021556,-0.013998,0.670519,0.03899,0.442436,...,-0.058729,-0.064739,-0.065412,-0.050194,-0.063351,-0.065157,-0.046367,-0.063902,-0.039,-0.05315,-0.061715,-0.063716,-0.05369,-0.064971,-0.059642,-0.059906,-0.066128,-0.057137,-0.074044,4.154938,-0.069532,-0.072485,-0.067309,-0.066624,-0.066824,-0.071028,-0.05508,-0.047644,-0.063554,-0.051452,-0.067294,-0.06215,-0.053735,-0.059514,-0.063794,-0.065918,-0.07431,-0.06402,-0.074777,-0.076231,-0.06159,-0.077366,-0.054953,-0.065396,-0.064172,-0.079674,-0.077766,-0.063117,-0.066083,2.643007


In [177]:
# scale X matrices for testing data
testscaler = StandardScaler()
testXscaled = testscaler.fit_transform(testfreqs)
testXscaled = pd.DataFrame(testXscaled, columns = testfreqs.columns)

In [178]:
# simple model
# train model
logist = LogisticRegression(C = .1, max_iter = 1000) 
logist.fit(trainXscaled, train_y)

# test model
predictions = logist.predict(testXscaled)
sum(predictions == test_y) / len(predictions)

0.7216564833672776

In [179]:
# more accurate refelction of performance of model
f1_score(test_y, predictions)

0.8084112149532708

In [180]:
# balance of classes -- m vs. f
dialogue_no_missing_ids.gender.value_counts()

m    2006
f     941
Name: gender, dtype: int64

In [181]:
# handling imbalanced classes
# weight classes inversely to frequency -- female char error counts more

# train
logist = LogisticRegression(C = .1, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
f1_score(test_y, predictions)

0.798654493032196

In [182]:
# cross validation -- select best C parameter using training data
for c_param in [.0001, .001, .01, .1, 1, 10, 100, 1000]:
    logist = LogisticRegression(C = c_param, max_iter = 1000, class_weight = 'balanced') 
    results = cross_validate(logist, trainXscaled, train_y, cv = 5, scoring = 'f1')
    print('C parameter:', c_param)
    print('Mean f1:', np.mean(results['test_score']))
    print()

C parameter: 0.0001
Mean f1: 0.8264233973524501

C parameter: 0.001
Mean f1: 0.8287093976434647

C parameter: 0.01
Mean f1: 0.8210241805891452

C parameter: 0.1
Mean f1: 0.8167796053290086

C parameter: 1
Mean f1: 0.8146994073464662

C parameter: 10
Mean f1: 0.8148645811008606

C parameter: 100
Mean f1: 0.812957339878816

C parameter: 1000
Mean f1: 0.809417161663126



In [183]:
# use best C param to train whole training set
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
accuracy = f1_score(test_y, predictions)
print("Accuracy for entire training set is: ", accuracy)

Accuracy for entire training set is:  0.8028915662650602


In [184]:
# need to split test data -- into groups by writer gender (at least 1 female, no female writers)

reorderedchars_test = reorderedchars.iloc[0:1473, :] # test data rows

reorderedchars_test_f = reorderedchars_test[reorderedchars_test['num_of_female_writers'] >= 1]

reorderedchars_test_no_f = reorderedchars_test[reorderedchars_test['num_of_female_writers'] == 0]
reorderedchars_test_f

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
2356,2370,m538,u7950,KANSAS,sugar & spice,f,650,2001,"['comedy', 'crime']",True,False,False,False,"Well, use your head, ""Stalker Betty."" / NO-GAH...",1,1
1,1,m0,u2,CAMERON,10 things i hate about you,m,527,1999,"['comedy', 'romance']",True,False,False,True,"They do to! / She okay? / Wow / No / The ""real...",2,3
2136,2150,m499,u7372,ANNIE,sleepless in seattle,f,1591,1993,"['comedy', 'romance', 'drama']",True,False,True,True,"You turn it on, you open it and you stand in f...",1,3
1777,1780,m427,u6409,LENA,love & basketball,f,150,2000,"['drama', 'romance', 'sport']",False,False,True,True,He's fine. / The doctor said it was okay. Any...,1,1
1323,1326,m337,u5105,YODA,star wars: the empire strikes back,m,536,1982,"['animation', 'adventure', 'action', 'fantasy']",False,False,False,False,"No ... there is another. / Told you, I did. R...",1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,1733,m42,u664,ILSA,casablanca,f,1374,1942,"['drama', 'romance', 'war']",False,False,True,True,And the other alternative? / What are they? / ...,1,6
2519,2533,m569,u8394,GUILD,the thin man,m,808,1934,"['comedy', 'crime', 'drama', 'mystery', 'roman...",True,False,True,True,Shut up. / You mean that body-----? / Why didn...,1,3
1631,1634,m399,u6013,LOUIS,interview with the vampire: the vampire chroni...,m,1926,1994,"['drama', 'fantasy']",False,False,True,False,"What a pair we are. We deserve each other, don...",1,1
2330,2344,m533,u7885,RACHEL,stepmom,f,2546,1998,"['comedy', 'drama']",True,False,True,False,"A <u>suitable</u> boy, will be at <u>this</u> ...",3,5


In [185]:
# test accuracy on chars from movies w/ at least 1 female writer

# separate out test set rows w/ at least 1 female writer
testfreqs_female = wordfreqs.loc[reorderedchars_test_f.index, :]
test_y_f = (reorderedchars_test_f['gender'][:] == 'm').astype(int)

# scale test data w/ atleast 1 female writer
testscaler = StandardScaler()
testXscaled_f = testscaler.fit_transform(testfreqs_female)
testXscaled_f = pd.DataFrame(testXscaled_f, columns = testfreqs_female.columns)

In [186]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_f)
accuracy = f1_score(test_y_f, predictions)
print("Accuracy for female writers is: ", accuracy)

Accuracy for female writers is:  0.7232704402515723


In [187]:
# test accuracy on chars from movies w/ no female writers

# separate out test set rows w/ at least 1 female writer
testfreqs_no_female = wordfreqs.loc[reorderedchars_test_no_f.index, :]
test_y_no_f = (reorderedchars_test_no_f['gender'][:] == 'm').astype(int)

# scale test data w/ atleast 1 female writer
testscaler = StandardScaler()
testXscaled_no_f = testscaler.fit_transform(testfreqs_no_female)
testXscaled_no_f = pd.DataFrame(testfreqs_no_female, columns = testfreqs_no_female.columns)

In [188]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_no_f)
accuracy = f1_score(test_y_no_f, predictions)
print("Accuracy for no female writers is: ", accuracy)

Accuracy for no female writers is:  0.817441303306181


In [189]:
# classifier model predictive features
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)
coefficients = [x for x in zip(logist.coef_[0], vectorizer.get_feature_names())]

In [190]:
coefficients.sort()

In [191]:
coefficients[0:25] # female

[(-0.12293272269011926, 'oh'),
 (-0.10467454336203595, 'so'),
 (-0.08830751045598617, 'scared'),
 (-0.07893957407509333, 'love'),
 (-0.07508727753717478, 'horrible'),
 (-0.0736215172984294, 'honey'),
 (-0.07112436625007935, 'just'),
 (-0.07036371505175973, 'child'),
 (-0.06974639836829002, 'decided'),
 (-0.06847894953437114, 'dna'),
 (-0.06816901719297958, 'god'),
 (-0.06693226179253335, 'cheek'),
 (-0.06606741706124424, 'loved'),
 (-0.0654420545300063, 'because'),
 (-0.06538035572080945, 'laundry'),
 (-0.06323717581217568, 'tank'),
 (-0.06310041637841565, 'silly'),
 (-0.06262011184385084, 'husband'),
 (-0.0616288764053161, 'farm'),
 (-0.06080064713372368, 'psychological'),
 (-0.05909761158954607, 'bed'),
 (-0.058476443210239835, 'pregnant'),
 (-0.05799685753420104, 'alike'),
 (-0.057155546408988685, 'romantic'),
 (-0.05651494784928393, 'shopping')]

In [192]:
coefficients[-25:] # male

[(0.0504129828841039, 'fits'),
 (0.05076084463111386, 'played'),
 (0.05079590505005346, 'guide'),
 (0.05129129108768319, 'catch'),
 (0.05139603771414082, 'diamonds'),
 (0.05178896480854318, 'cigar'),
 (0.052123269698358364, 'we'),
 (0.05253877432640063, 'hell'),
 (0.05327537888918756, 'jumped'),
 (0.053969127411494636, 'carefully'),
 (0.054163743102122894, 'pretty'),
 (0.05430773817469897, 'listen'),
 (0.05436318817012273, 'cart'),
 (0.05444098002507486, 'ah'),
 (0.054723715055988345, 'eh'),
 (0.05489513044390223, 'problem'),
 (0.055940684723530754, 'abuse'),
 (0.05734270221062403, 'trying'),
 (0.05943541374213509, 'bible'),
 (0.05968382998084896, 'got'),
 (0.06073334903958507, 'uh'),
 (0.06134715383698294, 'suddenly'),
 (0.06253067469163573, 'man'),
 (0.06834179757585543, 'throwing'),
 (0.08877934215172156, 'hey')]