In [46]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from pathlib import Path
import math
from collections import Counter
import re

In [47]:
pd.set_option("display.max_rows", 20, "display.max_columns", 100)

In [48]:
# dialogue data for each character
dialogpath = Path('../project_data/movie_dialogue.tsv')
dialogue = pd.read_csv(dialogpath, sep = '\t')

dialogue.nunique()

mid           600
cid          2969
cname        1925
mname         600
gender          4
wordcount    1428
year           74
genres        283
comedy          2
thriller        2
drama           2
romance         2
lines        2969
dtype: int64

In [49]:
len(dialogue)

2969

In [50]:
# movie writer gender data
genderpath = Path('../data_processing_code/dialogue_writers_gender_with_counts.csv')
genderdf = pd.read_csv(genderpath, encoding = 'utf-8')
genderdf

Unnamed: 0.1,Unnamed: 0,mid,imdb_id,writer_ids,writer_names,writer_gender,num_of_female_writers,total_num_of_writers,all_female_writers
0,0,m0,tt0147800,"['nm0527581', 'nm0809006', 'nm0000636']","['Karen McCullah', 'Kirsten Smith', 'William S...","['F', 'F', 'M']",2,3,False
1,1,m1,tt0103594,['nm0097785'],['Rose Bosch'],['F'],1,1,True
2,2,m2,tt0179626,['nm0381273'],['John Herzfeld'],['M'],0,1,False
3,3,m3,tt0062622,"['nm0000040', 'nm0002009']","['Stanley Kubrick', 'Arthur C. Clarke']","['M', 'M']",0,2,False
4,4,m4,tt0083511,"['nm0006854', 'nm0001353', 'nm0343419', 'nm021...","['Roger Spottiswoode', 'Walter Hill', 'Larry G...","['M', 'M', 'M', 'M']",0,4,False
...,...,...,...,...,...,...,...,...,...
589,589,m610,tt0032138,"['nm0486538', 'nm0753249', 'nm0941138', 'nm000...","['Noel Langley', 'Florence Ryerson', 'Edgar Al...","['M', 'F', 'M', 'M', 'M', 'M', 'M', 'M', 'M', ...",1,19,False
590,590,m611,tt0143145,"['nm0701031', 'nm0905498', 'nm0270761', 'nm000...","['Neal Purvis', 'Robert Wade', 'Bruce Feirstei...","['M', 'M', 'M', 'M']",0,4,False
591,591,m612,tt0409459,"['nm1733301', 'nm0371684', 'nm0874844']","['Dave Gibbons', 'David Hayter', 'Alex Tse']","['M', 'M', 'M']",0,3,False
592,592,m613,tt0295701,['nm0929186'],['Rich Wilkes'],['M'],0,1,False


In [51]:
# drop characters from movies with no imdb id / writer info
mids_no_imdb_id = ['m449', 'm310', 'm457', 'm488', 'm430', 'm605']


dialogue_no_missing_ids = dialogue[dialogue['mid'] != 'm449']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm310']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm457']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm488']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm430']
dialogue_no_missing_ids = dialogue_no_missing_ids[dialogue_no_missing_ids['mid'] != 'm605']

print(len(dialogue_no_missing_ids)) # 2947
dialogue_no_missing_ids.nunique() # should be 594 unique mids

2947


mid           594
cid          2947
cname        1915
mname         594
gender          4
wordcount    1421
year           74
genres        279
comedy          2
thriller        2
drama           2
romance         2
lines        2947
dtype: int64

In [52]:
dialogue_no_missing_ids = dialogue_no_missing_ids.reset_index()

In [53]:
dialogue_no_missing_ids.gender.unique()

array(['f', 'm', 'M', 'F'], dtype=object)

In [54]:
# make all gender labels lower case
dialogue_no_missing_ids.gender = dialogue_no_missing_ids.gender.str.lower()

In [55]:
dialogue_no_missing_ids.head(5)

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines
0,0,m0,u0,BIANCA,10 things i hate about you,f,959,1999,"['comedy', 'romance']",True,False,False,True,They do not! / I hope so. / Let's go. / Okay -...
1,1,m0,u2,CAMERON,10 things i hate about you,m,527,1999,"['comedy', 'romance']",True,False,False,True,"They do to! / She okay? / Wow / No / The ""real..."
2,2,m0,u4,JOEY,10 things i hate about you,m,278,1999,"['comedy', 'romance']",True,False,False,True,"Listen, I want to talk to you about the prom. ..."
3,3,m0,u5,KAT,10 things i hate about you,f,1217,1999,"['comedy', 'romance']",True,False,False,True,Perm? / It's just you. / What? To completely d...
4,4,m0,u6,MANDELLA,10 things i hate about you,f,157,1999,"['comedy', 'romance']",True,False,False,True,William - he asked me to meet him here. / Have...


In [56]:
# add cols for writer data -- add writer data to each char
dialogue_no_missing_ids['num_of_female_writers'] = ""
dialogue_no_missing_ids['total_writers'] = ""

In [57]:
# map movie id for each char to num of female writers and total num of writers 
for i, row in dialogue_no_missing_ids.iterrows():
    mid = dialogue_no_missing_ids.at[i, 'mid']
    match = genderdf[genderdf['mid'] == mid]
    f_num = match.iloc[0, 6] # grabs the num_of_female_writers col value
    total = match.iloc[0, 7] # grabs the total_num_of_writers col value
    dialogue_no_missing_ids.at[i, 'num_of_female_writers'] = f_num
    dialogue_no_missing_ids.at[i, 'total_writers'] = total

In [58]:
# split chars into test / training sets BEFORE doing feature selection

#shuffle chars 
dialogue_no_missing_ids = dialogue_no_missing_ids.sample(frac = 1)
dialogue_no_missing_ids.head()

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
208,208,m138,u2130,PAULA,my mother dreams the satan's disciples in new ...,f,303,1998,['short'],False,False,False,False,"Well, bye. And have a safe trip ... and ... P...",0,1
2605,2619,m586,u8633,CLARENCE,true romance,m,4007,1993,"['crime', 'romance', 'thriller']",False,True,False,True,That rhymes. / It is what I want. But I don't ...,0,2
65,65,m108,u1649,HOOPER,jaws,m,1254,1975,['thriller'],False,True,False,False,I can't imagine why. / Just keep kicking. / We...,0,2
1603,1606,m393,u5942,TERRI,hellraiser iii: hell on earth,f,698,1992,['horror'],False,False,False,False,Joey ... ? / What am I looking for? / Here! H...,0,3
363,363,m162,u2499,BOB,pleasantville,m,415,1998,"['comedy', 'drama', 'fantasy']",True,False,True,False,What happened? / Real rain? / Are you alright?...,0,1


In [59]:
# select training data
trainchars = dialogue_no_missing_ids.iloc[0 : 1473, : ]
trainchars.head()

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
208,208,m138,u2130,PAULA,my mother dreams the satan's disciples in new ...,f,303,1998,['short'],False,False,False,False,"Well, bye. And have a safe trip ... and ... P...",0,1
2605,2619,m586,u8633,CLARENCE,true romance,m,4007,1993,"['crime', 'romance', 'thriller']",False,True,False,True,That rhymes. / It is what I want. But I don't ...,0,2
65,65,m108,u1649,HOOPER,jaws,m,1254,1975,['thriller'],False,True,False,False,I can't imagine why. / Just keep kicking. / We...,0,2
1603,1606,m393,u5942,TERRI,hellraiser iii: hell on earth,f,698,1992,['horror'],False,False,False,False,Joey ... ? / What am I looking for? / Here! H...,0,3
363,363,m162,u2499,BOB,pleasantville,m,415,1998,"['comedy', 'drama', 'fantasy']",True,False,True,False,What happened? / Real rain? / Are you alright?...,0,1


In [60]:
# select testing data
testchars = dialogue_no_missing_ids.iloc[1473 :, : ]
testchars.head()

Unnamed: 0,index,mid,cid,cname,mname,gender,wordcount,year,genres,comedy,thriller,drama,romance,lines,num_of_female_writers,total_writers
1249,1252,m324,u4863,SAL,dog day afternoon,m,348,1975,"['crime', 'drama']",False,False,True,False,You gonna smoke the cigarette? / You're really...,0,4
2438,2452,m552,u8137,STACY,the limey,m,412,1999,"['crime', 'drama', 'mystery', 'thriller']",False,True,True,False,"Fuck you, Mr. whatever-your-name is. This is a...",0,1
2022,2035,m471,u7039,BELINDA,philadelphia,f,414,1993,['drama'],False,False,True,False,"Thank you. / Answer the question, please. / Do...",0,1
1735,1738,m42,u669,STRASSER,casablanca,m,820,1942,"['drama', 'romance', 'war']",False,False,True,True,"My dear Mademoiselle, perhaps you have already...",1,6
914,914,m263,u3961,DAVID,bean,m,1186,1997,"['comedy', 'family']",True,False,False,False,I'm afraid you don't know the half of it. Sit ...,0,3


In [61]:
# feature selection -- build feature vocabulary ONLY using training data
# using doc frequency (count of how many docs / char lines a word appears in), instead of simply overall word count

vocab = Counter()

stopwords = ['the', 'of', 'and', 'she', 'her', 'he','him']

for char in trainchars['lines']:
    words = re.split('\W', char)
    lowercase = [w.lower() for w in words] 
    unique_words = set([w for w in lowercase if len(w) > 1])  # get rid of one-letter words
    for w in unique_words:
        if w not in stopwords and not w.isdigit(): # don't include numbers and stopwords
            vocab[w.lower()] += 1
        
vocab = vocab.most_common(5000) # returns pairs of keys and counts for words with highest counts


In [62]:
wordfeatures = [x[0] for x in vocab]   
docfreqs = [x[1] for x in vocab]

In [63]:
# create doc term matrix for training data

vectorizer = CountVectorizer(vocabulary = wordfeatures) 

train_char_index = trainchars.index

sparse_counts = vectorizer.fit_transform(trainchars['lines'])
                                                            
train_char_word_counts = pd.DataFrame(sparse_counts.toarray(), index = train_char_index,
                            columns = vectorizer.get_feature_names())
train_char_word_counts.head()

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,we,know,have,on,not,be,re,with,no,your,can,but,are,was,my,all,like,so,about,there,just,here,they,out,get,if,ll,how,up,think,at,one,want,now,ve,right,got,go,...,activity,evolution,tissue,attacking,dynamite,partial,apologizing,courtroom,dan,yonder,debts,october,harassment,burglary,crowded,footprints,length,edition,variety,ram,temple,shaky,attract,blink,corpse,tossed,reporting,briefcase,musical,entertain,pony,mademoiselle,pigeon,prob,whaddya,pervert,phoenix,sailing,spark,kin,einstein,dried,shore,invent,scent,illusion,tales,roses,mutual,convincing
208,21,13,10,5,2,4,2,2,0,2,3,7,0,1,4,1,1,2,0,2,4,1,0,2,0,1,3,0,3,1,3,1,1,1,0,2,2,0,2,2,1,4,1,0,3,0,2,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2605,161,88,97,54,62,39,31,41,30,26,35,26,26,33,21,26,24,28,24,30,21,26,25,29,15,21,29,21,22,24,22,13,25,15,15,13,21,22,15,13,23,13,17,9,11,14,6,1,14,10,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
65,17,24,31,16,5,11,14,5,4,12,2,11,6,2,4,8,4,7,3,11,3,6,12,7,6,11,5,1,4,2,3,13,6,9,5,11,2,7,4,1,6,2,3,10,1,0,8,5,3,4,...,1,1,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1603,30,8,30,9,8,5,4,11,5,6,5,7,0,9,5,2,6,1,2,4,5,2,1,3,1,12,4,2,13,4,3,8,7,4,1,4,2,2,5,1,1,2,4,2,2,1,6,6,1,2,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
363,27,12,10,8,5,6,1,2,3,9,9,3,2,3,3,4,5,2,4,1,3,0,1,1,1,2,0,2,1,0,1,0,3,0,3,1,1,1,0,0,1,1,2,2,1,0,1,1,0,0,...,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [64]:
# create doc term matrix for testing data

vectorizer = CountVectorizer(vocabulary = wordfeatures)

test_char_index = testchars.index # preserve indexes to map back to metadata in chars df

sparse_counts = vectorizer.fit_transform(testchars['lines'])
                                                            
test_char_word_counts = pd.DataFrame(sparse_counts.toarray(), index = test_char_index,
                            columns = vectorizer.get_feature_names())
test_char_word_counts.head()
len(test_char_word_counts)

1474

In [65]:
# calculate word freq from word counts and factor out dialogue length
train_dwordcount = trainchars['wordcount']
test_dwordcount = testchars['wordcount']
train_dwordcount


208      303
2605    4007
65      1254
1603     698
363      415
        ... 
512      375
2420    3445
2400     295
1604     626
2385     533
Name: wordcount, Length: 1473, dtype: int64

In [66]:
# gives us relative word freq for each char 
train_wordfreqs = train_char_word_counts.divide(train_dwordcount, axis = 'rows')
test_wordfreqs = test_char_word_counts.divide(test_dwordcount, axis = 'rows')
len(train_wordfreqs)

1473

In [67]:
train_wordfreqs

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,we,know,have,on,not,be,re,with,no,your,can,but,are,was,my,all,like,so,about,there,just,here,they,out,get,if,ll,how,up,think,at,one,want,now,ve,right,got,go,...,activity,evolution,tissue,attacking,dynamite,partial,apologizing,courtroom,dan,yonder,debts,october,harassment,burglary,crowded,footprints,length,edition,variety,ram,temple,shaky,attract,blink,corpse,tossed,reporting,briefcase,musical,entertain,pony,mademoiselle,pigeon,prob,whaddya,pervert,phoenix,sailing,spark,kin,einstein,dried,shore,invent,scent,illusion,tales,roses,mutual,convincing
208,0.069307,0.042904,0.033003,0.016502,0.006601,0.013201,0.006601,0.006601,0.000000,0.006601,0.009901,0.023102,0.000000,0.003300,0.013201,0.003300,0.003300,0.006601,0.000000,0.006601,0.013201,0.003300,0.000000,0.006601,0.000000,0.003300,0.009901,0.000000,0.009901,0.003300,0.009901,0.003300,0.003300,0.003300,0.000000,0.006601,0.006601,0.000000,0.006601,0.006601,0.003300,0.013201,0.003300,0.000000,0.009901,0.000000,0.006601,0.003300,0.000000,0.003300,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2605,0.040180,0.021962,0.024208,0.013476,0.015473,0.009733,0.007736,0.010232,0.007487,0.006489,0.008735,0.006489,0.006489,0.008236,0.005241,0.006489,0.005990,0.006988,0.005990,0.007487,0.005241,0.006489,0.006239,0.007237,0.003743,0.005241,0.007237,0.005241,0.005490,0.005990,0.005490,0.003244,0.006239,0.003743,0.003743,0.003244,0.005241,0.005490,0.003743,0.003244,0.005740,0.003244,0.004243,0.002246,0.002745,0.003494,0.001497,0.000250,0.003494,0.002496,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,0.013557,0.019139,0.024721,0.012759,0.003987,0.008772,0.011164,0.003987,0.003190,0.009569,0.001595,0.008772,0.004785,0.001595,0.003190,0.006380,0.003190,0.005582,0.002392,0.008772,0.002392,0.004785,0.009569,0.005582,0.004785,0.008772,0.003987,0.000797,0.003190,0.001595,0.002392,0.010367,0.004785,0.007177,0.003987,0.008772,0.001595,0.005582,0.003190,0.000797,0.004785,0.001595,0.002392,0.007974,0.000797,0.000000,0.006380,0.003987,0.002392,0.003190,...,0.000797,0.000797,0.001595,0.000797,0.000797,0.000797,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1603,0.042980,0.011461,0.042980,0.012894,0.011461,0.007163,0.005731,0.015759,0.007163,0.008596,0.007163,0.010029,0.000000,0.012894,0.007163,0.002865,0.008596,0.001433,0.002865,0.005731,0.007163,0.002865,0.001433,0.004298,0.001433,0.017192,0.005731,0.002865,0.018625,0.005731,0.004298,0.011461,0.010029,0.005731,0.001433,0.005731,0.002865,0.002865,0.007163,0.001433,0.001433,0.002865,0.005731,0.002865,0.002865,0.001433,0.008596,0.008596,0.001433,0.002865,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001433,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
363,0.065060,0.028916,0.024096,0.019277,0.012048,0.014458,0.002410,0.004819,0.007229,0.021687,0.021687,0.007229,0.004819,0.007229,0.007229,0.009639,0.012048,0.004819,0.009639,0.002410,0.007229,0.000000,0.002410,0.002410,0.002410,0.004819,0.000000,0.004819,0.002410,0.000000,0.002410,0.000000,0.007229,0.000000,0.007229,0.002410,0.002410,0.002410,0.000000,0.000000,0.002410,0.002410,0.004819,0.004819,0.002410,0.000000,0.002410,0.002410,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00241,0.00241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,0.013333,0.029333,0.045333,0.005333,0.008000,0.016000,0.013333,0.002667,0.010667,0.013333,0.002667,0.016000,0.021333,0.000000,0.010667,0.002667,0.008000,0.010667,0.010667,0.000000,0.005333,0.005333,0.005333,0.002667,0.005333,0.002667,0.008000,0.002667,0.002667,0.000000,0.002667,0.008000,0.005333,0.013333,0.010667,0.002667,0.002667,0.002667,0.010667,0.002667,0.002667,0.000000,0.005333,0.002667,0.000000,0.002667,0.002667,0.002667,0.002667,0.002667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2420,0.046154,0.026125,0.024383,0.018868,0.009579,0.008999,0.008999,0.018578,0.003193,0.006676,0.007257,0.010740,0.006967,0.012482,0.006967,0.003774,0.004935,0.006386,0.009289,0.005806,0.003483,0.005515,0.006096,0.008708,0.003483,0.003193,0.010740,0.002903,0.011030,0.007257,0.003774,0.004935,0.007257,0.002612,0.001742,0.004935,0.003193,0.002903,0.004935,0.002612,0.003774,0.006676,0.003483,0.002032,0.004644,0.003483,0.001742,0.005225,0.001161,0.001742,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2400,0.030508,0.044068,0.016949,0.033898,0.006780,0.006780,0.006780,0.003390,0.006780,0.006780,0.010169,0.003390,0.006780,0.003390,0.013559,0.013559,0.000000,0.006780,0.006780,0.000000,0.006780,0.003390,0.010169,0.003390,0.000000,0.013559,0.000000,0.000000,0.003390,0.000000,0.003390,0.006780,0.006780,0.003390,0.013559,0.003390,0.000000,0.003390,0.003390,0.000000,0.003390,0.000000,0.006780,0.006780,0.006780,0.003390,0.000000,0.003390,0.003390,0.003390,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1604,0.036741,0.019169,0.015974,0.011182,0.006390,0.009585,0.003195,0.004792,0.006390,0.003195,0.006390,0.006390,0.012780,0.009585,0.001597,0.007987,0.004792,0.009585,0.014377,0.003195,0.001597,0.003195,0.004792,0.006390,0.003195,0.006390,0.006390,0.000000,0.001597,0.003195,0.004792,0.003195,0.003195,0.000000,0.006390,0.006390,0.006390,0.000000,0.006390,0.003195,0.001597,0.001597,0.001597,0.001597,0.001597,0.006390,0.001597,0.003195,0.003195,0.003195,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
test_wordfreqs

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,we,know,have,on,not,be,re,with,no,your,can,but,are,was,my,all,like,so,about,there,just,here,they,out,get,if,ll,how,up,think,at,one,want,now,ve,right,got,go,...,activity,evolution,tissue,attacking,dynamite,partial,apologizing,courtroom,dan,yonder,debts,october,harassment,burglary,crowded,footprints,length,edition,variety,ram,temple,shaky,attract,blink,corpse,tossed,reporting,briefcase,musical,entertain,pony,mademoiselle,pigeon,prob,whaddya,pervert,phoenix,sailing,spark,kin,einstein,dried,shore,invent,scent,illusion,tales,roses,mutual,convincing
1249,0.083333,0.031609,0.008621,0.028736,0.022989,0.014368,0.002874,0.002874,0.000000,0.000000,0.025862,0.008621,0.020115,0.005747,0.005747,0.000000,0.005747,0.000000,0.008621,0.000000,0.005747,0.008621,0.000000,0.014368,0.005747,0.000000,0.000000,0.002874,0.000000,0.000000,0.014368,0.005747,0.002874,0.011494,0.005747,0.005747,0.005747,0.011494,0.002874,0.002874,0.002874,0.002874,0.000000,0.000000,0.014368,0.014368,0.000000,0.011494,0.005747,0.008621,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002874,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2438,0.019417,0.021845,0.007282,0.007282,0.012136,0.016990,0.014563,0.004854,0.004854,0.014563,0.007282,0.012136,0.004854,0.014563,0.004854,0.007282,0.004854,0.004854,0.004854,0.002427,0.002427,0.007282,0.004854,0.002427,0.002427,0.002427,0.000000,0.002427,0.012136,0.004854,0.007282,0.004854,0.012136,0.000000,0.007282,0.002427,0.000000,0.002427,0.004854,0.000000,0.004854,0.002427,0.002427,0.002427,0.002427,0.002427,0.000000,0.002427,0.002427,0.004854,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.002427,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022,0.072464,0.026570,0.012077,0.019324,0.004831,0.024155,0.009662,0.000000,0.000000,0.004831,0.019324,0.000000,0.002415,0.002415,0.012077,0.014493,0.004831,0.004831,0.002415,0.007246,0.002415,0.021739,0.000000,0.002415,0.000000,0.004831,0.000000,0.000000,0.000000,0.004831,0.000000,0.007246,0.000000,0.000000,0.007246,0.002415,0.000000,0.000000,0.002415,0.000000,0.000000,0.000000,0.014493,0.007246,0.000000,0.000000,0.002415,0.004831,0.000000,0.000000,...,0.002415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1735,0.054878,0.024390,0.010976,0.012195,0.006098,0.035366,0.019512,0.003659,0.012195,0.006098,0.006098,0.002439,0.014634,0.002439,0.012195,0.008537,0.008537,0.010976,0.001220,0.004878,0.003659,0.004878,0.003659,0.003659,0.009756,0.001220,0.004878,0.004878,0.001220,0.002439,0.004878,0.003659,0.001220,0.004878,0.000000,0.000000,0.002439,0.007317,0.000000,0.002439,0.002439,0.002439,0.004878,0.008537,0.001220,0.002439,0.000000,0.000000,0.000000,0.001220,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.007317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
914,0.025295,0.032884,0.022766,0.009275,0.005902,0.012648,0.007589,0.008432,0.005059,0.009275,0.003373,0.002530,0.006745,0.009275,0.008432,0.003373,0.005902,0.005059,0.005059,0.009275,0.006745,0.001686,0.001686,0.005902,0.003373,0.006745,0.005059,0.005059,0.005902,0.002530,0.001686,0.004216,0.006745,0.005059,0.005059,0.002530,0.002530,0.002530,0.005059,0.001686,0.001686,0.002530,0.004216,0.005902,0.000000,0.003373,0.005902,0.006745,0.002530,0.004216,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,0.022173,0.035477,0.028825,0.013304,0.011086,0.024390,0.006652,0.002217,0.002217,0.000000,0.015521,0.002217,0.004435,0.000000,0.008869,0.004435,0.004435,0.004435,0.000000,0.004435,0.002217,0.000000,0.008869,0.008869,0.002217,0.011086,0.002217,0.004435,0.006652,0.004435,0.004435,0.004435,0.002217,0.000000,0.000000,0.000000,0.002217,0.006652,0.000000,0.000000,0.002217,0.011086,0.008869,0.000000,0.004435,0.004435,0.002217,0.004435,0.006652,0.006652,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2762,0.044082,0.015918,0.017143,0.017551,0.012653,0.011020,0.012245,0.008571,0.005714,0.010612,0.006939,0.013878,0.013469,0.003265,0.003673,0.006531,0.003673,0.003265,0.006939,0.005306,0.002857,0.009796,0.002857,0.004082,0.006122,0.001224,0.007347,0.006939,0.003673,0.004082,0.008980,0.002857,0.008571,0.003673,0.005714,0.003265,0.008163,0.002041,0.005306,0.006122,0.004490,0.001633,0.002449,0.002449,0.002041,0.006122,0.001224,0.004490,0.002041,0.003673,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000408,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.053797,0.041139,0.015823,0.018987,0.018987,0.009494,0.006329,0.003165,0.009494,0.009494,0.018987,0.000000,0.006329,0.003165,0.003165,0.006329,0.000000,0.006329,0.003165,0.006329,0.003165,0.006329,0.009494,0.003165,0.009494,0.006329,0.003165,0.000000,0.000000,0.003165,0.009494,0.006329,0.009494,0.000000,0.018987,0.000000,0.003165,0.003165,0.006329,0.003165,0.003165,0.006329,0.000000,0.006329,0.000000,0.003165,0.003165,0.003165,0.006329,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
524,0.046036,0.026087,0.013811,0.012276,0.013299,0.011253,0.011253,0.008696,0.009719,0.006138,0.005115,0.006650,0.020460,0.001023,0.007161,0.009207,0.003581,0.010742,0.006650,0.001023,0.004604,0.007161,0.008696,0.003069,0.006138,0.002558,0.002558,0.004092,0.001535,0.002046,0.003581,0.002046,0.002558,0.003069,0.004092,0.003069,0.003581,0.005627,0.007161,0.007161,0.003069,0.004092,0.002046,0.001023,0.001535,0.005115,0.004604,0.004092,0.002558,0.001535,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# add dialogue word count col for each character 
train_wordfreqs['#dwordcount'] = train_dwordcount
test_wordfreqs['#dwordcount'] = test_dwordcount


In [70]:
# get y predictions for test and training sets
train_y = (trainchars['gender'] == 'm').astype(int)
test_y = (testchars['gender'] == 'm').astype(int)

In [71]:
# scale X matrices for training data
# normalizes cols -- col = (col-col.mean) / col.std (like a z score)
trainscaler = StandardScaler()
trainXscaled = trainscaler.fit_transform(train_wordfreqs)
trainXscaled = pd.DataFrame(trainXscaled, columns = train_wordfreqs.columns)
trainXscaled.head()

Unnamed: 0,you,to,it,that,what,in,is,me,for,this,do,don,we,know,have,on,not,be,re,with,no,your,can,but,are,was,my,all,like,so,about,there,just,here,they,out,get,if,ll,how,up,think,at,one,want,now,ve,right,got,go,...,evolution,tissue,attacking,dynamite,partial,apologizing,courtroom,dan,yonder,debts,october,harassment,burglary,crowded,footprints,length,edition,variety,ram,temple,shaky,attract,blink,corpse,tossed,reporting,briefcase,musical,entertain,pony,mademoiselle,pigeon,prob,whaddya,pervert,phoenix,sailing,spark,kin,einstein,dried,shore,invent,scent,illusion,tales,roses,mutual,convincing,#dwordcount
0,1.212411,1.976436,1.271847,0.279978,-0.786616,0.48375,-0.431643,-0.517215,-1.667271,-0.204792,0.487786,2.715818,-1.154011,-0.725752,1.325286,-0.617592,-0.634982,0.133732,-1.293482,0.302918,1.399491,-0.574788,-1.32743,0.314538,-1.215329,-0.571294,0.674715,-1.30319,1.185721,-0.283589,1.346314,-0.309719,-0.394584,-0.231499,-0.975157,0.716872,0.642691,-1.158309,0.583973,0.903522,-0.097846,2.849743,0.016948,-1.01555,1.742021,-1.032475,0.937324,0.05963,-0.93688,0.084936,...,-0.068034,-0.055717,-0.065895,-0.057079,-0.055384,-0.063019,-0.064193,-0.04584,-0.061982,-0.060425,-0.065494,-0.060107,-0.06405,-0.062469,-0.052782,-0.051382,-0.059995,-0.068495,-0.062961,-0.045029,-0.064003,-0.063966,-0.060133,-0.068208,-0.069371,-0.043359,-0.05411,-0.042461,-0.06865,-0.051302,-0.057664,-0.065204,-0.063079,-0.063563,-0.056659,-0.057852,-0.062799,-0.057222,-0.063459,-0.061689,-0.062635,-0.057601,-0.069526,-0.063459,-0.057457,-0.067151,-0.05529,-0.054892,-0.058892,-0.700265
1,-0.465006,-0.36906,0.332345,-0.155204,0.443119,-0.139695,-0.232202,0.026861,0.060747,-0.226154,0.263903,-0.264223,-0.323944,0.250565,-0.292295,0.111391,-0.020298,0.226385,-0.221419,0.534955,-0.206349,-0.049526,0.131117,0.476065,-0.402627,-0.203554,0.172379,0.134627,0.152679,0.428163,0.22635,-0.324752,0.339645,-0.10776,-0.303305,-0.241232,0.27308,0.438179,-0.1503,-0.131872,0.615783,-0.111472,0.340131,-0.326541,-0.224176,0.123725,-0.59492,-0.850334,-0.032623,-0.165402,...,-0.068034,-0.055717,-0.065895,-0.057079,-0.055384,-0.063019,-0.064193,-0.04584,-0.061982,-0.060425,-0.065494,-0.060107,-0.06405,-0.062469,-0.052782,-0.051382,-0.059995,-0.068495,-0.062961,-0.045029,-0.064003,-0.063966,-0.060133,-0.068208,-0.069371,-0.043359,-0.05411,-0.042461,-0.06865,-0.051302,-0.057664,-0.065204,-0.063079,-0.063563,-0.056659,-0.057852,-0.062799,-0.057222,-0.063459,-0.061689,-0.062635,-0.057601,-0.069526,-0.063459,-0.057457,-0.067151,-0.05529,-0.054892,-0.058892,4.356544
2,-1.998209,-0.685203,0.387168,-0.25838,-1.148847,-0.312443,0.369704,-0.908769,-0.931049,0.361379,-1.106687,0.145336,-0.541925,-1.063129,-0.709068,0.086456,-0.660248,-0.110045,-0.865276,0.871405,-0.780962,-0.330246,0.90966,0.056134,-0.176574,0.465619,-0.440553,-1.084411,-0.386163,-0.734969,-0.560323,1.586956,-0.023726,0.851059,-0.259551,1.336685,-0.717909,0.464857,-0.292584,-0.886706,0.336353,-0.602012,-0.294483,1.430717,-0.759358,-1.032475,0.870947,0.264517,-0.317714,0.050548,...,8.044848,8.376143,10.829845,4.574866,6.621476,-0.063019,-0.064193,-0.04584,-0.061982,-0.060425,-0.065494,-0.060107,-0.06405,-0.062469,-0.052782,-0.051382,-0.059995,-0.068495,-0.062961,-0.045029,-0.064003,-0.063966,-0.060133,-0.068208,-0.069371,-0.043359,-0.05411,-0.042461,-0.06865,-0.051302,-0.057664,-0.065204,-0.063079,-0.063563,-0.056659,-0.057852,-0.062799,-0.057222,-0.063459,-0.061689,-0.062635,-0.057601,-0.069526,-0.063459,-0.057457,-0.067151,-0.05529,-0.054892,-0.058892,0.598068
3,-0.303741,-1.545043,2.337494,-0.238987,-0.112907,-0.601594,-0.584411,0.854972,-0.013936,0.175742,-0.037748,0.370759,-1.154011,1.172112,0.098358,-0.717052,0.575478,-1.103196,-0.780617,0.075133,0.181469,-0.646452,-0.992507,-0.269658,-0.904298,2.061287,-0.111764,-0.517089,3.228958,0.35965,-0.07643,1.880721,1.286447,0.447163,-0.718031,0.468521,-0.372597,-0.325133,0.728571,-0.690748,-0.644167,-0.224185,0.85053,-0.136576,-0.191167,-0.558376,1.536416,1.639185,-0.56609,-0.050391,...,-0.068034,-0.055717,-0.065895,-0.057079,-0.055384,11.080128,-0.064193,-0.04584,-0.061982,-0.060425,-0.065494,-0.060107,-0.06405,-0.062469,-0.052782,-0.051382,-0.059995,-0.068495,-0.062961,-0.045029,-0.064003,-0.063966,-0.060133,-0.068208,-0.069371,-0.043359,-0.05411,-0.042461,-0.06865,-0.051302,-0.057664,-0.065204,-0.063079,-0.063563,-0.056659,-0.057852,-0.062799,-0.057222,-0.063459,-0.061689,-0.062635,-0.057601,-0.069526,-0.063459,-0.057457,-0.067151,-0.05529,-0.054892,-0.058892,-0.161
4,0.967848,0.409769,0.320462,0.67923,-0.031564,0.70961,-1.167566,-0.78411,0.001203,2.672308,2.750229,-0.131438,-0.5375,0.05142,0.111687,0.831594,1.364569,-0.292631,0.431722,-0.794384,0.194701,-1.118504,-0.764115,-0.748741,-0.692197,-0.283441,-1.192503,0.018974,-0.568889,-1.157093,-0.555932,-1.195541,0.58695,-1.153107,0.322246,-0.479502,-0.496457,-0.457638,-1.112327,-1.13271,-0.358387,-0.359708,0.537934,0.462821,-0.316377,-1.032475,-0.321017,-0.20604,-0.93688,-0.941782,...,-0.068034,-0.055717,-0.065895,-0.057079,-0.055384,-0.063019,14.851706,5.252219,-0.061982,-0.060425,-0.065494,-0.060107,-0.06405,-0.062469,-0.052782,-0.051382,-0.059995,-0.068495,-0.062961,-0.045029,-0.064003,-0.063966,-0.060133,-0.068208,-0.069371,-0.043359,-0.05411,-0.042461,-0.06865,-0.051302,-0.057664,-0.065204,-0.063079,-0.063563,-0.056659,-0.057852,-0.062799,-0.057222,-0.063459,-0.061689,-0.062635,-0.057601,-0.069526,-0.063459,-0.057457,-0.067151,-0.05529,-0.054892,-0.058892,-0.54736


In [72]:
# scale X matrices for testing data
testscaler = StandardScaler()
testXscaled = testscaler.fit_transform(test_wordfreqs)
testXscaled = pd.DataFrame(testXscaled, columns = test_wordfreqs.columns)

In [73]:
# simple model

# train model
logist = LogisticRegression(C = .1, max_iter = 1000) 
logist.fit(trainXscaled, train_y)

# test model
predictions = logist.predict(testXscaled)
sum(predictions == test_y) / len(predictions)

0.7238805970149254

In [74]:
# more accurate refelction of performance of model
f1_score(test_y, predictions)

0.8082901554404145

In [75]:
# balance of classes -- m vs. f
dialogue_no_missing_ids.gender.value_counts()

m    2006
f     941
Name: gender, dtype: int64

In [76]:
# handling imbalanced classes
# weight classes inversely to frequency -- female char error counts more

# train
logist = LogisticRegression(C = .1, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
f1_score(test_y, predictions)

0.8048309178743962

In [77]:
# cross validation -- select best C parameter using training data
for c_param in [.0001, .001, .01, .1, 1, 10, 100, 1000]:
    logist = LogisticRegression(C = c_param, max_iter = 1000, class_weight = 'balanced') 
    results = cross_validate(logist, trainXscaled, train_y, cv = 5, scoring = 'f1')
    print('C parameter:', c_param)
    print('Mean f1:', np.mean(results['test_score']))
    print()

C parameter: 0.0001
Mean f1: 0.8266359857685966

C parameter: 0.001
Mean f1: 0.826834279467576

C parameter: 0.01
Mean f1: 0.818837787159761

C parameter: 0.1
Mean f1: 0.8164704306801551

C parameter: 1
Mean f1: 0.8104059104810861

C parameter: 10
Mean f1: 0.8090892241260393

C parameter: 100
Mean f1: 0.8113435301124754

C parameter: 1000
Mean f1: 0.8089447534044721



In [78]:
# use best C param to train whole training set
# train
logist = LogisticRegression(C = .0001, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test
predictions = logist.predict(testXscaled)
accuracy = f1_score(test_y, predictions)
print("Accuracy for entire training set is: ", accuracy)

Accuracy for entire training set is:  0.8133535660091047


In [79]:
# need to split test data -- into groups by writer gender (at least 1 female vs. no female writers)

testchars.head()
test_chars_f = testchars[testchars['num_of_female_writers'] >= 1]
test_chars_no_f = testchars[testchars['num_of_female_writers'] == 0]

In [80]:
# test accuracy on chars from movies w/ at least 1 female writer

# separate out test set rows from freqs w/ at least 1 female writer
testfreqs_female = test_wordfreqs.loc[test_chars_f.index, :]
test_y_f = (test_chars_f['gender'][:] == 'm').astype(int)

# scale test data w/ atleast 1 female writer
testscaler = StandardScaler()
testXscaled_f = testscaler.fit_transform(testfreqs_female)
testXscaled_f = pd.DataFrame(testXscaled_f, columns = testfreqs_female.columns)

In [81]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_f)
accuracy = f1_score(test_y_f, predictions)
print("Accuracy for female writers is: ", accuracy)

Accuracy for female writers is:  0.7115384615384616


In [82]:
# test accuracy on chars from movies w/ no female writers

# separate out test set rows from freqs w/ no female writers
testfreqs_no_female =test_wordfreqs.loc[test_chars_no_f.index, :]
test_y_no_f = (test_chars_no_f['gender'][:] == 'm').astype(int)

# scale test data w/ no female writers
testscaler = StandardScaler()
testXscaled_no_f = testscaler.fit_transform(testfreqs_no_female)
testXscaled_no_f = pd.DataFrame(testfreqs_no_female, columns = testfreqs_no_female.columns)

In [83]:
# train
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)

# test w/ testXscaled_f
predictions = logist.predict(testXscaled_no_f)
accuracy = f1_score(test_y_no_f, predictions)
print("Accuracy for no female writers is: ", accuracy)

Accuracy for no female writers is:  0.8248210023866349


In [84]:
# classifier model predictive features
logist = LogisticRegression(C = .01, max_iter = 1000, class_weight = 'balanced') 
logist.fit(trainXscaled, train_y)
coefficients = [x for x in zip(logist.coef_[0], vectorizer.get_feature_names())]

In [85]:
coefficients.sort()

In [86]:
coefficients[0:25] # female

[(-0.10436997898108503, 'oh'),
 (-0.08440191597718726, 'adorable'),
 (-0.08339475873816246, 'so'),
 (-0.07991281086674329, 'husband'),
 (-0.07746426975406319, 'just'),
 (-0.07424990616155691, 'spoiled'),
 (-0.06986301962694391, 'god'),
 (-0.06738258533100511, 'romantic'),
 (-0.0662854454743539, 'suffer'),
 (-0.06556596063907531, 'decided'),
 (-0.06372918063689294, 'being'),
 (-0.0637119588451735, 'phones'),
 (-0.06311323645682795, 'love'),
 (-0.06289565319962778, 'cute'),
 (-0.062393326674411456, 'sweet'),
 (-0.062131003651875266, 'once'),
 (-0.06179488370378428, 'such'),
 (-0.06082384971831705, 'called'),
 (-0.06068875083154849, 'james'),
 (-0.06060172740257814, 'sick'),
 (-0.06024624182184485, 'pregnant'),
 (-0.05880043156811331, 'silly'),
 (-0.058071097066324684, 'don'),
 (-0.05757598327911398, 'mark'),
 (-0.056997999873984746, 'weird')]

In [87]:
coefficients[-25:] # male

[(0.05137842524919808, 'cigar'),
 (0.05157700146845833, 'appreciate'),
 (0.05159358217773237, 'entirely'),
 (0.051760922131708464, 'customers'),
 (0.05210968294676905, 'families'),
 (0.05233982082855347, 'prepared'),
 (0.05304677499678691, 'study'),
 (0.05328468730005515, 'hell'),
 (0.053372767018829745, 'man'),
 (0.05348531511319047, 'finger'),
 (0.05404394408888702, 'chased'),
 (0.054742503007319046, 'arrive'),
 (0.055031161073793515, 'hit'),
 (0.05517810928545853, 'associate'),
 (0.05650856116214761, 'honor'),
 (0.05703196244731494, 'harbor'),
 (0.05714658979516066, 'gotta'),
 (0.05767164724960621, 'guy'),
 (0.058228159281542596, 'got'),
 (0.058988949302725205, 'gambling'),
 (0.05917882556867377, 'eh'),
 (0.05938360247282615, 'managed'),
 (0.06045291915020122, 'absurd'),
 (0.06670434922017493, 'we'),
 (0.08876727360406224, 'hey')]