# Analyze male and female embeddings

In [1]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import random
import numpy as np
from gensim.models import KeyedVectors
from tqdm import tqdm_notebook
import csv
import gensim

### Load trained word2vec models

In [2]:
wv_f = KeyedVectors.load("../data/wordvectors_f_final_min50.kv", mmap='r')
wv_m = KeyedVectors.load("../data/wordvectors_m_final_min50.kv", mmap='r')

### Extract idioms

In [3]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

def extract_idiom_groups(filename, nlp):
    idiom_groups = {}
    all_idioms = []
    with open(filename) as f:
        reader = csv.reader(f, delimiter=",")
        for row in reader:
            if row[0]!= "idiom":
                idiom = row[0]

                idiom = idiom.lower().replace("-", " ")
                idiom = idiom.replace(",", "")

                group_num = row[2]
                if group_num not in idiom_groups:
                    idiom_groups[group_num] = []
                idiom_groups[group_num].append(idiom)
                all_idioms.append(idiom)

    return idiom_groups, all_idioms

nlp = English()

idiom_groups, all_idioms = extract_idiom_groups("../data/idioms-definitions-groups-for-embeddings-v2.csv", nlp)

# mapping from idioms to their generic form
idiom2generic = {}
for num in idiom_groups:
    generic = "".join(idiom_groups[num][0].split())
    for idiom in idiom_groups[num]:
        idiom2generic[idiom] = generic

# mapping from generic forms to all possible idioms
generic2idiom = {}
for idiom in idiom2generic:
    generic = idiom2generic[idiom]
    if generic not in generic2idiom:
        generic2idiom[generic] = []
    generic2idiom[generic].append(idiom)

### Show some examples

In [4]:
def extract_topn(wv, word, topn, restrict_vocab=None):
    words = []
    if word in wv:
        for item in wv.similar_by_word(word, topn=topn, restrict_vocab=restrict_vocab):
            words.append(item[0])
        return words
    return None

In [5]:
i = 0
for generic in generic2idiom:
    
    i += 1
    if i > 5:
        break

    if generic not in wv_f.vocab or generic not in wv_m.vocab:
        continue
        
    words_f = extract_topn(wv_f, generic, topn=10, restrict_vocab=10000)    
    words_m = extract_topn(wv_m, generic, topn=10, restrict_vocab=10000)
    
    if words_f and words_m:
        print("\nGeneric form: " + generic2idiom[generic][0])
        print("\nNeighbors in female space:")
        print(words_f)
        print("\nNeighbors in male space:")
        print(words_m)
 
    


Generic form: made inroads

Neighbors in female space:
[u'manipulate', u'convert', u'venture', u'incorporate', u'reform', u'divide', u'evolve', u'sneak', u'compete', u'drag']

Neighbors in male space:
[u'venture', u'merge', u'dominate', u'coalition', u'divided', u'evolve', u'compete', u'integrated', u'divide', u'convert']

Generic form: cut it out

Neighbors in female space:
[u'quit', u'gtfo', u'chillout', u'stop', u'ignore', u'remove', u'eliminate', u'cut', u'delete', u'swallow']

Neighbors in male space:
[u'gtfo', u'quit', u'ignore', u'replace', u'remove', u'cut', u'delete', u'shave', u'skip', u'disappear']

Generic form: hit and miss

Neighbors in female space:
[u'pricey', u'iffy', u'tricky', u'inconsistent', u'disappointing', u'versatile', u'tempting', u'helpful', u'confusing', u'tough']

Neighbors in male space:
[u'pricey', u'disappointing', u'tricky', u'inconsistent', u'frustrating', u'helpful', u'irritating', u'confusing', u'tough', u'sketchy']


### Sort by intersection or by averaged intersection

In [6]:
from tqdm import tqdm_notebook
import operator

def compute_intersections(average_version=False):
    intersections = {}
    
    for generic in tqdm_notebook(generic2idiom):
        
        if generic not in wv_f.vocab or generic not in wv_m.vocab:
            continue

        words_f = extract_topn(wv_f, generic, topn=100, restrict_vocab=10000)
        words_m = extract_topn(wv_m, generic, topn=100, restrict_vocab=10000)
        
        if words_f and words_m: 
            if average_version:
                intersections[generic] = 0.0
                for i in range(100):
                    intersections[generic] += len(set(words_f[:i]).intersection(words_m[:i]))/float(i+1)
                intersections[generic] /= 100
            else:
                intersections[generic] = len(set(words_f).intersection(words_m))

    return intersections



In [7]:
intersections = compute_intersections()
sorted_intersections = sorted(intersections.items(), key=operator.itemgetter(1))

HBox(children=(IntProgress(value=0, max=688), HTML(value=u'')))




In [8]:
average_intersections = compute_intersections(average_version=True)
sorted_average_intersections = sorted(average_intersections.items(), key=operator.itemgetter(1))

HBox(children=(IntProgress(value=0, max=688), HTML(value=u'')))




### Print most different and least different idioms across genders

In [9]:
def print_details(sorted_intersections):
    
    for idiom, intersection in sorted_intersections:
        print(generic2idiom[idiom][0], intersection)
        words_f = extract_topn(wv_f, idiom, topn=10, restrict_vocab=10000)
        words_m = extract_topn(wv_m, idiom, topn=10, restrict_vocab=10000)
        only_f = set(words_f) - set(words_m)
        only_m = set(words_m) - set(words_f)
        both = set(words_f) - set(only_f)
        print("\nNN of both:\n")
        print(list(both))
        print("\nNN of only female:\n")
        print(list(only_f))
        print("\nNN of only male:\n")
        print(list(only_m))   


In [10]:
print("Idioms used differently\n========================\n")
print_details(sorted_intersections[:2])
print("\nIdioms used similarly\n======================")
print_details(sorted_intersections[-2:][::-1])

Idioms used differently

('hard as nails', 0)

NN of both:

[]

NN of only female:

[u'foundation', u'eyeshadow', u'lipstick', u'coat', u'revlon', u'eyeliner', u'nail', u'polish', u'sally', u'gel']

NN of only male:

[u'fantastic', u'tough', u'brilliant', u'phenomenal', u'dope', u'gorgeous', u'brutal', u'hilarious', u'badass', u'adorable']
('catnap', 3)

NN of both:

[]

NN of only female:

[u'sip', u'naps', u'bite', u'kitty', u'bath', u'nap', u'cuddle', u'cat', u'snuggle', u'fetch']

NN of only male:

[u'cleric', u'wizard', u'magic', u'spell', u'breath', u'arcane', u'bard', u'initiative', u'feat', u'spells']

Idioms used similarly
('give me five', 81)

NN of both:

[u'20', u'thirty', u'ten', u'twenty', u'fifty', u'five', u'fifteen']

NN of only female:

[u'30', u'15', u'45']

NN of only male:

[u'twelve', u'seven', u'eight']
('lo and behold', 75)

NN of both:

[u'thankfully', u'fortunately', u'whoops', u'yesterday', u'welp', u'luckily', u'bam']

NN of only female:

[u'finally', u'boom

In [11]:
print("Idioms used differently\n========================\n")
print_details(sorted_average_intersections[:2])
print("\nIdioms used similarly\n======================")
print_details(sorted_average_intersections[-2:][::-1])

Idioms used differently

('hard as nails', 0.0)

NN of both:

[]

NN of only female:

[u'foundation', u'eyeshadow', u'lipstick', u'coat', u'revlon', u'eyeliner', u'nail', u'polish', u'sally', u'gel']

NN of only male:

[u'fantastic', u'tough', u'brilliant', u'phenomenal', u'dope', u'gorgeous', u'brutal', u'hilarious', u'badass', u'adorable']
('nothing doing', 0.014298110795516359)

NN of both:

[]

NN of only female:

[u'town', u'36', u'bust', u'tech', u'72', u'branch', u'cs', u'rural', u'farming', u'local']

NN of only male:

[u'gg', u'nevermind', u'yesterday', u":'(", u'0', u'awww', u'bam', u'welp', u'ew', u'bye']

Idioms used similarly
('give me five', 0.7652691795756873)

NN of both:

[u'20', u'thirty', u'ten', u'twenty', u'fifty', u'five', u'fifteen']

NN of only female:

[u'30', u'15', u'45']

NN of only male:

[u'twelve', u'seven', u'eight']
('on the hook', 0.7101561409263808)

NN of both:

[u'paying', u'owed', u'sued', u'eligible', u'responsible', u'paid', u'punished', u'charge