In [1]:
import os
import json

import torchtext
import torch
import pandas as pd

import vsm
from utils import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Preprocessing
Convert dictionary data in JSON format to TSV format so that it can be processed by `torchtext`

In [2]:
# Load dictionary dataset
data_home = 'data'
raw_data_path = os.path.join(data_home, 'glove6B_dictionary.json')
data_path = os.path.join(data_home, 'glove6B_dictionary.tsv')

with open(raw_data_path) as f:
    dictionary = json.load(f)

print("Number of words:", len(dictionary.keys()))

Number of words: 400000


In [3]:
tabularize_dictionary(raw_data_path, data_path, max_defs=5)

# Data Loader
We will use `torchtext` library to preprocess and load dataset.
First, define `Field`s, that contains information on how you want the data to be preprocessed.

In [3]:
import spacy

spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = torchtext.data.Field(sequential=True,
                            tokenize=tokenizer,
                            lower=True,
                            include_lengths=True)
LABEL = torchtext.data.Field(sequential=False,
                             lower=True)

POS = torchtext.data.Field(sequential=False,
                             lower=False)

TAG = torchtext.data.Field(sequential=False,
                             lower=False)

In [4]:
dataset= torchtext.data.TabularDataset(data_path,
                                       format='tsv',
                                       fields=[('label', LABEL),
                                               ('pos', POS),
                                               ('tag', TAG),
                                               ('text', TEXT)],
                                       skip_header=False)

Build vocab for each field and load pretrained word vectors

In [5]:
unk_init = lambda x: torch.nn.init.normal_(x, mean=0, std=0.5)
TEXT.build_vocab(dataset, vectors="glove.6B.100d", min_freq=2,
                 unk_init=unk_init)
LABEL.build_vocab(dataset, vectors="glove.6B.100d",
                  unk_init=unk_init)
POS.build_vocab(dataset)
TAG.build_vocab(dataset)

print(TEXT.vocab.vectors.size())
print(LABEL.vocab.vectors.size())

torch.Size([54621, 100])
torch.Size([118045, 100])


# Train and Test Splits

In [12]:
import random

random.seed(0)

train_labels = set()
test_labels = set()

ratio = 1.0

for label in LABEL.vocab.stoi.keys():
    if label in TEXT.vocab.stoi.keys() or not label.isalpha():
        train_labels.add(label)
    else:
        p = random.random()
        if p < ratio:
            train_labels.add(label)
        else:
            test_labels.add(label)
        
    
assert len(train_labels) + len(test_labels) == len(LABEL.vocab.stoi.keys())

print("Train labels:", len(train_labels))
print("Test labels:", len(test_labels))

Train labels: 118045
Test labels: 0


In [177]:
import csv

train_fn = 'glove6B_dictionary_train.tsv'
test_fn = 'glove6B_dictionary_test.tsv'

train_path = os.path.join(data_home, train_fn)
test_path = os.path.join(data_home, test_fn)

with open(data_path) as data_file,\
    open(train_path, 'w') as train_file,\
    open(test_path, 'w') as test_file:
    
    reader = csv.reader(data_file, delimiter='\t')
    train_writer = csv.writer(train_file, delimiter='\t')
    test_writer = csv.writer(test_file, delimiter='\t')

    for row in reader:
        label = row[0]
        if label in train_labels:
            train_writer.writerow(row)
        else:
            test_writer.writerow(row)

In [163]:
train_set, test_set = torchtext.data.TabularDataset.splits(
        path=data_home, skip_header=False,
        train=train_fn, test=test_fn, format='tsv',
        fields=[('label', LABEL), ('pos', POS), ('tag', TAG), ('text', TEXT)])

In [164]:
train_iter, test_iter = torchtext.data.Iterator.splits(
                                    (train_set, test_set),
                                    batch_sizes=(256, 256),
                                    shuffle=True,
                                    sort_within_batch=True,
                                    sort_key=lambda x: len(x.text),
                                    repeat=False)

# Model

In [123]:
LABEL.build_vocab(train_set, vectors="glove.6B.100d",
                  unk_init=unk_init)

TEXT.vocab = LABEL.vocab

print(TEXT.vocab.vectors.size())
print(LABEL.vocab.vectors.size())

torch.Size([118038, 100])
torch.Size([118038, 100])


In [55]:
from models.DictEncoder import DictEncoder
import torch.nn as nn

max_len = 50
n_layers = 1
dropout_p = 0.1
rnn_cell = 'gru'
bidirectional = True
train_embedding = True

vocab_size, embed_dim = TEXT.vocab.vectors.size()
hidden_size = embed_dim
pos_size = len(POS.vocab)
tag_size = len(TAG.vocab)

# Dictionary encoder model
model = DictEncoder(vocab_size, pos_size, tag_size, max_len, hidden_size,
                    pretrained_embedding=TEXT.vocab.vectors,
                    input_dropout_p=dropout_p,
                    train_embedding=train_embedding,
                    n_layers=n_layers, bidirectional=bidirectional,
                    rnn_cell=rnn_cell, variable_lengths=True)


# GloVe embedding lookup
glove = nn.Embedding(len(LABEL.vocab), embed_dim)
glove.weight.data.copy_(LABEL.vocab.vectors)
for param in glove.parameters():
    param.requires_grad = False
        
if torch.cuda.is_available:
    model = model.cuda()
    glove = glove.cuda()

# Training

Create a dataset iterator.

In [56]:
def evaluate(model, data_iter):
    """ Compute model's loss on data_iter """
    model.eval()
    val_loss = 0
    for data in data_iter:
        # Setup data batch
        with torch.no_grad():
            x, x_lengths = data.text
            x = Variable(x.t())
            y = Variable(data.label)
            pos = Variable(data.pos)
            tag = Variable(data.tag)

            emb_def = model(x, x_lengths, pos, tag)
            emb_word = model.encoder.embedding(y)

            val_loss += torch.norm((emb_def - emb_word), dim=1).mean()

    val_loss = val_loss / len(data_iter)
    return val_loss

Training loop

In [None]:
from torch.autograd import Variable
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

do_eval = True
lr = 0.001
alpha = 1.0 # Regularization strength
num_epochs = 30
log_step = 500


params = list(model.parameters())
params_to_train = [param for param in params if param.requires_grad]

'''
optimizer = torch.optim.Adam([
    {'params': model.encoder.embedding.parameters(), 'lr': 1e-4},
    {'params': model.encoder.rnn.parameters()},
    {'params': model.linear.parameters()},
    {'params': model.embed_pos.parameters()},
    {'params': model.embed_tag.parameters()}], lr=lr)
'''
optimizer = torch.optim.Adam(params_to_train, lr=lr)
#scheduler = StepLR(optimizer, step_size=3, gamma=0.8)

for epoch in range(num_epochs):
    model.train()
    #scheduler.step()
    for i, data in enumerate(train_iter):
        # Setup data batch
        x, x_lengths = data.text
        x = Variable(x.t())
        y = Variable(data.label)
        pos = Variable(data.pos)
        tag = Variable(data.tag)

        # Forward
        optimizer.zero_grad()

        emb_def = model(x, x_lengths, pos, tag)
        emb_word = model.encoder.embedding(y)
        emb_glove = glove(y)

        # Compute loss
        loss = torch.norm((emb_def - emb_word), dim=1).mean()
        
        # Regularization
        reg = torch.norm((emb_glove - emb_word), dim=1).mean()
        
        loss += (alpha * reg)

        # Backward
        loss.backward()
        optimizer.step()

        if (i + 1) % log_step == 0:
            print("Epoch [%d/%d], Batch [%d/%d], Loss: %.4f"
                  % (epoch+1, num_epochs, i+1, len(train_iter), loss))

    if do_eval:
        print("==================================")
        train_loss = evaluate(model, train_iter).data.item()
        print("Training loss:", train_loss)
        print("==================================")


# Extracting word embeddings

## Learned embedding

In [124]:
model.eval()
emb_weight = model.encoder.embedding.weight.data.cpu()

In [125]:
LABEL.build_vocab(train_set)
TEXT.vocab = LABEL.vocab

emb_lookup = {}
for w in TEXT.vocab.stoi.keys():
    emb_lookup[w] = emb_weight[TEXT.vocab.stoi[w]].numpy()
    
emb_df = pd.DataFrame.from_dict(emb_lookup, 'index')
emb_df.index

Index(['freaking', 'saints', 'maneuvering', 'broadens', 'taskforce', 'baracoa',
       'patinated', 'egghead', 'certificate', 'prosody',
       ...
       'fauvism', 'islamic', 'liberated', 'un-christian', 'storing',
       'vouching', 'catch-up', 'cat-and-mouse', 'freelances', 'hammon'],
      dtype='object', length=118038)

In [60]:
vsm.neighbors('eat', emb_df, distfunc=vsm.euclidean).head(10)

eat        0.000000
ate        3.103765
eating     3.135178
eaten      3.178984
consume    3.471366
meal       3.702084
feed       3.755443
eats       3.798775
food       3.835994
drink      3.842367
dtype: float64

## Embeddings for new words

In [165]:
def compute_embedding(model, data_iter):
#model.load_state_dict(torch.load(model_path))
    model.eval()
    emb_lookup = defaultdict(list)
    for i, data in enumerate(data_iter):
        x, x_lengths = data.text
        x = Variable(x.t())
        y = Variable(data.label)
        pos = Variable(data.pos)
        tag = Variable(data.tag)

        emb_def = model(x, x_lengths, pos, tag)
        for j in range(len(emb_def)):
            word = LABEL.vocab.itos[y[j].item()]
            emb_lookup[word].append(emb_def[j].data.cpu().numpy())

    for word in emb_lookup:
        emb_lookup[word] = np.mean(emb_lookup[word], axis=0)
        
    emb_df = pd.DataFrame.from_dict(emb_lookup, 'index')
    
    return emb_df

In [166]:
LABEL.build_vocab(test_set)
new_emb_df = compute_embedding(model, test_iter)

  return Variable(arr, volatile=not train)
  return Variable(arr, volatile=not train), lengths


In [167]:
print(len(LABEL.vocab), len(TEXT.vocab))

10 118038


In [168]:
emb_all = pd.concat([new_emb_df, emb_df])
emb_all.index

Index(['polysexual', 'lauzon', 'unfriend', 'gaziantep', 'hootch', 'bromance',
       'hamza', 'hwan', 'matcha', 'freaking',
       ...
       'fauvism', 'islamic', 'liberated', 'un-christian', 'storing',
       'vouching', 'catch-up', 'cat-and-mouse', 'freelances', 'hammon'],
      dtype='object', length=118047)

In [214]:
vsm.neighbors('working', emb_all, distfunc=vsm.cosine).head(10)

working    0.000000
worked     0.117505
work       0.118867
well       0.239718
doing      0.244455
trying     0.249324
job        0.266254
help       0.271003
now        0.273245
better     0.273366
dtype: float64

In [215]:
vsm.neighbors('working', glove_df, distfunc=vsm.cosine).head(10)

working    0.000000
worked     0.116819
work       0.122640
well       0.241180
doing      0.250419
trying     0.251308
job        0.269225
help       0.271538
better     0.271646
done       0.276110
dtype: float64

## Baseline word embeddings

### Gigaword PPMI

In [31]:
giga20 = pd.read_csv(
    os.path.join('vsmdata', 'gigaword_window20-flat.csv.gz'), index_col=0)

giga20_ppmi = vsm.pmi(giga20)

In [32]:
vsm.neighbors('eat', giga20_ppmi, distfunc=vsm.euclidean).head()

eat       0.000000
meal     28.318924
food     31.797789
eats     32.284420
drink    32.477807
dtype: float64

### IMDB PPMI

In [33]:
imdb20 = pd.read_csv(
    os.path.join('vsmdata', "imdb_window20-flat.csv.gz"), index_col=0)

imdb20_ppmi = vsm.pmi(imdb20)

In [34]:
vsm.neighbors('eat', imdb20_ppmi, distfunc=vsm.euclidean).head()

eat        0.000000
eating    30.602471
food      35.825917
meat      37.316129
eats      39.365628
dtype: float64

### GloVe

In [35]:
glove_lookup = glove2dict(os.path.join('vsmdata', 'glove.6B.100d.txt'))

glove_df = pd.DataFrame.from_dict(glove_lookup, 'index')

In [36]:
vsm.neighbors('eat', glove_df, distfunc=vsm.euclidean).head(10)

eat        0.000000
eating     3.634153
ate        3.774174
eaten      3.831825
consume    4.231454
meal       4.556170
eats       4.573135
feed       4.645383
drink      4.683227
prefer     4.696516
dtype: float64

# Experiments

## 1. Word Similarity

In [77]:
from word_sim import *

### Baselines

In [78]:
full_word_similarity_evaluation(imdb20_ppmi, verbose=True)

wordsim353_reader
Evaluation vocab: 418 of 437
Spearman r: 0.469
mturk287_reader
Evaluation vocab: 499 of 499
Spearman r: 0.599
mturk771_reader
Evaluation vocab: 1,113 of 1,113
Spearman r: 0.462
men_reader
Evaluation vocab: 751 of 751
Spearman r: 0.572
Mean Spearman r: 0.525


{'men_reader': 0.5724487594295152,
 'mturk287_reader': 0.5986597600532505,
 'mturk771_reader': 0.4615212813179131,
 'wordsim353_reader': 0.46888766456156583}

In [79]:
full_word_similarity_evaluation(giga20_ppmi, verbose=True)

wordsim353_reader
Evaluation vocab: 418 of 437
Spearman r: 0.575
mturk287_reader
Evaluation vocab: 499 of 499
Spearman r: 0.662
mturk771_reader
Evaluation vocab: 1,113 of 1,113
Spearman r: 0.509
men_reader
Evaluation vocab: 751 of 751
Spearman r: 0.642
Mean Spearman r: 0.597


{'men_reader': 0.6417502972889049,
 'mturk287_reader': 0.6617722569937702,
 'mturk771_reader': 0.5091371613882787,
 'wordsim353_reader': 0.5746437768533685}

In [80]:
full_word_similarity_evaluation(glove_df, verbose=True)

wordsim353_reader
Evaluation vocab: 416 of 437
Spearman r: 0.528
mturk287_reader
Evaluation vocab: 499 of 499
Spearman r: 0.619
mturk771_reader
Evaluation vocab: 1,113 of 1,113
Spearman r: 0.581
men_reader
Evaluation vocab: 751 of 751
Spearman r: 0.693
Mean Spearman r: 0.605


{'men_reader': 0.6931865411595562,
 'mturk287_reader': 0.6193399527127166,
 'mturk771_reader': 0.5805168561442647,
 'wordsim353_reader': 0.5275901831851209}

### Ours

In [81]:
full_word_similarity_evaluation(emb_df, verbose=True)

wordsim353_reader
Evaluation vocab: 413 of 437
Spearman r: 0.558
mturk287_reader
Evaluation vocab: 488 of 499
Spearman r: 0.650
mturk771_reader
Evaluation vocab: 1,106 of 1,113
Spearman r: 0.612
men_reader
Evaluation vocab: 747 of 751
Spearman r: 0.719
Mean Spearman r: 0.635


{'men_reader': 0.7187011596033591,
 'mturk287_reader': 0.6501970831559337,
 'mturk771_reader': 0.6118669648598645,
 'wordsim353_reader': 0.5581769646501574}

In [95]:
full_word_similarity_evaluation(emb_all, verbose=True)

wordsim353_reader
Evaluation vocab: 416 of 437
Spearman r: 0.557
mturk287_reader
Evaluation vocab: 492 of 499
Spearman r: 0.643
mturk771_reader
Evaluation vocab: 1,110 of 1,113
Spearman r: 0.614
men_reader
Evaluation vocab: 751 of 751
Spearman r: 0.714
Mean Spearman r: 0.632


{'men_reader': 0.7140169843882098,
 'mturk287_reader': 0.6430348815003137,
 'mturk771_reader': 0.6138828986132021,
 'wordsim353_reader': 0.5572153263297986}

## 2. Word Analogy

In [95]:
from word_analogy import *

In [96]:
x = analogy_completion("presiden", "great", "bad", emb_df)
x.head()

terrible    0.296863
huge        0.320579
big         0.333326
worst       0.334916
caused      0.360995
dtype: float64

In [97]:
x = analogy_completion("good", "great", "bad", emb_all)
x.head()

terrible    0.296863
huge        0.320579
big         0.333326
worst       0.334916
caused      0.360995
dtype: float64

### IMDB20

In [99]:
analogy_evaluation(imdb20_ppmi, src_filename='family.txt', verbose=False)

(0.5446665665352782, defaultdict(int, {False: 297, True: 209}))

In [100]:
analogy_evaluation(imdb20_ppmi, src_filename='gram1-adjective-to-adverb.txt', verbose=False)

(0.14621311421118469, defaultdict(int, {False: 904, True: 88}))

In [101]:
analogy_evaluation(imdb20_ppmi, src_filename='gram2-opposite.txt', verbose=False)

(0.11918977012507004, defaultdict(int, {False: 756, True: 56}))

In [102]:
analogy_evaluation(imdb20_ppmi, src_filename='gram3-comparative.txt', verbose=False)

(0.4214983178751905, defaultdict(int, {False: 968, True: 364}))

In [103]:
analogy_evaluation(imdb20_ppmi, src_filename='gram5-present-participle.txt', verbose=False)

(0.4218615748000484, defaultdict(int, {False: 713, True: 343}))

In [104]:
analogy_evaluation(imdb20_ppmi, src_filename='gram8-plural.txt', verbose=False)

(0.6575181494624674, defaultdict(int, {False: 609, True: 723}))

In [105]:
analogy_evaluation(imdb20_ppmi, src_filename='capital-world.txt', verbose=False)

(0.4578007178981834, defaultdict(int, {False: 2066, True: 1224}))

### GIGA20

In [106]:
analogy_evaluation(giga20_ppmi, src_filename='family.txt', verbose=False)

(0.6659846219739627, defaultdict(int, {False: 227, True: 279}))

In [107]:
analogy_evaluation(giga20_ppmi, src_filename='gram1-adjective-to-adverb.txt', verbose=False)

(0.21077301283042899, defaultdict(int, {False: 859, True: 133}))

In [108]:
analogy_evaluation(giga20_ppmi, src_filename='gram2-opposite.txt', verbose=False)

(0.28054148855931976, defaultdict(int, {False: 645, True: 167}))

In [109]:
analogy_evaluation(giga20_ppmi, src_filename='gram3-comparative.txt', verbose=False)

(0.5692569712312698, defaultdict(int, {False: 726, True: 606}))

In [110]:
analogy_evaluation(giga20_ppmi, src_filename='gram5-present-participle.txt', verbose=False)

(0.6402012538987079, defaultdict(int, {False: 456, True: 600}))

In [111]:
analogy_evaluation(giga20_ppmi, src_filename='gram8-plural.txt', verbose=False)

(0.6682194201629682, defaultdict(int, {False: 549, True: 783}))

In [112]:
analogy_evaluation(giga20_ppmi, src_filename='capital-world.txt', verbose=False)

(0.7390063846387628, defaultdict(int, {False: 1471, True: 3053}))

### Glove

In [256]:
analogy_evaluation(glove_df.loc[giga20.index], src_filename='family.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


boy is to girl as brother is to daughter (gold: sister at rank 6)
boy is to girl as brothers is to sisters (gold: sisters at rank 1)
boy is to girl as dad is to mom (gold: mom at rank 1)
boy is to girl as father is to mother (gold: mother at rank 1)
boy is to girl as grandfather is to grandmother (gold: grandmother at rank 1)
boy is to girl as grandpa is to grandma (gold: grandma at rank 1)
boy is to girl as grandson is to granddaughter (gold: granddaughter at rank 1)
boy is to girl as groom is to bride (gold: bride at rank 1)
boy is to girl as he is to she (gold: she at rank 1)
boy is to girl as his is to her (gold: her at rank 1)
boy is to girl as husband is to wife (gold: wife at rank 1)
boy is to girl as king is to queen (gold: queen at rank 1)
boy is to girl as man is to woman (gold: woman at rank 1)
boy is to girl as nephew is to niece (gold: niece at rank 1)
boy is to girl as policeman is to soldier (gold: policewoman at rank 5)
boy is to girl as prince is to princess (gold: pri

grandfather is to grandmother as policeman is to wounded (gold: policewoman at rank 7)
grandfather is to grandmother as prince is to princess (gold: princess at rank 1)
grandfather is to grandmother as son is to daughter (gold: daughter at rank 1)
grandfather is to grandmother as sons is to daughters (gold: daughters at rank 1)
grandfather is to grandmother as stepbrother is to stepsister (gold: stepsister at rank 1)
grandfather is to grandmother as stepfather is to stepdaughter (gold: stepmother at rank 2)
grandfather is to grandmother as stepson is to stepdaughter (gold: stepdaughter at rank 1)
grandfather is to grandmother as uncle is to aunt (gold: aunt at rank 1)
grandfather is to grandmother as boy is to girl (gold: girl at rank 1)
grandfather is to grandmother as brother is to wife (gold: sister at rank 4)
grandfather is to grandmother as brothers is to sisters (gold: sisters at rank 1)
grandfather is to grandmother as dad is to mom (gold: mom at rank 1)
grandfather is to grandm

his is to her as father is to mother (gold: mother at rank 1)
his is to her as grandfather is to grandmother (gold: grandmother at rank 1)
his is to her as grandpa is to grandma (gold: grandma at rank 1)
his is to her as grandson is to granddaughter (gold: granddaughter at rank 1)
his is to her as groom is to bride (gold: bride at rank 1)
his is to her as he is to she (gold: she at rank 1)
husband is to wife as king is to queen (gold: queen at rank 1)
husband is to wife as man is to woman (gold: woman at rank 1)
husband is to wife as nephew is to cousin (gold: niece at rank 2)
husband is to wife as policeman is to wounded (gold: policewoman at rank 44)
husband is to wife as prince is to princess (gold: princess at rank 1)
husband is to wife as son is to brother (gold: daughter at rank 2)
husband is to wife as sons is to daughters (gold: daughters at rank 1)
husband is to wife as stepbrother is to stepsister (gold: stepsister at rank 1)
husband is to wife as stepfather is to niece (gold

prince is to princess as stepbrother is to stepsister (gold: stepsister at rank 1)
prince is to princess as stepfather is to stepmother (gold: stepmother at rank 1)
prince is to princess as stepson is to stepdaughter (gold: stepdaughter at rank 1)
prince is to princess as uncle is to aunt (gold: aunt at rank 1)
prince is to princess as boy is to girl (gold: girl at rank 1)
prince is to princess as brother is to sister (gold: sister at rank 1)
prince is to princess as brothers is to sisters (gold: sisters at rank 1)
prince is to princess as dad is to mom (gold: mom at rank 1)
prince is to princess as father is to mother (gold: mother at rank 1)
prince is to princess as grandfather is to grandmother (gold: grandmother at rank 1)
prince is to princess as grandpa is to grandma (gold: grandma at rank 1)
prince is to princess as grandson is to granddaughter (gold: granddaughter at rank 1)
prince is to princess as groom is to bride (gold: bride at rank 1)
prince is to princess as he is to she

stepson is to stepdaughter as father is to mother (gold: mother at rank 1)
stepson is to stepdaughter as grandfather is to grandmother (gold: grandmother at rank 1)
stepson is to stepdaughter as grandpa is to grandma (gold: grandma at rank 1)
stepson is to stepdaughter as grandson is to granddaughter (gold: granddaughter at rank 1)
stepson is to stepdaughter as groom is to bride (gold: bride at rank 1)
stepson is to stepdaughter as he is to she (gold: she at rank 1)
stepson is to stepdaughter as his is to her (gold: her at rank 1)
stepson is to stepdaughter as husband is to wife (gold: wife at rank 1)
stepson is to stepdaughter as king is to queen (gold: queen at rank 1)
stepson is to stepdaughter as man is to woman (gold: woman at rank 1)
stepson is to stepdaughter as nephew is to niece (gold: niece at rank 1)
stepson is to stepdaughter as policeman is to wounded (gold: policewoman at rank 9)
stepson is to stepdaughter as prince is to princess (gold: princess at rank 1)
stepson is to 

(0.879974124835785, defaultdict(int, {False: 91, True: 415}))

In [257]:
analogy_evaluation(glove_df.loc[giga20.index], src_filename='gram1-adjective-to-adverb.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.421351164496735, defaultdict(int, {False: 678, True: 314}))

In [258]:
analogy_evaluation(glove_df.loc[giga20.index], src_filename='gram2-opposite.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.45090781304319216, defaultdict(int, {False: 523, True: 289}))

In [140]:
analogy_evaluation(glove_df.loc[giga20.index], src_filename='gram3-comparative.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.8711419803625379, defaultdict(int, {False: 241, True: 1091}))

In [136]:
analogy_evaluation(glove_df.loc[giga20.index], src_filename='gram5-present-participle.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.8153031675311048, defaultdict(int, {False: 272, True: 784}))

In [139]:
analogy_evaluation(glove_df.loc[giga20.index], src_filename='gram8-plural.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.8470770712410322, defaultdict(int, {False: 301, True: 1031}))

In [138]:
analogy_evaluation(glove_df.loc[giga20.index], src_filename='capital-world.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.934559239137541, defaultdict(int, {False: 426, True: 4098}))

### Ours

In [100]:
analogy_evaluation(emb_df.loc[giga20.index], src_filename='family.txt', verbose=True)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


boy is to girl as brother is to daughter (gold: sister at rank 6)
boy is to girl as brothers is to sisters (gold: sisters at rank 1)
boy is to girl as dad is to mom (gold: mom at rank 1)
boy is to girl as father is to mother (gold: mother at rank 1)
boy is to girl as grandfather is to grandmother (gold: grandmother at rank 1)
boy is to girl as grandpa is to grandma (gold: grandma at rank 1)
boy is to girl as grandson is to granddaughter (gold: granddaughter at rank 1)
boy is to girl as groom is to bride (gold: bride at rank 1)
boy is to girl as he is to she (gold: she at rank 1)
boy is to girl as his is to her (gold: her at rank 1)
boy is to girl as husband is to wife (gold: wife at rank 1)
boy is to girl as king is to queen (gold: queen at rank 1)
boy is to girl as man is to woman (gold: woman at rank 1)
boy is to girl as nephew is to niece (gold: niece at rank 1)
boy is to girl as policeman is to policewoman (gold: policewoman at rank 1)
boy is to girl as prince is to princess (gold:

grandfather is to grandmother as policeman is to wounded (gold: policewoman at rank 7)
grandfather is to grandmother as prince is to princess (gold: princess at rank 1)
grandfather is to grandmother as son is to daughter (gold: daughter at rank 1)
grandfather is to grandmother as sons is to daughters (gold: daughters at rank 1)
grandfather is to grandmother as stepbrother is to stepsister (gold: stepsister at rank 1)
grandfather is to grandmother as stepfather is to stepdaughter (gold: stepmother at rank 2)
grandfather is to grandmother as stepson is to stepdaughter (gold: stepdaughter at rank 1)
grandfather is to grandmother as uncle is to aunt (gold: aunt at rank 1)
grandfather is to grandmother as boy is to girl (gold: girl at rank 1)
grandfather is to grandmother as brother is to mother (gold: sister at rank 4)
grandfather is to grandmother as brothers is to sisters (gold: sisters at rank 1)
grandfather is to grandmother as dad is to mom (gold: mom at rank 1)
grandfather is to gran

his is to her as dad is to mom (gold: mom at rank 1)
his is to her as father is to mother (gold: mother at rank 1)
his is to her as grandfather is to mother (gold: grandmother at rank 2)
his is to her as grandpa is to grandma (gold: grandma at rank 1)
his is to her as grandson is to granddaughter (gold: granddaughter at rank 1)
his is to her as groom is to bride (gold: bride at rank 1)
his is to her as he is to she (gold: she at rank 1)
husband is to wife as king is to queen (gold: queen at rank 1)
husband is to wife as man is to woman (gold: woman at rank 1)
husband is to wife as nephew is to niece (gold: niece at rank 1)
husband is to wife as policeman is to wounded (gold: policewoman at rank 11)
husband is to wife as prince is to princess (gold: princess at rank 1)
husband is to wife as son is to daughter (gold: daughter at rank 1)
husband is to wife as sons is to daughters (gold: daughters at rank 1)
husband is to wife as stepbrother is to stepsister (gold: stepsister at rank 1)
hu

prince is to princess as sons is to daughters (gold: daughters at rank 1)
prince is to princess as stepbrother is to stepsister (gold: stepsister at rank 1)
prince is to princess as stepfather is to stepmother (gold: stepmother at rank 1)
prince is to princess as stepson is to stepdaughter (gold: stepdaughter at rank 1)
prince is to princess as uncle is to aunt (gold: aunt at rank 1)
prince is to princess as boy is to girl (gold: girl at rank 1)
prince is to princess as brother is to sister (gold: sister at rank 1)
prince is to princess as brothers is to sisters (gold: sisters at rank 1)
prince is to princess as dad is to mom (gold: mom at rank 1)
prince is to princess as father is to mother (gold: mother at rank 1)
prince is to princess as grandfather is to grandmother (gold: grandmother at rank 1)
prince is to princess as grandpa is to grandma (gold: grandma at rank 1)
prince is to princess as grandson is to granddaughter (gold: granddaughter at rank 1)
prince is to princess as groom

stepson is to stepdaughter as dad is to mom (gold: mom at rank 1)
stepson is to stepdaughter as father is to mother (gold: mother at rank 1)
stepson is to stepdaughter as grandfather is to grandmother (gold: grandmother at rank 1)
stepson is to stepdaughter as grandpa is to grandma (gold: grandma at rank 1)
stepson is to stepdaughter as grandson is to granddaughter (gold: granddaughter at rank 1)
stepson is to stepdaughter as groom is to bride (gold: bride at rank 1)
stepson is to stepdaughter as he is to she (gold: she at rank 1)
stepson is to stepdaughter as his is to her (gold: her at rank 1)
stepson is to stepdaughter as husband is to wife (gold: wife at rank 1)
stepson is to stepdaughter as king is to queen (gold: queen at rank 1)
stepson is to stepdaughter as man is to woman (gold: woman at rank 1)
stepson is to stepdaughter as nephew is to niece (gold: niece at rank 1)
stepson is to stepdaughter as policeman is to wounded (gold: policewoman at rank 5)
stepson is to stepdaughter 

(0.9025649790815626, defaultdict(int, {False: 76, True: 430}))

In [101]:
analogy_evaluation(emb_df.loc[giga20.index], src_filename='gram1-adjective-to-adverb.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.4355626943709446, defaultdict(int, {False: 666, True: 326}))

In [102]:
analogy_evaluation(emb_df.loc[giga20.index], src_filename='gram2-opposite.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.474233438099138, defaultdict(int, {False: 504, True: 308}))

In [144]:
analogy_evaluation(emb_df.loc[giga20.index], src_filename='gram3-comparative.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.8872043722165738, defaultdict(int, {False: 216, True: 1116}))

In [135]:
analogy_evaluation(emb_df.loc[giga20.index], src_filename='gram5-present-participle.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.8611142435397475, defaultdict(int, {False: 204, True: 852}))

In [143]:
analogy_evaluation(emb_df.loc[giga20.index], src_filename='gram8-plural.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.8716714306575464, defaultdict(int, {False: 255, True: 1077}))

In [141]:
analogy_evaluation(emb_df.loc[giga20.index], src_filename='capital-world.txt', verbose=False)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(0.95081689665599, defaultdict(int, {False: 285, True: 4239}))

## 4. t-SNE Visualization

In [None]:
vsm.tsne_viz(emb_df.loc[giga20.index])

In [191]:
len(set(giga20.index) & set(glove_df.index) & set(emb_df.index))

4702

## Reference
### Torchtext
- [Torchtext documentation](http://torchtext.readthedocs.io/en/latest/index.html)
- [Torchtext tutorial](http://anie.me/On-Torchtext/)
- [Sentiment analysis using torchtext](https://medium.com/@sonicboom8/sentiment-analysis-torchtext-55fb57b1fab8)

### Others
- [Python3 multi-threading](https://www.ploggingdev.com/2017/01/multiprocessing-and-multithreading-in-python-3/)