In [1]:
import re
import nltk
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to /home/xwan6/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# load the raw data
import itertools
corpus = []
for cat in ['news']:
    for text_id in brown.fileids(cat):
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        text.replace('\n', ' ')
        text = re.sub('[^a-z ]+', '', text)
        corpus.append([w for w in text.split() if w != ''])

In [7]:
corpus[0]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of',
 'atlantas',
 'recent',
 'primary',
 'election',
 'produced',
 'no',
 'evidence',
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 'the',
 'jury',
 'further',
 'said',
 'in',
 'termend',
 'presentments',
 'that',
 'the',
 'city',
 'executive',
 'committee',
 'which',
 'had',
 'overall',
 'charge',
 'of',
 'the',
 'election',
 'deserves',
 'the',
 'praise',
 'and',
 'thanks',
 'of',
 'the',
 'city',
 'of',
 'atlanta',
 'for',
 'the',
 'manner',
 'in',
 'which',
 'the',
 'election',
 'was',
 'conducted',
 'the',
 'septemberoctober',
 'term',
 'jury',
 'had',
 'been',
 'charged',
 'by',
 'fulton',
 'superior',
 'court',
 'judge',
 'durwood',
 'pye',
 'to',
 'investigate',
 'reports',
 'of',
 'possible',
 'irregularities',
 'in',
 'the',
 'hardfought',
 'primary',
 'which',
 'was',
 'won',
 'by',
 'mayornominate',
 'ivan',
 'allen',
 'jr',
 'only',
 'a',
 'relative',
 'handful',
 'of

In [6]:
# subsampling frequent words
from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(list(word_counts.values()))
    word_counts = {word: word_counts[word]/float(sum_word_counts) for word in word_counts}
    for text in corpus:
        filtered_corpus.append([])
        for word in text:
            if random.random() < (1+math.sqrt(word_counts[word] * 1e3)) * 1e-3 / float(word_counts[word]):
                filtered_corpus[-1].append(word)
    return filtered_corpus

In [10]:
corpus = subsample_frequent_words(corpus)
vocabulary = set(itertools.chain.from_iterable(corpus))

word_to_index = {w: idx for (idx, w) in enumerate(vocabulary)}
index_to_word = {idx: w for (idx, w) in enumerate(vocabulary)}

In [11]:
# build bag of words
import numpy as np

context_tuple_list = []
w = 4

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                context_tuple_list.append((word, text[j]))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

There are 443768 pairs of target and context words


In [12]:
context_tuple_list[0]

('fulton', 'county')

In [29]:
# get batches
import random

def get_batches(context_tuple_list, batch_size=100):
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context, batch_negative = [], [], []
    for i in range(len(context_tuple_list)):
        batch_target.append(word_to_index[context_tuple_list[i][0]])
        batch_context.append(word_to_index[context_tuple_list[i][1]])
        batch_negative.append([word_to_index[w] for w in context_tuple_list[i][2]])
        if (i+1) % batch_size == 0 or i == len(context_tuple_list)-1:
            tensor_target = torch.from_numpy(np.array(batch_target)).long()
            tensor_context = torch.from_numpy(np.array(batch_context)).long()
            tensor_negative = torch.from_numpy(np.array(batch_negative)).long()
            batches.append((tensor_target, tensor_context, tensor_negative))
            batch_target, batch_context, batch_negative = [], [], []
    return batches

In [28]:
# get negatie samples
from numpy.random import multinomial

def sample_negative(sample_size):
    sample_probability = {}
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                 word_list.append(words[index])
        yield word_list

In [27]:
import numpy as np

context_tuple_list = []
w = 4
negative_samples = sample_negative(8)

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                context_tuple_list.append((word, text[j], next(negative_samples)))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

There are 443768 pairs of target and context words


In [71]:
# create model
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

In [76]:
class Word2Vec(nn.Module):

    def __init__(self, vocab_size,embedding_size):
        super(Word2Vec,self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size, sparse=True)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size, sparse=True)
        self.init_emb()
        
    def init_emb(self):
        """Initialize embedding weight like word2vec.
        The u_embedding is a uniform distribution in [-0.5/em_size, 0.5/emb_size], and the elements of v_embedding are zeroes.
        Returns:
            None
        """
        initrange = 0.5 / self.embedding_size
        self.embeddings_target.weight.data.uniform_(-initrange, initrange)
        self.embeddings_context.weight.data.uniform_(-0, 0)
        
    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        emb_product = torch.mul(emb_target, emb_context).squeeze()
        emb_product = torch.sum(emb_product, dim=1)
        out = torch.sum(F.logsigmoid(emb_product))
        emb_negative = self.embeddings_context(negative_example)
        emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
        emb_product = torch.sum(emb_product, dim=1)
        out += torch.sum(F.logsigmoid(-emb_product))
        return -out

In [77]:
# Early stopping
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

In [78]:
# training
vocab_size = len(vocabulary)
embedding_size = 300
model = Word2Vec(vocab_size,embedding_size)
# use gpu
model.cuda()

Word2Vec(
  (embeddings_target): Embedding(12132, 300, sparse=True)
  (embeddings_context): Embedding(12132, 300, sparse=True)
)

In [79]:
# check the total parameters of model
total_param  = []
for p in model.parameters():
    total_param.append(int(p.numel()))
sum(total_param)

7279200

In [88]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.025)
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

In [89]:
# training
import time

# Set model to train
model.train()

epochs = 100
for i in range(epochs):
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for i in range(len(context_tuple_batches)):
        optimizer.zero_grad()
        target_tensor, context_tensor, negative_tensor = context_tuple_batches[i]
        target_tensor = Variable(target_tensor)
        context_tensor = Variable(context_tensor)
        negative_tensor = Variable(negative_tensor)
        # use the gpu
        target_tensor = target_tensor.cuda()
        context_tensor = context_tensor.cuda()
        negative_tensor = negative_tensor.cuda()
        
        loss = model(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print("Loss: ", np.mean(losses))
    #early_stopping.update_loss(np.mean(losses))
    #if early_stopping.stop_training():
        #break

Loss:  2443.624174650725
Loss:  1973.07513482721
Loss:  1887.940788750176
Loss:  1850.3224124392948
Loss:  1819.3338623046875
Loss:  1786.5457466744087
Loss:  1748.750718125352
Loss:  1703.162789009713
Loss:  1647.2410800693272
Loss:  1579.1900167379292
Loss:  1497.3632378105644
Loss:  1402.6745528487472
Loss:  1297.0658305400127
Loss:  1185.3376717782235
Loss:  1071.9927896035683
Loss:  959.3160040228216
Loss:  851.5809922776781
Loss:  748.6580651085656
Loss:  652.9136391029701
Loss:  566.6815698297174
Loss:  489.9911404171506
Loss:  422.633557018933
Loss:  364.29344809592305
Loss:  314.5074945398279
Loss:  272.5231519132047
Loss:  237.43214849523596
Loss:  207.77155022148614
Loss:  182.97723113738738
Loss:  161.8713275806324
Loss:  144.56596340145077
Loss:  129.89649561289195
Loss:  117.32255093686216
Loss:  106.513052931777
Loss:  97.23506267650707
Loss:  88.83453015164213
Loss:  82.10582413544526
Loss:  75.6556672792177
Loss:  70.45898820688059
Loss:  65.7153518264358
Loss:  61.304

In [90]:
emb = model.embeddings_target

In [91]:
emb

Embedding(12132, 300, sparse=True)

In [95]:
emb

Embedding(12132, 300, sparse=True)

In [100]:
emb.weight.detach().cpu().numpy()

array([[-0.07422798,  0.00573754, -0.03458479, ..., -0.00224186,
        -0.01031344, -0.00758016],
       [ 0.28183687,  0.01303123, -0.02787262, ...,  0.38275105,
         0.24691202,  0.0876765 ],
       [-0.01674725, -0.02723128,  0.08544952, ..., -0.00305449,
         0.07062403,  0.05459627],
       ...,
       [ 0.00252927,  0.05711698,  0.03951392, ...,  0.52196735,
         0.38400954, -0.19426127],
       [-0.07613605, -0.25332096, -0.1483818 , ...,  0.1908881 ,
        -0.00184861, -0.02639896],
       [-0.09747757, -0.19539762,  0.37105843, ...,  0.5234165 ,
         0.23794156,  0.09051089]], dtype=float32)