In [112]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

In [9]:
import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [150]:
min_count = 2
threshold_for_subsampling = 0.00001 #recommended in Mikolov 2013
neg_context_k = 5 #negative samples
embedding_dim = 100
batch_size = 50
epochs = 10
lr = 0.01
window_size = 5 #dynamic given this value

#do we include punctuation marks?
#what to do about stop words, function words?
#what to do about infrequent words?
#what to do about numbers?
#what should be the output? which candidates should be selected?
#we removed \n
#what to do for word embedding initialization
# Improving Distributional Similarity with Lessons Learned fromWord Embeddings
# Omer Levy Yoav Goldberg Ido Dagan Computer

In [220]:
dataset = 'wa/dev.en'

#read the files
with open(dataset) as f:
    sentences = [l.strip() for l in f.readlines()]

#get all the tokens from the corpus

tokens_list = []
sentence_list = []
for s in sentences:
    split_sent = s.split()
    sentence = []
    for w in split_sent:
        
        tokens_list.append(w)
        sentence.append(w)
#         #filter stopwords
#         if w not in stopwords:
#             tokens_list.append(w)
#             sentence.append(w)
    
    sentence_list.append(sentence)
    
tokens = list(sorted(set(tokens_list)))
print('Vocabulary size', len(tokens))

count_tokens = Counter(tokens_list)

#words appearing fewer than this are not considered words or contexts
#subsample frequent words

temp_sentence_list = []

for s in range(len(sentence_list)):
    temp_sentence_list.append(sentence_list[s])

#Find the infrequent words
for s in range(len(temp_sentence_list)):
    for w in range(len(temp_sentence_list[s])):
        word = temp_sentence_list[s][w]
        if count_tokens[word] < min_count:
            sentence_list[s][w] = '<unk>'
            
            #remove the infrequent words from tokens
            tokens.remove(word)

#we prune rare words altogether

#tokens.append('<unk>')
vocab_size = len(tokens)
print('After rare word pruning', vocab_size)
#remove the infrequent words from the sentences
for s in sentence_list:
    s[:] = [w for w in s if w != '<unk>']

flat_token_list = []

for s in sentence_list:
    for w in s:
        flat_token_list.append(w)

Vocabulary size 322
After rare word pruning 84


In [222]:
subsample_probs = defaultdict(float) 

count_words = Counter(flat_token_list)
print(count_words)
print(sum(count_words.values()))

for t in count_words:
    
    #Distributed Representations of Words and Phrases and their Compositionality
    #Mikolov 2013
    #p = 1 - sqrt(t/f)
    subsample_probs[t] = 1 - np.sqrt(threshold_for_subsampling/(count_words[t]/len(flat_sentence_list)))
    
temp_sentence_list = []

for s in range(len(sentence_list)):
    temp_sentence_list.append(sentence_list[s])

subsampled_sentence_list = []

#Prune frequent words
for s in range(len(temp_sentence_list)):
    sent = []
    for w in range(len(temp_sentence_list[s])):
        
        word = temp_sentence_list[s][w]
        word_prob = subsample_probs[word]
        
        rand_prob = np.random.rand(1)[0]
       
        if rand_prob < word_prob:
            #keep word
            sent.append(temp_sentence_list[s][w])
    
    subsampled_sentence_list.append(sent)

Counter({'.': 34, 'the': 33, 'to': 23, ',': 17, 'of': 14, 'in': 14, 'and': 13, 'that': 12, 'for': 11, 'it': 11, 'we': 11, 'a': 10, 'have': 9, 'is': 7, 'they': 7, 'be': 7, 'are': 6, 'I': 6, 'as': 6, 'very': 5, 'not': 5, 'our': 5, 'been': 5, 'this': 5, '$': 4, 'do': 4, 'what': 4, 'them': 3, 'which': 3, 'people': 3, 'on': 3, 'by': 3, 'one': 3, 'does': 3, 'Bill': 3, '-': 3, 'has': 3, 'there': 3, 'public': 3, 'or': 3, 'at': 3, 'however': 3, 'would': 3, 'want': 3, 'no': 3, 'will': 3, 'Mr.': 3, 'women': 3, 'complex': 2, 'whole': 2, 'most': 2, 'today': 2, 'who': 2, 'effective': 2, 'so': 2, 'let': 2, 'think': 2, 'should': 2, 'look': 2, 'cost': 2, 'some': 2, 'given': 2, 'see': 2, 'up': 2, 'say': 2, 'than': 2, 'programs': 2, 'within': 2, 'country': 2, 'support': 2, '?': 2, 'make': 2, 'Prime': 2, 'Minister': 2, 'need': 2, 'action': 2, 'under': 2, 'Speaker': 2, 'pretends': 2, 'employment': 2, 'equity': 2, 'job': 2, 'growers': 2, 'begin': 2})
423


In [223]:
pickle.dump(subsampled_sentence_list, open('subsampled_dataset.p', 'wb'))

In [224]:
subsampled_sentence_list = pickle.load(open('subsampled_dataset.p', 'rb'))

In [225]:
flat_subsampled_token_list = []

for s in subsampled_sentence_list:
    for w in s:
        flat_subsampled_token_list.append(w)

unigram_counts = Counter(flat_subsampled_token_list)

unigram_probs = defaultdict(float) 

for t in unigram_counts:
    unigram_probs[t] = unigram_counts[t]/len(flat_subsampled_token_list)

pickle.dump(unigram_probs, open('unigram_probs.p', 'wb'))

In [226]:
unigram_probs = pickle.load(open('unigram_probs.p', 'rb'))

In [227]:
def sampled_window(sentence, central_word_index, window_size):
    
    #implicit weighing of the context words
    #dynamic context window size for each word
    dynamic_window_size = randint(1,window_size)
    
    #dws = 2, i-2,i-1,i+1,i+2
    
    pairs = []
    word = sentence[central_word_index]
    for w in range(1,dynamic_window_size+1):
        
        left_cont = central_word_index - w
        right_cont = central_word_index + w
        
        #find the window words to the left and right
        #add as pair if they are inside sentence boundaries
        
        if left_cont > -1:
            pairs.append((word, sentence[left_cont]))
        
        if right_cont < len(sentence):
            pairs.append((word, sentence[right_cont]))
        
    return pairs

In [228]:
neg_sample_probs = defaultdict(float)

neg_normalizer = 0.0

for t in unigram_counts:
    neg_sample_probs[t] = np.power(unigram_counts[t], 0.75)
    neg_normalizer += neg_sample_probs[t]
    

for n in neg_sample_probs:
    neg_sample_probs[n] = neg_sample_probs[n] / neg_normalizer

In [229]:
#à la Mikolov
#table to pick negative context words
#fill the table with each word, count = negsampling prob*table size
neg_table_size = 10000000

unigram_table = np.empty(neg_table_size, dtype=object)
i = 0

for n in neg_sample_probs:
    count = int(neg_sample_probs[n] * neg_table_size)
    
    for c in range(count):
        unigram_table[i] = n
        
        i += 1
    
unigram_table = unigram_table[unigram_table != np.array(None)]
np.random.shuffle(unigram_table)
print(i)
print(len(unigram_table))
print(unigram_table[0:10])

9999973
9999973
['of' 'the' 'should' 'to' 'as' 'what' 'within' 'public' 'would' 'it']


In [259]:
def pick_neg_context(table, k):
    
    neg_samples = []
    
    for i in range(k):
        
        r = randint(0,len(table)-1)
        neg_samples.append(table[r])
        
    return neg_samples    

In [251]:
pos_context_pairs = []
neg_context_samples = []

for sentence in sentence_list:
    for w in range(len(sentence)):
        pos_context_pairs.extend(sampled_window(sentence, w, window_size))
        
        neg_context_samples.append((sentence[w], pick_neg_context(unigram_table, neg_context_k)))

pickle.dump(pos_context_pairs, open('pos_context_pairs.p', 'wb'))
pickle.dump(neg_context_samples, open('neg_context_samples.p', 'wb'))

In [252]:
pos_context_pairs = pickle.load(open('pos_context_pairs.p', 'rb'))
neg_context_samples = pickle.load(open('neg_context_samples.p', 'rb'))

In [250]:
tokens = list(set(flat_subsampled_token_list))

#default dictionary key:id value:token
id2tokens = defaultdict(str)
                        
for i in range(len(tokens)):
    id2tokens[i] = tokens[i]
    
#default dictionary key:token value:id
tokens2id = defaultdict(int)

for ind in id2tokens:
    tokens2id[id2tokens[ind]] = ind
    
vocabulary_size = len(tokens2id)
print(vocabulary_size)

84


In [243]:
#convert dataset to ids
pos_data = []
neg_data = []

for p in pos_context_pairs:
    pos_data.append((tokens2id[p[0]], tokens2id[p[1]]))
    
for n in neg_context_samples:
    
    word_id = tokens2id[n[0]]
    neg_samples = n[1]
    
    neg_ids = []
    
    for ns in neg_samples:
        neg_ids.append(tokens2id[ns])
        
    neg_data.append((word_id, neg_ids))

In [247]:
print(pos_data[0], pos_context_pairs[0], neg_context_samples[0], neg_data[0])

(41, 64) ('of', 'them') ('of', ['it', '?', 'have', 'the', 'and']) (41, [44, 47, 19, 10, 79])


In [257]:
pickle.dump(pos_data, open('pos_data.p', 'wb'))
pickle.dump(neg_data, open('neg_data.p', 'wb'))

In [258]:
pos_data = pickle.load(open('pos_data.p', 'rb'))
neg_data = pickle.load(open('neg_data.p', 'rb'))