In [1]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle
import nltk
import string
puncs = set(string.punctuation)

In [2]:
min_count = 20
threshold_for_subsampling = 0.00001 #recommended in Mikolov 2013
neg_context_k = 5 #negative samples
embedding_dim = 100
batch_size = 100
window_size = 5 #dynamic given this value

#removed punctuation marks, removed infrequent ones, subsampled frequent ones
#negative samples done 

#what should be the output? which candidates should be selected?
#what to do for word embedding initialization
# Improving Distributional Similarity with Lessons Learned fromWord Embeddings
# Omer Levy Yoav Goldberg Ido Dagan Computer

In [3]:
def sampled_window(sentence, central_word_index, window_size):
    
    #implicit weighing of the context words
    #dynamic context window size for each word
    dynamic_window_size = randint(1,window_size)
    
    #dws = 2, i-2,i-1,i+1,i+2
    
    pairs = []
    word = sentence[central_word_index]
    for w in range(1,dynamic_window_size+1):
        
        left_cont = central_word_index - w
        right_cont = central_word_index + w
        
        #find the window words to the left and right
        #add as pair if they are inside sentence boundaries
        
        if left_cont > -1:
            pairs.append((word, sentence[left_cont]))
            
        if right_cont < len(sentence):
            pairs.append((word, sentence[right_cont]))
            
    return pairs

In [4]:
def pick_neg_context(table, k, word, pos_pairs_dict):
    
    neg_samples = []
    
    for i in range(k):
        
        r = randint(0,len(table)-1)
        
        possible_neg = table[r]
        
        while possible_neg in pos_pair_dict[word]:
            
            r = randint(0,len(table)-1)
        
            possible_neg = table[r]
                    
        neg_samples.append(possible_neg)
        
    return neg_samples    

In [5]:
# dataset = 'europarl/training.en'
# label = '_europarl_en'

# dataset = 'europarl/training.fr'
# label = '_europarl_fr'

dataset = 'hansards/training.en'
label = '_hansard_en'

# dataset = 'hansards/training.fr'
# label = '_hansard_fr'

# dataset = 'wa/dev.en'
# label = '_dev'

#read the files
with open(dataset) as f:
    sentences = [l.strip() for l in f.readlines()]

#get all the tokens from the corpus

tokens_list = []
sentence_list = []
for s in sentences:
    split_sent = s.split()
    sentence = []
    for w in split_sent:
        
        tokens_list.append(w)
        sentence.append(w)
#         #filter stopwords
#         if w not in stopwords:
#             tokens_list.append(w)
#             sentence.append(w)
    
    sentence_list.append(sentence)
    
tokens = list(sorted(set(tokens_list)))
print('Vocabulary size', len(tokens))

count_tokens = Counter(tokens_list)

#words appearing fewer than this are not considered words or contexts
#subsample frequent words

Vocabulary size 36635


In [8]:
#Find the infrequent words
for s in range(len(sentence_list)):
    for w in range(len(sentence_list[s])):
        word = sentence_list[s][w]
        if count_tokens[word] < min_count:
            sentence_list[s][w] = '<unk>'
            
            #remove the infrequent words from tokens
            if   word in tokens:
                tokens.remove(word)

#we prune rare words altogether

tokens.append('<unk>')
vocab_size = len(tokens)
print('After rare word pruning', vocab_size)

After rare word pruning 7981


In [9]:
#remove the infrequent words from the sentences
for s in sentence_list:
    s[:] = [w for w in s if w != '<unk>' and w not in puncs]

flat_token_list = []

for s in sentence_list:
    for w in s:
        flat_token_list.append(w)

In [10]:
subsample_probs = defaultdict(float) 

count_words = Counter(flat_token_list)

for c in count_words:
    
    #Distributed Representations of Words and Phrases and their Compositionality
    #Mikolov 2013
    #p = 1 - sqrt(t/f)
    #Levy&Goldberg, actual implementation in code p = (f-t)/f - sqrt(t/f)
    
    f = count_words[c]/len(flat_token_list)
    t = threshold_for_subsampling
    #subsample_probs[t] = 1 - np.sqrt(threshold_for_subsampling/(count_words[t]/len(flat_token_list)))
    subsample_probs[c] =  (f-t)/f - np.sqrt(t/f)
    
temp_sentence_list = []

for s in range(len(sentence_list)):
    temp_sentence_list.append(sentence_list[s])

subsampled_sentence_list = []

#Prune frequent words
for s in range(len(temp_sentence_list)):
    sent = []
    for w in range(len(temp_sentence_list[s])):
        
        word = temp_sentence_list[s][w]
        word_prob = subsample_probs[word]
        
        rand_prob = np.random.rand(1)[0]
       
        if rand_prob < word_prob:
            #keep word
            sent.append(temp_sentence_list[s][w])
    
    subsampled_sentence_list.append(sent)

In [11]:
pickle.dump(subsampled_sentence_list, open('subsampled_dataset' + label + '.p', 'wb'))

subsampled_sentence_list = pickle.load(open('subsampled_dataset' + label + '.p', 'rb'))

In [12]:
flat_subsampled_token_list = []

for s in subsampled_sentence_list:
    for w in s:
        flat_subsampled_token_list.append(w)

unigram_counts = Counter(flat_subsampled_token_list)

unigram_probs = defaultdict(float) 

for t in unigram_counts:
    unigram_probs[t] = unigram_counts[t]/len(flat_subsampled_token_list)

pickle.dump(unigram_probs, open('unigram_probs' + label + '.p', 'wb'))

unigram_probs = pickle.load(open('unigram_probs' + label + '.p', 'rb'))

In [13]:
neg_sample_probs = defaultdict(float)

neg_normalizer = 0.0

for t in unigram_probs:
    neg_sample_probs[t] = np.power(unigram_probs[t], 0.75)
    neg_normalizer += neg_sample_probs[t]
    
for n in neg_sample_probs:
    neg_sample_probs[n] = neg_sample_probs[n] / neg_normalizer

In [14]:
#à la Mikolov
#table to pick negative context words
#fill the table with each word, count = negsampling prob*table size
neg_table_size = 10000000

unigram_table = np.empty(neg_table_size, dtype=object)
i = 0

for n in neg_sample_probs:
    count = int(neg_sample_probs[n] * neg_table_size)
    
    for c in range(count):
        unigram_table[i] = n
        
        i += 1
    
unigram_table = unigram_table[unigram_table != np.array(None)]
np.random.shuffle(unigram_table)
print(i)
print(len(unigram_table))
print(unigram_table[0:10])

9998286
9998286
['lives' 'water' 'this' 'negative' 'as' 'any' 'public' 'while' 'created'
 'to']


In [15]:
pos_context_pairs = []
neg_context_samples = []

for sentence in sentence_list:
    for w in range(len(sentence)):
        
        pos_context_pairs.extend(sampled_window(sentence, w, window_size))


pickle.dump(pos_context_pairs, open('pos_context_pairs' + label + '.p', 'wb'))


In [16]:
pos_pair_dict = defaultdict(set)
        
for pair in pos_context_pairs:
    pos_pair_dict[pair[0]].add(pair[1])

In [17]:
with open('pos_pair_dict' + label + '.p', 'wb') as f:
    pickle.dump(pos_pair_dict, f)

In [18]:
with open('pos_pair_dict' + label + '.p', 'rb') as f:
    pos_pair_dict = pickle.load(f)

In [19]:
for p in range(len(pos_context_pairs)):
    
    pair = pos_context_pairs[p]
    neg_context_samples.append((pair[0], pick_neg_context(unigram_table, neg_context_k, pair[0], pos_pair_dict)))
    
pickle.dump(neg_context_samples, open('neg_context_samples' + label + '.p', 'wb'))

In [20]:
print(len(pos_context_pairs), len(neg_context_samples))
print(pos_context_pairs[0], neg_context_samples[0])

17127853 17127853
('division', '68') ('division', ['bring', 'decide', 'information', 'witnesses', 'Nisga'])


In [21]:
pos_context_pairs = pickle.load(open('pos_context_pairs' + label + '.p', 'rb'))
neg_context_samples = pickle.load(open('neg_context_samples' + label + '.p', 'rb'))

In [22]:
tokens = list(set(flat_subsampled_token_list))

#default dictionary key:id value:token
id2tokens = defaultdict(str)
                        
for i in range(len(tokens)):
    id2tokens[i] = tokens[i]
    
#default dictionary key:token value:id
tokens2id = defaultdict(int)

for ind in id2tokens:
    tokens2id[id2tokens[ind]] = ind
    
vocabulary_size = len(tokens2id)
print(vocabulary_size)

3276


In [23]:
#convert dataset to ids
pos_data = []
neg_data = []

for p in pos_context_pairs:
    pos_data.append((tokens2id[p[0]], tokens2id[p[1]]))
    
for n in neg_context_samples:
    
    word_id = tokens2id[n[0]]
    neg_samples = n[1]
    
    neg_ids = []
    
    for ns in neg_samples:
        neg_ids.append(tokens2id[ns])
        
    neg_data.append((word_id, neg_ids))

In [24]:
print(len(pos_data), len(neg_data), len(pos_context_pairs), len(neg_context_samples))

17127853 17127853 17127853 17127853


In [25]:
print(pos_data[0], pos_context_pairs[0], neg_context_samples[0], neg_data[0])

(747, 1469) ('division', '68') ('division', ['bring', 'decide', 'information', 'witnesses', 'Nisga']) (747, [1747, 2938, 1613, 2526, 345])


In [26]:
pickle.dump(pos_data, open('pos_data' + label + '.p', 'wb'))
pickle.dump(neg_data, open('neg_data' + label + '.p', 'wb'))

In [27]:
with open('pos_data' + label + '.p', 'rb') as f:
    pos_data = pickle.load(f)
    
with open('neg_data' + label + '.p', 'rb') as f:
    neg_data = pickle.load(f)

In [6]:
with open('unigram_probs' + label +'.p', 'rb') as f:
    unigram_probs = pickle.load(f)

tokens = list(unigram_probs.keys())
#default dictionary key:id value:token
id2tokens = defaultdict(str)
                        
for i in range(len(tokens)):
    id2tokens[i] = tokens[i]
    
#default dictionary key:token value:id
tokens2id = defaultdict(int)

for ind in id2tokens:
    tokens2id[id2tokens[ind]] = ind
    
vocabulary_size = len(tokens2id)
print(vocabulary_size)

3276


In [7]:
with open('tokens2id' +label+'.pickle', 'wb') as f:
    pickle.dump(tokens2id,f)
with open('id2tokens' +label+'.pickle', 'wb') as f:
    pickle.dump(id2tokens,f)