# Prepare GloVe vectors

In [1]:
import numpy as np
import pickle
import operator

In [2]:
def loadGloveModel(gloveFile, embedding_dim=300):
    """Load GloVe vectors from a plain-text file into a dictionary.

    Every line is expected to look like ``word v1 v2 ... vN`` with the
    fields separated by single spaces.

    Args:
        gloveFile: Path of the GloVe text file to read.
        embedding_dim: Number of values expected after the word on each
            line. Defaults to 300.

    Returns:
        A dict mapping each word (str) to its vector (list of float).
        If a malformed line is found, an error is printed, loading stops
        at that line, and the entries parsed so far are returned.
    """
    print ("Loading Glove Model")

    model = {}
    # Iterate over the file lazily instead of readlines(): the parsed dict is
    # already large, so there is no reason to also hold every raw line.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Split on a single space only (not arbitrary whitespace) so that
            # tokens containing other whitespace characters stay intact.
            splitLine = line.split(' ')
            word = splitLine[0]

            try:
                embedding = [float(val) for val in splitLine[1:]]
            except ValueError as e:
                # Report only values that are guaranteed to exist here.
                print('error:', 'line', line_no, repr(word), e)
                break

            if len(embedding) != embedding_dim:
                print('error:', 'line', line_no, repr(word),
                      'expected', embedding_dim, 'values, got', len(embedding))
                break

            model[word] = embedding

        print ("Done.",len(model)," words loaded!")

    return model

In [3]:
def cal_coverage(list_voca, glove):
    """Print how much of a vocabulary is covered by the GloVe dictionary.

    Args:
        list_voca: Sequence of vocabulary tokens.
        glove: Container supporting ``token in glove`` (here the dict
            returned by ``loadGloveModel``).

    Prints the number of tokens missing from ``glove`` and the coverage
    ratio ``1 - missing / len(list_voca)``. An empty vocabulary is
    reported as coverage 0.0 instead of raising ZeroDivisionError.
    Returns nothing.
    """
    missing = sum(1 for token in list_voca if token not in glove)
    total = len(list_voca)

    # Guard the division: with no tokens there is nothing to cover.
    coverage = 1 - (missing / float(total)) if total else 0.0

    print ('# missing token : ' + str(missing))
    print ('coverage : ' + str(coverage))

In [4]:
def create_glove_embedding(list_voca, glove, pad_token='_PAD_', embedding_dim=300):
    """Build one embedding vector per vocabulary token, in vocabulary order.

    Args:
        list_voca: Sequence of vocabulary tokens. The pad token is expected
            to be the first entry.
        glove: Mapping from token to its vector (here the dict returned by
            ``loadGloveModel``).
        pad_token: Token that receives an all-zero vector when it is not
            present in ``glove``.
        embedding_dim: Length of the vectors generated for the pad token and
            for tokens missing from ``glove``. Defaults to 300.

    Returns:
        A list with one entry per token of ``list_voca``:
        the ``glove`` vector if the token is in ``glove`` (this lookup is
        done first, so a pad token found in ``glove`` keeps that vector);
        ``np.zeros(embedding_dim)`` for the pad token; otherwise a list of
        values drawn uniformly from [-0.25, 0.25).

    Raises:
        AssertionError: if the pad token is reached after other vectors
            have already been added, i.e. it is not first in ``list_voca``.

    Also prints the coverage ratio. The pad token is not counted as missing,
    and an empty vocabulary is reported as coverage 0.0 instead of raising
    ZeroDivisionError.
    """
    print('be aware: pad token should be placed in the first index of list_voca')

    list_glove_voca = []
    cnt = 0  # tokens initialised randomly because they are not in glove

    for token in list_voca:
        if token in glove:
            list_glove_voca.append( glove[token] )
        elif token == pad_token:
            print ('add PAD as 0s')
            assert len(list_glove_voca) == 0, \
                'pad token must be the first entry of list_voca'
            list_glove_voca.append( np.zeros(embedding_dim) )
        else:
            list_glove_voca.append(
                np.random.uniform(-0.25, 0.25, embedding_dim).tolist() )
            cnt = cnt + 1

    total = len(list_voca)
    # Guard the division: with no tokens there is nothing to cover.
    coverage = 1 - (cnt / float(total)) if total else 0.0

    print ('coverage : ' + str(coverage))
    return list_glove_voca

## 01 load GloVe model

In [5]:
# Parse the GloVe text file into a word -> vector dictionary.
glove_path = '../data/raw/embedding/glove.840B.300d.txt'
glove = loadGloveModel(glove_path)

Loading Glove Model
Done. 2196016  words loaded!


## 02 read dictionary

In [13]:
# Dictionary file of the dataset variant being processed.
# Other variants this cell has been pointed at:
#   ../data/news-19_paragraph_swap-random-1m/whole/dic_mincutN.txt
#   ../data/headline_swap_news_v2/whole/dic_mincutN.txt
#   ../data/headline_swap_news_v2.5/whole/dic_mincutN.txt
#   ../data/paragraph_swap_news_v2.5/whole/dic_mincutN.txt
dic_path = '../data/headline_swap_news_v2.5_mf8/whole/dic_mincutN.txt'

list_dic = []
with open(dic_path, 'r') as f:
    # Each line becomes one entry, with surrounding whitespace removed.
    list_dic = [entry.strip() for entry in f.readlines()]

print('dic size:', len(list_dic))

# Report how many dictionary entries have a pre-trained GloVe vector.
cal_coverage(list_dic, glove)

dic size: 420830
# missing token : 189030
coverage : 0.550816244089062


In [14]:
# The empty string is passed as the pad token here; for dictionaries that
# use an explicit marker, pass pad_token='_PAD_' instead.
list_glove_voca = create_glove_embedding(list_dic, glove, pad_token='')

# Show the number of vectors built and the vector stored at index 0.
pad_vector = list_glove_voca[0]
print('glove size:', len(list_glove_voca))
print('pad:', pad_vector)

be aware: pad token should be placed in the first index of list_voca
add PAD as 0s
coverage : 0.5508186203455077
glove size: 420830
pad: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

## 03 store as NumPy array

In [15]:
# Stack the per-token vectors into a single float32 matrix
# (one row per entry of list_glove_voca, in the same order).
np_glove = np.array(list_glove_voca, dtype=np.float32)
print (np_glove.shape)

(420830, 300)


In [16]:
# Write the embedding matrix next to the dictionary it was built from.
# Output locations used for the other dataset variants:
#   ../data/news-19_paragraph_swap-random-1m/whole/W_embedding.npy
#   ../data/headline_swap_news_v2/whole/W_embedding.npy
#   ../data/headline_swap_news_v2.5/whole/W_embedding.npy
#   ../data/paragraph_swap_news_v2.5/whole/W_embedding.npy
embedding_path = '../data/headline_swap_news_v2.5_mf8/whole/W_embedding.npy'
np.save(embedding_path, np_glove)