In [3]:
import numpy as np
from tqdm import tqdm

glove_dimensions = 300


def load_glove(path='datasets/glove.6B/glove.6B.300d.txt', n=400000):
    """Read a GloVe text file into a word list and a matrix of vectors.

    Each line of the file is expected to be ``word v1 v2 ... v300`` separated
    by single spaces (source: https://nlp.stanford.edu/projects/glove/).

    Args:
        path: Location of the GloVe text file. Defaults to the 6B/300d file
            used by this notebook.
        n: Capacity to pre-allocate, i.e. the expected number of rows. The
            default (400000) is the line count of the default file.

    Returns:
        ``(words, embeddings)`` where ``words`` is a list of strings and
        ``embeddings`` is a float64 array of shape ``(rows_read, 300)``;
        ``embeddings[i]`` is the vector for ``words[i]``.

    Raises:
        IndexError: if the file holds more than ``n`` rows.
        ValueError: if a row does not contain exactly ``glove_dimensions``
            numeric values.
    """
    words = [""] * n
    # np.empty leaves the buffer uninitialised; only the first `count` rows
    # are ever written, so the result is trimmed to `count` before returning.
    embeddings = np.empty([n, glove_dimensions])

    count = 0
    # Context manager closes the handle; explicit encoding avoids depending on
    # the platform's default text encoding.
    with open(path, encoding='utf8') as glove_file:
        for row in glove_file:
            word, *values = row.rstrip("\n").split(" ")
            words[count] = word
            embeddings[count] = np.asarray(values, dtype='float32')
            count += 1

    # Drop unused capacity so callers never see uninitialised rows or the
    # "" placeholder words when the file is shorter than `n`.
    return words[:count], embeddings[:count]


# https://fasttext.cc/docs/en/english-vectors.html
# wiki-news-300d-1M.vec.zip
wiki_dimensions = 300


def load_wiki_news(path='datasets/wiki-news-300d-1M.vec', n=1000000):
    """Read the fastText wiki-news ``.vec`` file into words and vectors.

    Lines are expected to be ``word v1 v2 ... v300`` separated by single
    spaces. Any line that does not carry exactly ``wiki_dimensions`` values
    is reported with ``print("skip")`` and ignored (presumably this is how
    the file's leading header line is dropped -- TODO confirm).

    Args:
        path: Location of the ``.vec`` file.
        n: Capacity to pre-allocate (upper bound on the number of rows).

    Returns:
        ``(words, embeddings)`` where ``words`` is a list of strings and
        ``embeddings`` is a float64 array of shape ``(rows_kept, 300)``;
        ``embeddings[i]`` is the vector for ``words[i]``.

    Raises:
        IndexError: if more than ``n`` valid rows are found.
    """
    words = [""] * n
    # np.empty does not zero the buffer; rows beyond `count` hold arbitrary
    # memory and must not be returned.
    embeddings = np.empty([n, wiki_dimensions])

    count = 0
    # Context manager closes the handle; explicit encoding avoids depending on
    # the platform's default text encoding.
    with open(path, encoding='utf8') as wiki_file:
        for row in tqdm(wiki_file):
            word, *values = row.split(" ")
            embedding = np.asarray(values, dtype='float32')
            if len(embedding) != wiki_dimensions:
                print("skip")
                continue
            words[count] = word
            embeddings[count] = embedding
            count += 1

    # Every skipped line leaves one slot unused at the end; trim them so the
    # caller does not receive "" words paired with uninitialised vectors.
    return words[:count], embeddings[:count]


# https://www.kaggle.com/datasets/ranik40/paragram-300-sl999?resource=download
para_dimensions = 300


def load_paragram(path='datasets/paragram_300_sl999.txt', n=66199):
    """Read the Paragram-SL999 text file into words and vectors.

    Lines are expected to be ``word v1 v2 ... v300`` separated by single
    spaces. Any line that does not carry exactly ``para_dimensions`` values
    is reported with ``print("skip")`` and ignored.

    Args:
        path: Location of the Paragram text file.
        n: Capacity to pre-allocate (upper bound on the number of rows). The
            default is the line count the notebook author read off the file.

    Returns:
        ``(words, embeddings)`` where ``words`` is a list of strings and
        ``embeddings`` is a float64 array of shape ``(rows_kept, 300)``;
        ``embeddings[i]`` is the vector for ``words[i]``.

    Raises:
        IndexError: if more than ``n`` valid rows are found.
    """
    words = [""] * n
    # Sized with this loader's own constant (the original borrowed
    # wiki_dimensions, which only worked because both happen to be 300).
    embeddings = np.empty([n, para_dimensions])

    count = 0
    # NOTE(review): the file's encoding is not verifiable from this notebook;
    # undecodable bytes are dropped rather than aborting the whole load.
    with open(path, encoding='utf8', errors='ignore') as para_file:
        for row in tqdm(para_file):
            word, *values = row.split(" ")
            embedding = np.asarray(values, dtype='float32')
            if len(embedding) != para_dimensions:
                print("skip")
                continue
            words[count] = word
            embeddings[count] = embedding
            count += 1

    # np.empty leaves unused rows uninitialised; return only what was filled.
    return words[:count], embeddings[:count]


# Alternative that is NOT used: concatenating the three sources side by side.
#all_dimensions = glove_dimensions + wiki_dimensions + para_dimensions
# The combined vector is the element-wise sum of the per-source vectors, so it
# keeps 300 dimensions. Rationale for adding embeddings:
# https://randorithms.com/2020/11/17/Adding-Embeddings.html
all_dimensions = 300

In [4]:

#glove_words, glove_embeddings = load_glove()
#np.save('embeddings/glove_embeddings.npy', glove_embeddings)
#np.save('embeddings/glove_words.npy', glove_words)
#wiki_words, wiki_embeddings = load_wiki_news()
#np.save('embeddings/wiki_embeddings.npy', wiki_embeddings)
#np.save('embeddings/wiki_words.npy', wiki_words)
#para_words, para_embeddings = load_paragram()
#np.save('embeddings/para_embeddings.npy', para_embeddings)
#np.save('embeddings/para_words.npy', para_words)

In [5]:
# Load the cached per-source vocabularies and vector matrices.
glove_embeddings = np.load('embeddings/glove_embeddings.npy')
glove_words = np.load('embeddings/glove_words.npy')
wiki_embeddings = np.load('embeddings/wiki_embeddings.npy')
wiki_words = np.load('embeddings/wiki_words.npy')
para_embeddings = np.load('embeddings/para_embeddings.npy')
para_words = np.load('embeddings/para_words.npy')

# word -> vector lookup per source (a repeated word keeps its last vector).
glove_dict = dict(zip(glove_words, glove_embeddings))
wiki_dict = dict(zip(wiki_words, wiki_embeddings))
para_dict = dict(zip(para_words, para_embeddings))

# Union of the three vocabularies. np.unique returns the words sorted, so the
# row order of the saved files is the same on every run (iterating a set of
# strings is not stable across kernels).
words = np.unique(np.concatenate([glove_words, wiki_words, para_words]))
n = words.size

# One row per word, starting at zero. Each source that knows the word adds its
# vector in place; sources that do not know it contribute nothing. (The
# alternative of concatenating the sources into 900 dimensions is not used.)
embeddings = np.zeros([n, all_dimensions])
for i in tqdm(range(n)):
    word = words[i]
    for source_dict in (glove_dict, wiki_dict, para_dict):
        vec = source_dict.get(word)
        if vec is not None:
            embeddings[i] += vec

np.save('embeddings/all_embeddings.npy', embeddings)
np.save('embeddings/all_words.npy', words)

100%|██████████| 1229131/1229131 [00:13<00:00, 89263.91it/s] 


In [6]:
# Reload the combined vocabulary and its vector matrix from the cache files,
# then index the rows by word for direct lookup.
words = np.load('embeddings/all_words.npy')
embeddings = np.load('embeddings/all_embeddings.npy')
word_to_vec = {word: vector for word, vector in zip(words, embeddings)}

In [7]:
def get_k_closest_words(query_vector, words, matrix, k):
    """Return the ``k`` words whose rows of ``matrix`` score highest.

    The score of row ``i`` is the dot product ``matrix[i] . query_vector``
    (no normalisation is applied).

    Args:
        query_vector: 1-D vector to compare against.
        words: Sequence of words; ``words[i]`` labels ``matrix[i]``.
        matrix: 2-D array with one row per word.
        k: Number of words to return. Values larger than the number of rows
            are clamped; zero or negative values yield an empty list.

    Returns:
        List of at most ``k`` words, best score first. Equal scores are
        ordered by word, descending.
    """
    scores = matrix.dot(query_vector)

    # Guard the edges: argpartition raises when k exceeds the row count, and
    # with k == 0 the slice [-0:] would select every row instead of none.
    k = min(k, scores.shape[0])
    if k <= 0:
        return []

    # argpartition finds the top-k indexes without fully sorting all scores;
    # only those k candidates are sorted afterwards.
    best_indexes = np.argpartition(scores, -k)[-k:]
    best_words = np.take(words, best_indexes)
    best_scores = np.take(scores, best_indexes)
    ranked = sorted(zip(best_scores, best_words), reverse=True)
    return [word for _, word in ranked]

In [8]:
# Sanity check: take the stored vector of a single word and list the five
# words whose vectors have the highest dot product with it.
query_vector = word_to_vec["love"]
# Kept as the cell's last bare expression so the notebook displays the list.
get_k_closest_words(query_vector, words, embeddings, 5)

['love', 'loves', 'loving', 'lover', 'loved']

In [10]:
def query_for_words(qwords, word_to_vec):
    """Average the vectors of several words into one query vector.

    Args:
        qwords: Iterable of words to combine; must not be empty.
        word_to_vec: Mapping from word to its vector.

    Returns:
        float64 array: the element-wise mean of the words' vectors.

    Raises:
        ValueError: if ``qwords`` is empty (the mean would be 0/0).
        KeyError: if a word is missing from ``word_to_vec``.

    Side effects:
        Prints the Euclidean norm of the summed (not yet averaged) vector.
    """
    qwords = list(qwords)
    if not qwords:
        raise ValueError("qwords must contain at least one word")

    vectors = [word_to_vec[word] for word in qwords]
    # Size the accumulator from the vectors themselves instead of relying on a
    # notebook-level dimension constant.
    ans = np.zeros(np.shape(vectors[0]))
    for vector in vectors:
        ans += vector

    # Diagnostic output: magnitude of the sum before it is divided.
    print(np.linalg.norm(ans))
    return ans / len(qwords)


# Combine two words into a single query vector (mean of their vectors);
# query_for_words also prints the norm of the summed vector.
qvector = query_for_words(["organized", "led"], word_to_vec)

#print(query_vector)
# Kept as the cell's last bare expression so the notebook displays the list.
get_k_closest_words(qvector, words, embeddings, 5)

16.16255710034245


['organized', 'led', 'organised', 'conducted', 'headed']