# Load Word Embeddings
* We load pretrained word embeddings. Note, these word embeddings should be trained on the same corpus or else they won't be of any use. 
* `complete-512.vec` contains the 512-dimensional word embeddings that were trained on the 50,000 PDFs we downloaded from arXiv. 
* We use gensim to load these word embeddings.

In [2]:
from gensim.models import KeyedVectors
# Pretrained 512-dimensional embeddings, stored in word2vec text format.
filename = "complete-512.vec"
en_model = KeyedVectors.load_word2vec_format(filename)

# Every word that has a pretrained vector, kept as a set for fast membership
# tests.  gensim >= 4.0 removed `KeyedVectors.vocab` in favour of
# `key_to_index`; fall back to `vocab` so older gensim releases keep working.
if hasattr(en_model, "key_to_index"):
    pretrained_words = set(en_model.key_to_index)
else:
    pretrained_words = set(en_model.vocab)

# Obtain top-k most relevant topics
* Using these pretrained word embeddings, we compute the average word embedding of a given abstract over the words for which an embedding is available in the pretrained model. 
* Similarly, we compute an average word embedding for each topic of this abstract. 
* Then, we sort the topics in accordance with their cosine similarity score with the abstract. 
* Pick the top-k

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def _mean_embedding(tokens):
    """Return the mean pretrained vector of *tokens* as a (1, dim) array.

    Tokens without a pretrained embedding are ignored.  Returns ``None`` when
    none of the tokens has one, so callers can skip the text instead of
    dividing by zero.
    """
    vectors = [en_model[tok] for tok in tokens if tok in pretrained_words]
    if not vectors:
        return None
    return np.mean(vectors, axis=0).reshape(1, -1)


def filter_relevant_topics(j, k):
    """Return up to *k* topics of a paper, ranked by similarity to its abstract.

    The abstract and every topic are each represented by the average of the
    pretrained embeddings of their whitespace-separated tokens; topics are
    ranked by cosine similarity between their average and the abstract's.

    Args:
        j: Mapping with an ``"abstract"`` string and a ``"topics"`` iterable
            of topic strings.
        k: Maximum number of topics to return.

    Returns:
        A list of ``(topic, score)`` tuples sorted by descending score, where
        ``topic`` is the topic with its whitespace normalised to single
        spaces and ``score`` is the 1x1 array returned by
        ``cosine_similarity``.  Topics with no token in the pretrained
        vocabulary are dropped.  The list is empty when the abstract itself
        has no token in the vocabulary (this used to raise
        ``ZeroDivisionError``).
    """
    abstract_emb = _mean_embedding(j["abstract"].split())
    if abstract_emb is None:
        # Nothing to compare the topics against.
        return []

    scored = []
    for raw_topic in j["topics"]:
        atoms = raw_topic.split()
        topic_emb = _mean_embedding(atoms)
        if topic_emb is not None:
            score = cosine_similarity(abstract_emb, topic_emb)
            scored.append((" ".join(atoms), score))

    # Sort on the scalar inside the 1x1 similarity matrix; the sort is stable,
    # so tied topics keep their original order.
    scored.sort(key=lambda pair: pair[1][0, 0], reverse=True)

    # Slicing already copes with fewer than k entries.
    return scored[:k]


# Process entire dataset
* For every abstract in our corpus, obtain the top-k topics by cosine similarity relevance to the average abstract embedding. 

In [10]:
import json
k = 3  # number of topics to keep per abstract
# (title, [topic, ...]) for every paper that yielded exactly k ranked topics.
found_k = []
with open("LARGE-CORPUS.txt") as f:
    # Each non-empty line is parsed as one JSON object.
    for line in f:
        # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
        if not line.strip():
            continue
        j = json.loads(line)
        # k must be passed explicitly: filter_relevant_topics has no default
        # for it, so calling it with `j` alone raises TypeError.
        most_relevant = filter_relevant_topics(j, k)
        if len(most_relevant) == k:
            found_k.append((j["title"], [t for t, _ in most_relevant]))


# Randomly viewing chosen topics

In [15]:
import random
# Spot-check: show one randomly chosen (title, topics) entry from found_k.
# random.choice raises IndexError if found_k is empty.
random.choice(found_k)

('visual scene representations : contrast , scaling and occlusion',
 ['sampling ( signal processing )', 'computer vision', 'approximation'])