# Pseudo Document Generation

Import packages

In [20]:
from spherecluster import  VonMisesFisherMixture, sample_vMF
import re
import warnings

# Install a blanket "ignore" filter: no category is given, so warnings of every kind are hidden.
warnings.filterwarnings(action='ignore')

In [21]:
import pandas as pd
import numpy as np
from gensim.models import  KeyedVectors

Load word vectors

In [22]:
# Path of the saved gensim vectors (relative to the notebook's working directory).
filename = "word vectors.kv"
# mmap='r' is passed through to gensim's loader to request read-only memory mapping.
model = KeyedVectors.load(filename, mmap='r')

In [23]:
# Copy the raw embedding matrix; each row is the vector of one vocabulary entry.
word_embedding = np.array(model.wv.vectors)
# Take the words in row order. `index2word[k]` is the word stored in row k of
# `vectors`, whereas iterating the `vocab` dict follows insertion order, which
# is not guaranteed to match the row order (gensim re-sorts rows by frequency).
# Later cells label the columns of `words_df` with this list, so it must be
# aligned with the rows of `word_embedding`.
vocab = list(model.wv.index2word)

Normalize every word embedding to unit L2 norm

In [24]:
# Euclidean (L2) length of every row; despite its name, `linfnorm` is computed
# with ord=2, not the infinity norm.
linfnorm = np.linalg.norm(word_embedding, ord=2, axis=1)
# Turn the norms into a column so that each row is divided by its own length.
word_embedding = np.divide(word_embedding, linfnorm[:, np.newaxis])

In [25]:
# Transpose so that every column is labelled with an entry of `vocab`;
# `words_df[word]` is then used by later cells as that word's vector.
words_df = pd.DataFrame(word_embedding.T, columns=vocab)

Load classes and keywords

In [26]:
# Read the whole keyword file; the context manager closes the handle promptly.
with open('class keywords.txt', encoding='utf-8') as keywords_file:
    class_keywords_str = keywords_file.read()

# Every non-blank line has the form "<class>: <keyword>, <keyword>, ...".
# Blank lines (e.g. the one produced by a trailing newline) are skipped instead
# of raising IndexError; each line is split only once.
class_keywords = {
    fields[0]: fields[1].split(', ')
    for fields in (
        line.split(': ')
        for line in class_keywords_str.split('\n')
        if line.strip()
    )
}

In [27]:
#class_keywords

Obtain word vectors for each keyword from every class

In [28]:
# Replace each class's keyword list with the 100 words most similar to the
# class name itself (only the class names from the previous cell are reused).
class_keywords = {
    topic: [neighbour for neighbour, _ in model.wv.most_similar(topic, topn=100)]
    for topic in class_keywords
}
# Look up the embedding vector of every keyword, grouped by class.
class_keywords_supplied = {
    class_label: [np.array(words_df[keyword]) for keyword in keywords]
    for class_label, keywords in class_keywords.items()
}

Fit a von Mises-Fisher (vMF) distribution for every class and keep its cluster center and concentration (dispersion) parameter

In [29]:
# Maps class label -> (mean direction, concentration) of its fitted vMF.
topic_vMFs = {}

for class_label, keyword_vectors in class_keywords_supplied.items():
    # Stack the keyword vectors of this class into one (n_keywords, dim) matrix.
    keyword_mtx = np.vstack(keyword_vectors)
    # A single-component mixture, i.e. one vMF distribution per class.
    mixture = VonMisesFisherMixture(n_clusters=1, n_jobs=10, max_iter=20)
    mixture.fit(keyword_mtx)
    topic_vMFs[class_label] = (mixture.cluster_centers_[0], mixture.concentrations_[0])

Build background words

Load word counts

In [30]:
# The .npy file holds a pickled Python dict (word -> count) wrapped in a 0-d
# object array, hence the `.item()`. NumPy >= 1.16.3 refuses to unpickle unless
# allow_pickle=True is given explicitly.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
word_counts = np.load('word_counts.npy', allow_pickle=True).item()
# Total number of counted tokens, used to turn counts into frequencies.
total_length = sum(word_counts.values())

Build the background word distribution that pseudo documents are sampled from

In [31]:
# Relative frequency of every counted word, stored as a one-row frame with one
# column per word.
word_distributions = pd.DataFrame.from_records(
    {word: count / total_length for word, count in word_counts.items()},
    index=[0],
)
# Keep only the embedding vocabulary, in the same column order as `words_df`.
word_distributions = word_distributions[words_df.columns]

In [32]:
def generateWordDistribution(alpha, word_distributions, top_n_keywords, words_df, topic):
    """ Generates the word distribution of one pseudo document for a topic

        Input: alpha - balancing parameter between background words and keywords
                       (weight alpha on the background, 1 - alpha on the keywords)
               word_distributions - one-row DataFrame of background word frequencies,
                                    with columns in the same order as words_df
               top_n_keywords - number n of most similar words kept as keywords
               words_df - DataFrame with one column per word holding its embedding
               topic - class label, used as key into the global topic_vMFs

        Output: a 1-D array of word probabilities that sums to 1, indexed like the
                columns of words_df
    """
    # draw one document direction from the vMF distribution fitted for this topic
    mu, kappa = topic_vMFs[topic]
    di = sample_vMF(mu, kappa, num_samples = 1)

    # exponentiated dot product of the sampled direction with every word vector
    di_similarities = np.exp(np.dot(di, words_df.values).ravel())
    # indices of the words, most similar first
    ranked_index = np.argsort(di_similarities)[::-1]

    # keep only the top n words; every other word gets zero keyword probability
    di_similarities[ranked_index[top_n_keywords:]] = 0

    # normalise the surviving similarities into a keyword distribution
    keywords_distributions = di_similarities / np.sum(di_similarities)
    background_words = word_distributions.values.ravel()

    # mix background and keyword distributions
    document_distributions = (alpha * np.array(background_words)
                              + (1 - alpha) * keywords_distributions.ravel())

    # The background frequencies are restricted to the embedding vocabulary and so
    # need not sum to 1; renormalise so the result is a valid probability vector
    # (np.random.choice rejects vectors that do not sum to 1).
    return document_distributions / np.sum(document_distributions)

Build function that generates pseudo label vector

In [33]:
def generatePseudoLabels(alpha, word_distributions, topic):
    """ Builds the smoothed pseudo label vector for a topic

        Input: alpha - total probability mass spread uniformly over all entries
               word_distributions - one-row DataFrame whose columns are the words;
                                    only its size and column names are used
               topic - column name that receives the remaining 1 - alpha mass
        Output: a 1-D array with alpha / n in every position and an extra
                1 - alpha at the position of the topic column
    """
    vocab_size = word_distributions.values.size
    # uniform smoothing: every entry starts with an equal share of alpha
    label_vector = np.full(vocab_size, alpha, dtype=float) / vocab_size
    # position of the topic among the columns (ValueError if it is absent)
    topic_position = list(word_distributions.columns).index(topic)
    label_vector[topic_position] += 1 - alpha
    return label_vector

Build function that generates labelled pseudo documents

In [34]:
def generateLabelledPseudoDocuments(alpha, doc_length, num_docs, top_n_keywords=20):
    """ Generates labelled pseudo documents for every topic

        Input: alpha - balancing parameter between background words and keywords
               doc_length - number of words sampled for each pseudo document
               num_docs - number of pseudo documents generated per topic
               top_n_keywords - number of most similar words kept as keywords in
                                each document distribution (default 20)
        Output: a dict mapping every topic to a tuple (pseudo docs, pseudo labels);
                pseudo docs is a list of arrays of sampled word indices and
                pseudo labels is a list with one label vector per document

        Reads the notebook globals class_keywords_supplied, word_distributions
        and words_df.
    """
    topic_docs = {}
    for topic in class_keywords_supplied.keys():
        # the label depends only on alpha and the topic, so build it once per topic
        topic_label = generatePseudoLabels(alpha, word_distributions, topic)

        pseudo_docs = []
        pseudo_labels = []
        for _ in range(num_docs):
            # every document gets its own freshly sampled word distribution
            document_distribution = generateWordDistribution(
                alpha, word_distributions, top_n_keywords, words_df, topic)
            # a document is doc_length word indices drawn from that distribution
            pseudo_docs.append(np.random.choice(len(document_distribution),
                                                size=doc_length,
                                                p=document_distribution))
            # independent copy, so editing one label never affects the others
            pseudo_labels.append(np.copy(topic_label))

        topic_docs[topic] = (pseudo_docs, pseudo_labels)

    return topic_docs

In [None]:
# For every class, generate 1000 pseudo documents of 1000 sampled word indices each,
# with weight 0.3 on the background distribution. No random seed is set in the
# visible cells, so the sampled documents may differ between runs.
pseudo_docs = generateLabelledPseudoDocuments(alpha = 0.3, doc_length = 1000, num_docs = 1000)
#pseudo_docs = np.load('pseudo_docs.npy').item()

In [None]:
# def generateTrainingData(pseudo_docs, word_embedding):
#     """ Transform the index of corresponding word into its embeddings
#         Input: pseudo_docs - pseudo_docs dictionary
#                word_embedding - word embedding matrix 
#         Output: a stack of word vectors of all words in pseudo docs
    
#     """
#     training_data  = {}
#     for topic in pseudo_docs.keys():
#         pseudo_docs_wv = list(map(lambda x: word_embedding[x, :], pseudo_docs[topic][0]))
#         #pseudo_labels = 
#         training_data[topic] = (pseudo_docs_wv, pseudo_docs[topic][1])
#     return training_data

In [None]:
# training_data = generateTrainingData(pseudo_docs, word_embedding)
# # np.save('training_data.npy', dict(training_data)) 