# Error Analysis based on Cosine Similarity using an embeddings model
## Konstantina Andronikou

### Any pre-trained embeddings model can be used for this analysis. If you would like to create your own model based on a specific type of data, please have a look at the create_embeddings.ipynb notebook.

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import scipy

In [None]:
# Load a stored embeddings model from disk into `embedded_model`, the
# module-level object used by every similarity computation below.
# NOTE(review): `binary` is not passed, so gensim's default applies -
# presumably the file is in the plain-text word2vec format; confirm.
embedded_model = KeyedVectors.load_word2vec_format('data/embedding model')

In [None]:
# Quick sanity check of the loaded model: the size of each word vector and
# the number of entries in the model's key-to-index mapping.
print('Vector size =', embedded_model.vector_size)
print('Vocabulary size =', len(embedded_model.key_to_index))

In [None]:
# We can check the similarity between two words, for example:
# NOTE(review): "word1" and "word2" look like placeholders - presumably they
# must be replaced by words that exist in the model's vocabulary.
print(embedded_model.similarity("word1", "word2"))

### This error analysis will evaluate words in two different ways: Overall cosine similarity and pairwise cosine similarity

In [None]:
# This function takes all possible word pairs within a topic and calculates the overall similarity score
def cosine_similarity(keyword_list):
    """ Calculates the overall (mean) cosine similarity of the keywords of one topic.

        Every ordered pair of non-identical words in keyword_list is scored with
        the module-level embedded_model and the scores are averaged.

        Argument: keyword_list (this refers to the output of the topic model,
                  i.e. the list of words of a single topic)
        return: a float representing the overall score of cosine similarity, or
                NaN when the list yields no pair of distinct words (empty list,
                a single word, or only repetitions of one word)

        NOTE(review): a word missing from the model vocabulary presumably makes
        embedded_model.similarity raise KeyError - confirm against gensim.
    """
    scores = [
        embedded_model.similarity(other, word)
        for word in keyword_list
        for other in keyword_list
        if other != word
    ]
    if not scores:
        # No distinct pair to compare: the mean is undefined, so report NaN
        # instead of failing with a ZeroDivisionError.
        return float("nan")
    return sum(scores) / len(scores)

In [None]:
# The function is called with a list holding the output of the topic model
# for one topic - here at token level. For example:
# NOTE(review): 'word1', 'word2' and 'word3' look like placeholders -
# presumably to be replaced by the actual keywords of a topic.
topic0 = ['word1','word2','word3']
cosine_similarity(topic0)

### The following step is to get a better idea of the content generated within a topic.

In [None]:
# This function takes all possible word pairs within a topic and calculates their similarity
def pairwise_cosine_similarity(keyword_list):
    """ Scores every ordered pair of non-identical words of one topic.

        Arguments: keyword_list (this refers to the output of the topic model)
        return: list with one cosine similarity score per ordered word pair, in
                iteration order of keyword_list (outer word first, then every
                other word); pairs of equal words are skipped
    """
    pair_scores = []
    for outer_word in keyword_list:
        for inner_word in keyword_list:
            # A word is never compared with itself (or with a repeat of itself).
            if inner_word == outer_word:
                continue
            pair_scores.append(embedded_model.similarity(inner_word, outer_word))
    return pair_scores

In [None]:
# Score each word pair of the example topic individually instead of averaging.
pairwise_cosine_similarity(topic0)

## End of Notebook