In [2]:
import numpy as np

<img src="cosine-similarity.png" alt="Cosine similarity" width="500"/>

In [1]:
def cosine_sim(x, y):
    """Calculate the cosine similarity between two vectors.

    The similarity is ``dot(x, y) / (||x|| * ||y||)`` using the Euclidean
    norm. When the product of the two norms is zero (an all-zero or empty
    vector, or components so small that the norm underflows to 0.0) the
    quotient is undefined, so ``-inf`` is returned instead. ``-inf`` compares
    lower than any finite similarity.

    Parameters
    ----------
    x : numpy.ndarray
        vector representation (of query)
    y : numpy.ndarray
        vector representation (of document)

    Returns
    -------
    cosine_sim : numpy.float64 or float
        cosine similarity between vector x and y, or ``float('-inf')`` when
        the product of the norms is zero
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)

    # Guard on the computed norms rather than on "has a non-zero entry":
    # entries that are non-zero but tiny can still produce a norm of exactly
    # 0.0, which would otherwise become a division by zero (inf / nan result).
    if norm_product == 0:
        return float('-inf')

    return np.dot(x, y) / norm_product

In [4]:
# Quick check on two 2-D vectors; the cell displays their cosine similarity.
v1, v2 = np.asarray([1, 2]), np.asarray([1, 3])
cosine_sim(v1, v2)

0.9899494936611664

In [5]:
def get_k_relevant(k, query, D):
    """Rank the corpus against a query and keep the k best matches.

    Every document in ``D`` is scored with ``cosine_sim(query, document)``
    and the scores are sorted from highest to lowest. Documents with equal
    scores keep their original corpus order.

    Parameters
    ----------
    k : int
        number of documents to retrieve (top k); ``0`` means "retrieve all"
    query : numpy.ndarray
        vector representation of the query to compare against the corpus
    D : list of numpy.ndarray
        vector representation of all documents in corpus

    Returns
    -------
    ranked_sims : list of tuples (cosine similarity, index of document)
        the first k (similarity, document index) pairs in descending order
        of similarity, or all pairs when ``k`` is 0
    """
    scored = [(cosine_sim(query, doc), idx) for idx, doc in enumerate(D)]

    # Sort on the score only, so ties are not reordered by document index.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # k == 0 is the sentinel for "no cut-off".
    if k == 0:
        return scored

    return scored[:k]

In [9]:
def get_over_thresh(thresh, query, D):
    """Rank the corpus against a query and keep documents at or above a threshold.

    Every document in ``D`` is scored with ``cosine_sim(query, document)``
    and the scores are sorted from highest to lowest. Documents with equal
    scores keep their original corpus order.

    Parameters
    ----------
    thresh : numpy.float64
        minimum similarity (inclusive) that returned documents should have;
        the value ``1`` is a sentinel that disables filtering, so all
        documents are returned
    query : numpy.ndarray
        vector representation of the query to compare against the corpus
    D : list of numpy.ndarray
        vector representation of all documents in corpus

    Returns
    -------
    ranked_sims : list of tuples (cosine similarity, index of document)
        (similarity, document index) pairs whose similarity is greater than
        or equal to ``thresh``, in descending order of similarity; all pairs
        when ``thresh`` is 1
    """
    scored = [(cosine_sim(query, doc), idx) for idx, doc in enumerate(D)]

    # Sort on the score only, so ties are not reordered by document index.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # thresh == 1 is the sentinel for "return the full ranking".
    if thresh == 1:
        return scored

    return [pair for pair in scored if pair[0] >= thresh]

In [10]:
# Demo corpus: the two vectors from the earlier cell plus one more.
v3 = np.asarray([2, 3])
D = [v1, v2, v3]
query = np.asarray([3, 4])
thresh = 0.95

# k=2 keeps the two best matches; k=0 returns the full ranking.
print("get 2 top: ", get_k_relevant(2, query, D))
print("get all: ", get_k_relevant(0, query, D))

# Only documents whose similarity is at least `thresh` are kept.
print("threshold ", thresh, get_over_thresh(thresh, query, D))

get 2 top:  [(0.9984603532054125, 2), (0.9838699100999074, 0)]
get all:  [(0.9984603532054125, 2), (0.9838699100999074, 0), (0.9486832980505138, 1)]
threshold  0.95 [(0.9984603532054125, 2), (0.9838699100999074, 0)]
