In [20]:
import pandas as pd 
import numpy as np 
import copy

from itertools import product

## Compute the cosine similarity between each pair of movies

In [3]:
# Load the saved array from ./output/reduced_tfidf.npy
# (presumably a dimensionality-reduced TF-IDF matrix with one row per movie -- confirm).
tfidf = np.load('./output/reduced_tfidf.npy')
# Trailing expression: display the (rows, columns) shape as the cell output.
tfidf.shape

(8230, 100)

In [38]:
def cosine_similarity(A):
    """Return the pairwise cosine similarity between the rows of ``A``.

    Parameters
    ----------
    A : numpy.ndarray
        2-D array; each row is treated as one vector.

    Returns
    -------
    numpy.ndarray
        Square array ``S`` with ``S[i, j] = (A[i] . A[j]) / (|A[i]| * |A[j]|)``.
        Any pair involving a zero-norm row gets similarity ``0`` rather than
        the ``NaN`` that a plain ``0 / 0`` division would produce.
    """
    # Dot product of every row with every other row (Gram matrix).
    gram = np.dot(A, A.T)
    # Euclidean norm of each row.
    norms = np.sqrt(np.sum(A**2, axis=1))
    # Outer product of the norms: denom[i, j] = |A[i]| * |A[j]|.
    denom = norms[:, np.newaxis] * norms[np.newaxis, :]

    # Zero-norm rows make the denominator 0; silence the resulting warning
    # and overwrite those entries with 0 afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity = gram / denom
    similarity[denom == 0] = 0.0
    return similarity

In [47]:
# Benchmark the NumPy implementation on the loaded tfidf matrix.
%timeit cosine_similarity(tfidf)

723 ms ± 119 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Pairwise cosine similarity between all rows of tfidf.
similarity_matrix = cosine_similarity(tfidf)
# Trailing expression: display the shape of the result as the cell output.
similarity_matrix.shape

(8230, 8230)

In [13]:
# Persist the similarity matrix; np.save appends ".npy" when the name has no
# such extension, so the file written is ./output/similarity_matrix.npy.
np.save('./output/similarity_matrix', similarity_matrix)

In [10]:
# Sanity check: show only the strictly positive similarities in row 0
# (the boolean mask drops zeros, negatives and NaNs).
similarity_matrix[0][similarity_matrix[0] > 0]

array([1.        , 0.00944507, 0.01010369, ..., 0.01661981, 0.01631498,
       0.02299849])

## Cython

In [4]:
# Load the Cython IPython extension, which provides the %%cython cell magic.
%load_ext Cython

In [48]:
%%cython 
from math import sqrt
import numpy as np
cimport numpy as np
def cosine_similarity_cython(np.ndarray[float, ndim=2] A):
    """Return the pairwise cosine similarity between the rows of ``A``.

    All of the arithmetic is delegated to NumPy calls; the only
    Cython-specific part is the typed buffer argument.

    Parameters
    ----------
    A : 2-D numpy.ndarray
        Each row is treated as one vector.
        NOTE(review): a ``float`` buffer dtype in Cython is C single
        precision, so this presumably requires a float32 array -- confirm.

    Returns
    -------
    numpy.ndarray
        Square array ``S`` with ``S[i, j] = (A[i] . A[j]) / (|A[i]| * |A[j]|)``.
        Any pair involving a zero-norm row gets similarity ``0`` rather than
        the ``NaN`` that a plain ``0 / 0`` division would produce.
    """
    # Bind the results directly: pre-allocating zero-filled arrays that are
    # rebound immediately would only waste an N x N allocation.
    cdef np.ndarray gram = np.dot(A, A.T)
    cdef np.ndarray norms = np.sqrt(np.sum(A**2, axis=1))

    # Outer product of the row norms: denom[i, j] = |A[i]| * |A[j]|.
    denom = norms[:, np.newaxis] * norms[np.newaxis, :]

    # Zero-norm rows make the denominator 0; silence the resulting warning
    # and overwrite those entries with 0 afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity = gram / denom
    similarity[denom == 0] = 0.0
    return similarity

In [50]:
# Benchmark the Cython-compiled implementation on the same tfidf matrix.
%timeit cosine_similarity_cython(tfidf)

646 ms ± 139 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
