# Compute Similarity Matrix

Aim: see if I can compute the similarity matrix on my computer or not, find alternatieves.  

In [1]:
# general:
import numpy as np
import pandas as pd

import time # will be use to choose the faster solution

# NLP:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import word2vec


In [2]:
# load data:
filepath = '../data/descriptions_tokenized.csv'
descriptions = pd.read_csv(filepath)
descriptions = descriptions.iloc[:,1] # the first is missing, we do not care for now.

In [3]:
# train the word2vec model on the wine descriptions corpus:
time0 = time.time()

custom_wine_word2vec_model = word2vec.Word2Vec(
    descriptions,
    workers=8,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=2,   # Minimum word count threshold. ! need to be the same as for tokenize_and_clean
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('Done! It took ', time.time()-time0, 'sec.')

Done! It took  52.47818994522095 sec.


In [4]:
# We average the word vectors to get one vector per description.
time0 = time.time()
descr_vectors = [None] * descriptions.shape[0]
for i, desc in enumerate(descriptions):
    # average vector for each word to get 1 vector per description:
    descr_vectors[i] = np.mean([custom_wine_word2vec_model.wv[word] for word in descriptions[0]], axis=0) # axis needed to keep 300 dim.
print('done in sec: ', time.time()-time0)

done in sec:  43.875948905944824


In [5]:
# create features names
features_names = list(range(0, len(descr_vectors[0])))
features_names = ['f'+str(i) for i in features_names]

time0 = time.time()

# create a dataframe with features (word2vec vectors) as columns and wine indexes as index
features = pd.DataFrame(descr_vectors, columns=features_names, index=descriptions.index)

print('done in sec: ', time.time()-time0)

done in sec:  17.984060049057007


In [6]:
filepath = '../data/features_custom_word2vec.csv'
features.to_csv(path_or_buf=filepath, header=True, index=False)

In [7]:
features = pd.read_csv(filepath)

In [9]:
features

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f290,f291,f292,f293,f294,f295,f296,f297,f298,f299
0,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
1,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
2,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
3,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
4,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141611,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
141612,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
141613,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195
141614,0.688963,0.1221,0.385653,0.175303,-0.616101,-0.521223,-0.638454,-0.639173,-0.090657,0.095035,...,-0.009683,0.325313,-0.027015,-0.10059,-0.528783,-0.216132,-0.163501,-0.056147,0.386059,-0.232195


In [None]:
time0 = time.time()

# generating the cosine similarity matrix
cos_sim_mat = cosine_similarity(features) # 141617X141617

# dataframe with the similarity matrix, and have the index of the wines as column names and indexes:
df_cos_sim_mat = pd.DataFrame(cos_sim_mat, columns=descriptions.index, index=descriptions.index)
 
print('done in sec: ', time.time()-time0)

In [10]:
from scipy import sparse

A_sparse = sparse.csr_matrix(features)

similarities = cosine_similarity(A_sparse)
print('pairwise dense output:\n {}\n'.format(similarities))

# #also can output sparse matrices
# similarities_sparse = cosine_similarity(A_sparse,dense_output=False)
# print('pairwise sparse output:\n {}\n'.format(similarities_sparse))