# Book recommendations based on similarity using Word2Vec model

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from scipy.stats import itemfreq
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#set display width
pd.set_option('display.max_colwidth', -1)

In [2]:
#Install gensim
#!pip install gensim

### Understanding word embeddings in word2vec

In [3]:
#Keyedvectors.load_word2vec_format is used to load pre-trained word2vec models
from gensim.models.keyedvectors import KeyedVectors
w2vec = KeyedVectors.load_word2vec_format("glove.6B.50d_orig.txt", binary=False)

In [4]:
#50 legth word embedding vector for the word 'cat'
w2vec.word_vec("cat")

array([ 0.45281 , -0.50108 , -0.53714 , -0.015697,  0.22191 ,  0.54602 ,
       -0.67301 , -0.6891  ,  0.63493 , -0.19726 ,  0.33685 ,  0.7735  ,
        0.90094 ,  0.38488 ,  0.38367 ,  0.2657  , -0.08057 ,  0.61089 ,
       -1.2894  , -0.22313 , -0.61578 ,  0.21697 ,  0.35614 ,  0.44499 ,
        0.60885 , -1.1633  , -1.1579  ,  0.36118 ,  0.10466 , -0.78325 ,
        1.4352  ,  0.18629 , -0.26112 ,  0.83275 , -0.23123 ,  0.32481 ,
        0.14485 , -0.44552 ,  0.33497 , -0.95946 , -0.097479,  0.48138 ,
       -0.43352 ,  0.69455 ,  0.91043 , -0.28173 ,  0.41637 , -1.2609  ,
        0.71278 ,  0.23782 ], dtype=float32)

In [5]:
#50 legth word embedding vector for the word 'dog'
w2vec.word_vec('dog')

array([ 0.11008  , -0.38781  , -0.57615  , -0.27714  ,  0.70521  ,
        0.53994  , -1.0786   , -0.40146  ,  1.1504   , -0.5678   ,
        0.0038977,  0.52878  ,  0.64561  ,  0.47262  ,  0.48549  ,
       -0.18407  ,  0.1801   ,  0.91397  , -1.1979   , -0.5778   ,
       -0.37985  ,  0.33606  ,  0.772    ,  0.75555  ,  0.45506  ,
       -1.7671   , -1.0503   ,  0.42566  ,  0.41893  , -0.68327  ,
        1.5673   ,  0.27685  , -0.61708  ,  0.64638  , -0.076996 ,
        0.37118  ,  0.1308   , -0.45137  ,  0.25398  , -0.74392  ,
       -0.086199 ,  0.24068  , -0.64819  ,  0.83549  ,  1.2502   ,
       -0.51379  ,  0.04224  , -0.88118  ,  0.7158   ,  0.38519  ],
      dtype=float32)

In [6]:
#50 legth word embedding vector can also be obtained by searching the model using key as the word
w2vec['india']

array([-0.20356 , -0.8707  , -0.19172 ,  0.73862 ,  0.18494 ,  0.14926 ,
        0.48079 , -0.21633 ,  0.72753 , -0.36912 ,  0.13397 , -0.1143  ,
       -0.18075 , -0.64683 , -0.18484 ,  0.83575 ,  0.48179 ,  0.76026 ,
       -0.50381 ,  0.80743 ,  1.2195  ,  0.3459  ,  0.22185 ,  0.31335 ,
        1.2066  , -1.8441  ,  0.14064 , -0.99715 , -1.1402  ,  0.32342 ,
        3.2128  ,  0.42708 ,  0.19504 ,  0.80113 ,  0.38555 , -0.12568 ,
       -0.26533 ,  0.055264, -1.1557  ,  0.16836 , -0.82228 ,  0.20394 ,
        0.089235, -0.60125 , -0.032878,  1.3735  , -0.51661 ,  0.29611 ,
        0.23951 , -1.3801  ], dtype=float32)

In [7]:
a = w2vec.word_vec("cat")
b = w2vec.word_vec("dog")
c = w2vec.word_vec("pen")
d = w2vec.word_vec("elephant")

In [8]:
#Find similarity between two words using correlation between word embeddings
from scipy.stats.stats import pearsonr
pearsonr(a,b) #cat and dog are more similar

(0.92137766, 2.47113394399376e-21)

In [9]:
#Find similarity between two words using correlation between word embeddings
pearsonr(a,c), pearsonr(b,c), pearsonr(a,d), pearsonr(b,d), pearsonr(c,d) #all these are less similar

((0.5345575, 6.366049389778326e-05),
 (0.50545216, 0.00018109233218871945),
 (0.73933697, 8.67745931811567e-10),
 (0.7144916, 5.638778596712443e-09),
 (0.27897468, 0.049774298949480114))

In [10]:
# Most similar words to a given word using model.most_similar()
w2vec.most_similar("cat")

[('dog', 0.9218006134033203),
 ('rabbit', 0.8487820625305176),
 ('monkey', 0.8041081428527832),
 ('rat', 0.7891963124275208),
 ('cats', 0.7865270376205444),
 ('snake', 0.7798910737037659),
 ('dogs', 0.7795814871788025),
 ('pet', 0.7792249917984009),
 ('mouse', 0.7731667757034302),
 ('bite', 0.7728800773620605)]

In [11]:
# word which is not similar among a group of words
w2vec.doesnt_match("pen car pencil ink".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'car'

In [12]:
# similar_by_word is same as most_similar
w2vec.similar_by_word("python")

[('reticulated', 0.6916365027427673),
 ('spamalot', 0.6635735630989075),
 ('php', 0.6414496898651123),
 ('owl', 0.6301496028900146),
 ('mouse', 0.6275478005409241),
 ('reticulatus', 0.6274471282958984),
 ('perl', 0.6267575621604919),
 ('monkey', 0.620721161365509),
 ('monty', 0.6079354286193848),
 ('scripting', 0.6041731834411621)]

In [13]:
# similarity method gives the cosine similarity between two words
w2vec.similarity('apple', 'mango')

0.61270845

In [14]:
#check if a word is present in the word2vec model using model name
word='Madras'
if word in w2vec:
    print('{0} is in the model'.format(word))
else:
    print('{0} is NOT in the model'.format(word))

Madras is NOT in the model


In [15]:
#check if a word is present in the word2vec model using model.vocab
word='Bhagyanagar'
if word in w2vec.vocab:
    print('{0} is in the model'.format(word))
else:
    print('{0} is NOT in the model'.format(word))

Bhagyanagar is NOT in the model


### Finding sentence vectors from word vectors

In [16]:
title1 = "laws of physics"
title2 = "theory of relativity"
title3 = "battle of panipat"

In [17]:
from nltk import word_tokenize

In [18]:
np.array([w2vec.word_vec(x) for x in word_tokenize(title1) if x in w2vec.vocab]).shape

(3, 50)

In [19]:
[w2vec.word_vec(x) for x in word_tokenize(title1) if x in w2vec.vocab ]

[array([-1.1832  , -0.91248 , -1.0874  , -0.52611 , -0.59133 ,  0.90597 ,
         0.23737 , -1.2453  ,  0.31221 , -0.18614 ,  0.047556,  0.66195 ,
         0.034929, -0.1288  ,  0.38821 ,  0.17066 , -0.13575 , -1.0389  ,
         0.75063 , -0.41731 ,  0.71746 , -0.13975 , -0.54515 , -0.12073 ,
        -0.40956 , -2.349   ,  0.065296, -0.82423 ,  0.51494 , -0.13903 ,
         2.6371  , -0.28108 , -1.3046  , -1.2122  , -0.24492 , -0.25433 ,
         0.3531  , -0.6155  , -0.5967  ,  0.52287 , -0.81452 , -0.034609,
         1.631   ,  1.4932  , -0.77162 ,  0.065482, -0.56947 ,  0.19967 ,
         0.82442 , -0.28176 ], dtype=float32),
 array([ 0.70853  ,  0.57088  , -0.4716   ,  0.18048  ,  0.54449  ,
         0.72603  ,  0.18157  , -0.52393  ,  0.10381  , -0.17566  ,
         0.078852 , -0.36216  , -0.11829  , -0.83336  ,  0.11917  ,
        -0.16605  ,  0.061555 , -0.012719 , -0.56623  ,  0.013616 ,
         0.22851  , -0.14396  , -0.067549 , -0.38157  , -0.23698  ,
        -1.7037   , -

In [20]:
np.array([w2vec.word_vec(x) for x in word_tokenize(title1) if x in w2vec.vocab]).mean(axis=0).shape

(50,)

In [21]:
np.array([w2vec.word_vec(x) for x in word_tokenize(title1) if x in w2vec.vocab]).mean(axis=0)

array([-0.41165003,  0.32106668, -0.548472  , -0.14604734, -0.08014334,
        0.57921   ,  0.37322664, -0.91621333, -0.05222334,  0.10800999,
        0.34717265,  0.19584   , -0.19510369, -0.09276666,  0.13078   ,
        0.02793   ,  0.06941167,  0.11746033, -0.28263333, -0.18569799,
        0.6268566 , -0.00835667, -0.02554965, -0.31514   ,  0.05773   ,
       -1.7224334 , -0.505338  , -0.62672   , -0.25312   ,  0.14738333,
        2.6731665 , -0.52106   , -0.79387   , -1.3824034 ,  0.04651667,
       -0.14059454, -0.06331   ,  0.31441465,  0.33561668,  0.6332434 ,
       -0.31546   , -0.08330967,  0.656381  ,  0.799454  , -0.5040233 ,
        0.28109732,  0.25718665,  0.24763334,  0.06121065, -0.43976998],
      dtype=float32)

### Find vector representation for each title by splitting the title into indivdiual words. 
### Find vector for each word and average them. 

In [22]:
title1_vec = np.array([w2vec.word_vec(x) for x in word_tokenize(title1) if x in w2vec.vocab]).mean(axis=0)
title2_vec = np.array([w2vec.word_vec(x) for x in word_tokenize(title2) if x in w2vec.vocab]).mean(axis=0)
title3_vec = np.array([w2vec.word_vec(x) for x in word_tokenize(title3) if x in w2vec.vocab]).mean(axis=0)

In [23]:
print(title1_vec.shape, title2_vec.shape, title3_vec.shape)

(50,) (50,) (50,)


### FInd the pearson correlation between different titles

In [24]:
pearsonr(title1_vec,title2_vec) # title1 and title2 are very similar

(0.8310373, 8.078668933786241e-14)

In [25]:
pearsonr(title1_vec,title3_vec) # title1 and title3 are not similar

(0.49981984, 0.00021934156127459731)

In [26]:
pearsonr(title2_vec,title3_vec) # title2 and title3 are not similar

(0.48349163, 0.0003753143672253442)

### Book Recommendations - Finding  most similiar books

In [27]:
books_data = pd.read_csv('books.csv',encoding="latin")

In [28]:
books_data.shape

(5700, 2)

In [29]:
books_data.columns

Index(['title', 'category'], dtype='object')

In [30]:
books_data.head(10)

Unnamed: 0,title,category
0,"Oral and Maxillofacial Surgery: An Objective-Based Textbook, 2e",Medical Books
1,"Barron's GRE, 21st Edition",Test Preparation
2,George Balanchine: The Ballet Maker (Eminent Lives),Biographies & Memoirs
3,"A Partner in Holiness: Deepening Mindfulness, Practicing Compassion and Enriching Our Lives Through the Wisdom of R. Levi Yitzhak of Berdichev's, Vol. 2 (Institute for Jewish Spirituality)",Religion & Spirituality
4,Construction Scheduling: Principles and Practices (2nd Edition),Arts & Photography
5,"Literature and Its Writers: A Compact Introduction to Fiction, Poetry, and Drama",Literature & Fiction
6,Straight on Till Morning: The Life of Beryl Markham,Engineering & Transportation
7,Diagrammatica: The Path to Feynman Diagrams (Cambridge Lecture Notes in Physics),Science & Math
8,Book of Common Prayer 1979: Large Print edition,Christian Books & Bibles
9,A Handful of Stars,Children's Books


In [31]:
#clean the title column, convert all to lower case and remove non alpha characters

import re
books_data["title_clean"] = [re.sub("[^a-zA-Z ]","",x).lower() for x in books_data["title"]]

In [32]:
books_data[["title","title_clean"]].head(10)

Unnamed: 0,title,title_clean
0,"Oral and Maxillofacial Surgery: An Objective-Based Textbook, 2e",oral and maxillofacial surgery an objectivebased textbook e
1,"Barron's GRE, 21st Edition",barrons gre st edition
2,George Balanchine: The Ballet Maker (Eminent Lives),george balanchine the ballet maker eminent lives
3,"A Partner in Holiness: Deepening Mindfulness, Practicing Compassion and Enriching Our Lives Through the Wisdom of R. Levi Yitzhak of Berdichev's, Vol. 2 (Institute for Jewish Spirituality)",a partner in holiness deepening mindfulness practicing compassion and enriching our lives through the wisdom of r levi yitzhak of berdichevs vol institute for jewish spirituality
4,Construction Scheduling: Principles and Practices (2nd Edition),construction scheduling principles and practices nd edition
5,"Literature and Its Writers: A Compact Introduction to Fiction, Poetry, and Drama",literature and its writers a compact introduction to fiction poetry and drama
6,Straight on Till Morning: The Life of Beryl Markham,straight on till morning the life of beryl markham
7,Diagrammatica: The Path to Feynman Diagrams (Cambridge Lecture Notes in Physics),diagrammatica the path to feynman diagrams cambridge lecture notes in physics
8,Book of Common Prayer 1979: Large Print edition,book of common prayer large print edition
9,A Handful of Stars,a handful of stars


In [33]:
#finding Word Vector representation of all book titles using w2vec

from tqdm import tqdm

title_vec = np.zeros((books_data.shape[0],50))
#title_vec = np.zeros((100,50))
#print(title_vec[0])

for i in tqdm(range(0,books_data.shape[0])):
#for i in tqdm(range(0,100)):

    words = books_data["title_clean"].iloc[i].split(" ")
    #print(words)
    
    words = [x.strip() for x in words]
    #print(words)
    
    ind_word_vecs = [w2vec.word_vec(x) for x in words if x in w2vec.vocab]
    #print(len(ind_word_vecs), len(ind_word_vecs[0]))
    
    title_vec[i] = np.array(ind_word_vecs).mean(axis=0)
    #print(title_vec[i], '\n\n')

  ret = ret.dtype.type(ret / rcount)
100%|███████████████████████████████████████████████████████████████████████████| 5700/5700 [00:00<00:00, 16546.94it/s]


In [34]:
#title-vec contains the word2vec representation of all book titles
#word2vec representation for a title is obtained by finding the mean of word embedding of each word in the title
#title_vec
print(title_vec.shape)

(5700, 50)


In [35]:
# convert any nan values to zeros
title_vec = np.nan_to_num(title_vec)

In [95]:
#cosine_similarity  method is to find the similarity among all the titles in the books data

from sklearn.metrics.pairwise import cosine_similarity

In [37]:
#Cosine similarity between each book title with all other book titles

cosine_sim_titles = cosine_similarity(title_vec)

In [96]:
#shape of cosine_similarity matrix

cosine_sim_titles.shape

(5700, 5700)

In [39]:
#cosine similarity matrix

cosine_sim_titles

array([[1.        , 0.34390155, 0.5948034 , ..., 0.5798488 , 0.57417283,
        0.45775519],
       [0.34390155, 1.        , 0.25448263, ..., 0.17993782, 0.30491366,
        0.31002894],
       [0.5948034 , 0.25448263, 1.        , ..., 0.65772461, 0.59280549,
        0.45846159],
       ...,
       [0.5798488 , 0.17993782, 0.65772461, ..., 1.        , 0.46512767,
        0.49529352],
       [0.57417283, 0.30491366, 0.59280549, ..., 0.46512767, 1.        ,
        0.50004542],
       [0.45775519, 0.31002894, 0.45846159, ..., 0.49529352, 0.50004542,
        1.        ]])

In [112]:
# List the first 5 books

books_data['title_clean'].head()

0    oral and maxillofacial surgery an objectivebased textbook e                                                                                                                         
1    barrons gre st edition                                                                                                                                                              
2    george balanchine the ballet maker eminent lives                                                                                                                                    
3    a partner in holiness deepening mindfulness practicing compassion and enriching our lives through the wisdom of r levi yitzhak of berdichevs vol   institute for jewish spirituality
4    construction scheduling principles and practices nd edition                                                                                                                         
Name: title_clean, dtype: object

#### Most similiar books corresponding to title id 0 - "Oral and Maxillofacial Surgery: An Objective-Based Textbook, 2e".

In [113]:
# get books similiar to a given title.

title_id = 0
books_data['title_clean'].iloc[title_id]

'oral and maxillofacial surgery an objectivebased textbook e'

In [42]:
# get index of top 10 book titles, which are similar to title_id 0

top_n_idx = np.flip(np.argsort(cosine_sim_titles[title_id,]))[0:10]
print(top_n_idx)

[   0 1351  119   68 1108 1192  249 4645 1625 2430]


In [114]:
# Matching books fot title_id 0

books_data['title_clean'].iloc[top_n_idx]

34      shadowrun third edition fpr                                                                                                                                                       
4879    shadowrun th edition                                                                                                                                                              
2795    shadowrun unwired shadowrun catalyst hardcover                                                                                                                                    
4660    rigger  a shadowrun sourcebook                                                                                                                                                    
3956    minecraft pocket edition the minecraft pocket edition essentials handbook guide to minecraft an unofficial minecraft pocket edition handbook  edition minecraft handbook minecraft
274     sixth world almanac shadowrun catalyst hardcover         

#### Most similiar books corresponding to title id 34 - "Shadowrun: Third Edition (FPR25000)". 

In [115]:
# get books similiar to a given title.

title_id = 34
books_data['title_clean'].iloc[title_id]

'shadowrun third edition fpr'

In [45]:
# get index of top 10 book titles, which are similar to title_id 34

top_n_idx = np.flip(np.argsort(cosine_sim_titles[title_id,]))[0:10]
top_n_idx

array([  34, 4879, 4070, 3956, 1857,  274, 5609, 5158, 5098, 3500],
      dtype=int64)

In [116]:
# Matching books fot title_id 34

books_data['title_clean'].iloc[top_n_idx]

34      shadowrun third edition fpr                                                                                                                                                       
4879    shadowrun th edition                                                                                                                                                              
2795    shadowrun unwired shadowrun catalyst hardcover                                                                                                                                    
4660    rigger  a shadowrun sourcebook                                                                                                                                                    
3956    minecraft pocket edition the minecraft pocket edition essentials handbook guide to minecraft an unofficial minecraft pocket edition handbook  edition minecraft handbook minecraft
274     sixth world almanac shadowrun catalyst hardcover         

#### Most similiar books corresponding to title id 1 - "Barron's GRE, 21st Edition". 

In [117]:
# get books similiar to a given title.

title_id = 1
books_data['title_clean'].iloc[title_id]

'barrons gre st edition'

In [48]:
# get index of top 10 book titles, which are similar to title_id 1

top_n_idx = np.flip(np.argsort(cosine_sim_titles[title_id,]))[0:10]
top_n_idx

array([   1, 1790, 4755, 3500, 3297, 4127, 5270, 2176,   86, 2284],
      dtype=int64)

In [118]:
# Matching books fot title_id 1

books_data['title_clean'].iloc[top_n_idx]

34      shadowrun third edition fpr                                                                                                                                                       
4879    shadowrun th edition                                                                                                                                                              
2795    shadowrun unwired shadowrun catalyst hardcover                                                                                                                                    
4660    rigger  a shadowrun sourcebook                                                                                                                                                    
3956    minecraft pocket edition the minecraft pocket edition essentials handbook guide to minecraft an unofficial minecraft pocket edition handbook  edition minecraft handbook minecraft
274     sixth world almanac shadowrun catalyst hardcover         

### Issues with the results obtained using word2vec approach :
- if one of the words in the title is present in another word then even if the titles do not matching the tiles are selected as most similar ones because of high cosine similarity
- also, most frequently occuring words have more cosine similarity value, though they are not importtant
- to overcome this, we use tf-idf weights of each word and then multiply the tf-idf weights with the cosine similarity weights.
- With this method we get a weighted cosine similarity between the titles whcich improves the accuracy of recommendations

### Using TF IDF with word2vec

In [52]:
#TfidfTransformer directly transforms the word tokens to tf-idf weights without counting them using count-vectorizer

from sklearn.feature_extraction.text import TfidfTransformer

In [53]:
# initialize vectorizer

vect = TfidfVectorizer(ngram_range=(1,2),stop_words='english', max_features=5000)

In [54]:
# fit tf-df transformer and transform the tokens to tf-idf weights

vect.fit(books_data['title_clean'])
title_matrix = vect.transform(books_data['title_clean'])

In [55]:
#title_matrix contains the tfidf weights of all the book titles

title_matrix = title_matrix.toarray()

In [57]:
#shape of title_matrix , no of cols = no of features, no of rows = no of titles

title_matrix.shape

(5700, 5000)

In [59]:
#title_matrix is very saparse

title_matrix[0:3]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [60]:
# get tf idf weights for title_id 34

idx = 34
print(np.where(title_matrix[idx,:] > 0)[0])

features = np.where(title_matrix[idx,:] > 0)[0]

[1290 4110]


In [65]:
# indices of tf-idf weights which have wt greater than 0 for title_id 34

print(features)

[1290 4110]


In [68]:
# words which have wts greater than 0 for title_id 34

print(vect.get_feature_names()[1290], vect.get_feature_names()[4110])

edition shadowrun


In [69]:
# get the words which have wts greater than 0 for title_id 34

feature_names = [vect.get_feature_names()[x] for x in features]
print(feature_names)

['edition', 'shadowrun']


#### Find the tf-idf weights of the 2 words. and check which word has a higher weightage

In [70]:
# tfidf wt of the word 'edition'.
# Though the word edition is frequently occuring it's wt is less

title_matrix[34,1290]

0.4337506779996424

In [72]:
# tfidf wt of the word 'shadowrun'.
#Though the word shadow is less frequent,it's wt is more compared to edition, which is more frequent

title_matrix[34,4110]

0.901032934656026

In [73]:
# find the tf-idf weights of all the words in the features

print(np.array([title_matrix[idx,x] for x in features]).shape)
print(np.array([title_matrix[idx,x] for x in features]))

(2,)
[0.43375068 0.90103293]


In [75]:
# convert the feature weights matrix into a 2d matrix by adding a column using newaxis
feature_weights = np.array([title_matrix[idx,x] for x in features])[:,np.newaxis]
print(feature_weights.shape)
print(feature_weights)

(2, 1)
[[0.43375068]
 [0.90103293]]


In [78]:
# Finding the word2vec vector representations of the words 'edition and 'shadowrun'
# if there there is no word2vec representation for a word then add a 50 length vector embedding with zeros
# this is done to make the sahape compatiple for multipication with tf-idf weights vector

word_vecs = np.array([w2vec.word_vec(x) if x in w2vec.vocab else np.zeros(50) for x in feature_names])
print(word_vecs.shape)
print(word_vecs)

(2, 50)
[[-7.1063e-01  4.0502e-01 -5.9437e-01 -1.0819e-01  9.8476e-02 -7.5128e-01
  -1.2651e+00 -1.3880e+00  5.3070e-01 -2.6628e-01  1.0393e-01  1.0034e-02
  -4.0718e-01  5.2769e-02  1.5388e+00 -8.8403e-01 -1.2984e+00 -2.3950e-01
  -9.2125e-01  2.4364e-01  6.9175e-01 -8.5089e-01  3.8458e-01  6.3480e-01
  -1.0589e-03 -3.8813e-01 -1.7763e+00 -4.4471e-02 -9.1733e-01  1.4402e-01
   2.3725e+00 -9.8377e-01 -3.1709e-01  4.9419e-01 -3.3921e-01  3.3320e-01
   1.2157e+00  5.1124e-01 -1.0985e+00 -9.3145e-01  1.1839e+00 -8.1248e-01
  -6.0077e-01 -8.1775e-01 -6.1546e-01  1.0617e+00  7.2965e-01  4.2010e-03
   1.7508e-02 -1.3483e-02]
 [-9.3703e-01 -1.1261e+00 -5.4092e-01  9.8129e-01 -1.3505e-01 -9.2468e-01
   3.2807e-01 -4.0589e-01  7.9908e-01  7.7101e-01 -1.0707e-01  9.5938e-01
   1.2107e-01 -6.5970e-01  6.0097e-01 -7.5536e-01 -5.0945e-02  1.9605e-01
  -4.8701e-01  7.1163e-01  1.4351e-01 -5.2897e-01  2.6948e-01  3.7200e-02
   6.2989e-01  1.3930e+00 -2.8734e-01 -1.2098e-03  8.9359e-01 -8.3085e-01
  -

In [80]:
# multiplying tf idf weights with word2vec

res = word_vecs*feature_weights
print(res.shape)
print(res)

(2, 50)
[[-3.08236244e-01  1.75677699e-01 -2.57808394e-01 -4.69274859e-02
   4.27140318e-02 -3.25868214e-01 -5.48737984e-01 -6.02045946e-01
   2.30191497e-01 -1.15499129e-01  4.50797064e-02  4.35225412e-03
  -1.76614606e-01  2.28885902e-02  6.67455544e-01 -3.83448605e-01
  -5.63181900e-01 -1.03883288e-01 -3.99592806e-01  1.05679018e-01
   3.00047027e-01 -3.69074106e-01  1.66811830e-01  2.75344938e-01
  -4.59298606e-04 -1.68351655e-01 -7.70471309e-01 -1.92893261e-02
  -3.97892521e-01  6.24687753e-02  1.02907346e+00 -4.26710910e-01
  -1.37538005e-01  2.14355251e-01 -1.47132569e-01  1.44525729e-01
   5.27310712e-01  2.21750699e-01 -4.76475126e-01 -4.04017073e-01
   5.13517427e-01 -3.52413739e-01 -2.60584393e-01 -3.54699607e-01
  -2.66956183e-01  4.60513089e-01  3.16486191e-01  1.82218664e-03
   7.59410693e-03 -5.84826039e-03]
 [-8.44294906e-01 -1.01465314e+00 -4.87386752e-01  8.84174593e-01
  -1.21684497e-01 -8.33167129e-01  2.95601888e-01 -3.65720247e-01
   7.19997410e-01  6.94705386e-01

In [81]:
#find mean of the weighted word2vec embeddings

res_mean = res.mean(axis=0)
print(res_mean.shape)
print(res_mean)

(50,)
[-0.57626558 -0.41948772 -0.37259757  0.41862355 -0.03948523 -0.57951767
 -0.12656805 -0.4838831   0.47509445  0.28960313 -0.02569694  0.4343926
 -0.03376328 -0.28576141  0.60447464 -0.53202642 -0.30454251  0.03638211
 -0.41920243  0.37344054  0.21467713 -0.42284675  0.20481109  0.15443168
  0.28354618  0.54339361 -0.51468705 -0.0101897   0.20363074 -0.34307722
  0.30482132 -0.2662551   0.27964693  0.16325792 -0.33705536  0.46035577
  0.13280285 -0.47529163 -0.74186993 -0.4624566   0.19922776 -0.60777461
 -0.4135409  -0.15019222 -0.34977105  0.08852407  0.55739619  0.37920976
  0.22442397  0.03506882]


In [83]:
#Function to get tf-idf weights, word2vec weights , multipy them and get mean of weighted word2vec embedings
# get feature_weigths
# get word2vec embeddings
# multipy feature_weigths with word2vec embeddings
# return mean of weighted word2vec embedings

def get_weighted_vectors(idx):
    
    features = np.where(title_matrix[idx,:] > 0)[0]
    feature_names = [vect.get_feature_names()[x] for x in features]
    feature_weights = np.array([title_matrix[idx,x] for x in features])[:,np.newaxis]
    word_vecs = np.array([w2vec.word_vec(x) if x in w2vec.vocab else np.zeros(50) for x in feature_names])
    res = word_vecs*feature_weights
    return res.mean(axis=0)   

In [84]:
#finding combined tfidf and Word Vector representation of all book titles using w2vec

from tqdm import tqdm

title_vec_weighted = np.zeros((books_data.shape[0],50))

for i in tqdm(range(0,books_data.shape[0])):
    vec = get_weighted_vectors(i)
    
    if vec.shape[0] == 0:
        title_vec_weighted[i] = np.zeros(50)
    else:
        title_vec_weighted[i] = vec

  
100%|██████████████████████████████████████████████████████████████████████████████| 5700/5700 [01:17<00:00, 91.20it/s]


### Using the weighted vectors obtained by multiplying tf-idf weights with word2vec embeddings, find the top 10 most similiar books to a given book title

In [87]:
#shape of array of weighted vectors
title_vec_weighted.shape

(5700, 50)

In [88]:
#each weighted vector has 50 embeddings which are weighted
title_vec_weighted[0]

array([ 0.0407199 ,  0.1032344 , -0.64566988, -0.16534801, -0.32719557,
        0.60530042,  0.06560228, -0.09389795,  0.29427193,  0.28394603,
        0.1678797 , -0.01078232,  0.1588914 ,  0.18683338,  0.12878867,
        0.09225618, -0.65087725, -0.29770656,  0.06181822,  0.38010894,
       -0.04204222,  0.23314077,  0.16140571, -0.23818121,  0.03380487,
       -0.57586438, -0.22967646, -0.21102348, -0.19878487, -0.161578  ,
        1.15137127, -0.17780568, -0.12023563, -0.1977959 ,  0.07819631,
        0.27455632,  0.46582114,  0.57829055,  0.3995944 , -0.00779139,
       -0.01966397, -0.04205391, -0.1061834 ,  0.66156778,  0.23881343,
        0.18094338,  0.52741263,  0.40848348, -0.33696185,  0.19548229])

In [90]:
#replace any nan values with zeros

title_vec = np.nan_to_num(title_vec_weighted)

In [98]:
#cosine_similarity  method is to find the similarity among all the titles in the books data

from sklearn.metrics.pairwise import cosine_similarity

In [99]:
#Consine similarity between each book title with all other book titles

cosine_sim_titles = cosine_similarity(title_vec)

In [100]:
#shape of cosine_similarity matrix

cosine_sim_titles.shape

(5700, 5700)

In [101]:
#cosine similarity matrix

cosine_sim_titles

array([[ 1.        ,  0.21505431,  0.34266771, ...,  0.18887298,
         0.45720803,  0.11730321],
       [ 0.21505431,  1.        ,  0.12757366, ..., -0.16287118,
         0.07269368,  0.15967793],
       [ 0.34266771,  0.12757366,  1.        , ...,  0.3067786 ,
         0.61461677,  0.16725654],
       ...,
       [ 0.18887298, -0.16287118,  0.3067786 , ...,  1.        ,
         0.32518472,  0.09858321],
       [ 0.45720803,  0.07269368,  0.61461677, ...,  0.32518472,
         1.        ,  0.33065311],
       [ 0.11730321,  0.15967793,  0.16725654, ...,  0.09858321,
         0.33065311,  1.        ]])

In [119]:
#list the first 5 books

books_data['title_clean'].head()

0    oral and maxillofacial surgery an objectivebased textbook e                                                                                                                         
1    barrons gre st edition                                                                                                                                                              
2    george balanchine the ballet maker eminent lives                                                                                                                                    
3    a partner in holiness deepening mindfulness practicing compassion and enriching our lives through the wisdom of r levi yitzhak of berdichevs vol   institute for jewish spirituality
4    construction scheduling principles and practices nd edition                                                                                                                         
Name: title_clean, dtype: object

### find the most similiar books to "Oral and Maxillofacial Surgery: An Objective-Based Textbook, 2e"

In [120]:
# get books similiar to a given title..

title_id = 0
books_data['title_clean'].iloc[title_id]

'oral and maxillofacial surgery an objectivebased textbook e'

In [104]:
# get index of top 10 book titles, which are similar to title_id 0

top_n_idx = np.flip(np.argsort(cosine_sim_titles[title_id,]))[0:10]
top_n_idx

array([   0, 1108, 1351, 4645,  249, 1192, 1835,  337,  119, 2430],
      dtype=int64)

In [121]:
# Matching books fot title_id 0

books_data['title_clean'].iloc[top_n_idx]

34      shadowrun third edition fpr                                                                                                                                                       
4879    shadowrun th edition                                                                                                                                                              
2795    shadowrun unwired shadowrun catalyst hardcover                                                                                                                                    
4660    rigger  a shadowrun sourcebook                                                                                                                                                    
3956    minecraft pocket edition the minecraft pocket edition essentials handbook guide to minecraft an unofficial minecraft pocket edition handbook  edition minecraft handbook minecraft
274     sixth world almanac shadowrun catalyst hardcover         

### find the most similiar books to "Barron's GRE, 21st Edition"

In [122]:
# get books similiar to a given title..

title_id = 1
books_data['title_clean'].iloc[title_id]

'barrons gre st edition'

In [107]:
# get index of top 10 book titles, which are similar to title_id 1

top_n_idx = np.flip(np.argsort(cosine_sim_titles[title_id,]))[0:10]
top_n_idx

array([   1, 4755, 1417, 1790,  168, 5029, 5270, 2284, 5042, 3249],
      dtype=int64)

In [123]:
# Matching books fot title_id 1

books_data['title_clean'].iloc[top_n_idx]

34      shadowrun third edition fpr                                                                                                                                                       
4879    shadowrun th edition                                                                                                                                                              
2795    shadowrun unwired shadowrun catalyst hardcover                                                                                                                                    
4660    rigger  a shadowrun sourcebook                                                                                                                                                    
3956    minecraft pocket edition the minecraft pocket edition essentials handbook guide to minecraft an unofficial minecraft pocket edition handbook  edition minecraft handbook minecraft
274     sixth world almanac shadowrun catalyst hardcover         

### find the most similiar books to "Shadowrun: Third Edition (FPR25000)"

In [124]:
# get books similiar to a given title..

title_id = 34
books_data['title_clean'].iloc[title_id]

'shadowrun third edition fpr'

In [110]:
# get index of top 10 book titles, which are similar to title_id 34

top_n_idx = np.flip(np.argsort(cosine_sim_titles[title_id,]))[0:10]
top_n_idx

array([  34, 4879, 2795, 4660, 3956,  274, 4380, 2082, 5167, 3985],
      dtype=int64)

In [125]:
# Matching books fot title_id 34

books_data['title_clean'].iloc[top_n_idx]

34      shadowrun third edition fpr                                                                                                                                                       
4879    shadowrun th edition                                                                                                                                                              
2795    shadowrun unwired shadowrun catalyst hardcover                                                                                                                                    
4660    rigger  a shadowrun sourcebook                                                                                                                                                    
3956    minecraft pocket edition the minecraft pocket edition essentials handbook guide to minecraft an unofficial minecraft pocket edition handbook  edition minecraft handbook minecraft
274     sixth world almanac shadowrun catalyst hardcover         

### A word2vec embedding weighted by tf-idf weights is giving better results than using only a simple word2vec embedding