# Recommendations via product-based content filtering: finding the similarity between item vectors, i.e. cosine similarity

"""
Created on Fri Sep 2018

@author: Firoz Subair
"""

Approach

Singular-Value Decomposition
The Singular-Value Decomposition, or SVD for short, is a matrix decomposition method for reducing a matrix to its constituent 
parts in order to make certain subsequent matrix calculations simpler.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')
from surprise import SVD, NMF, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from IPython.display import display, HTML, Markdown

    

In [3]:
# Load the book catalogue. `error_bad_lines=False` was deprecated in pandas 1.3
# and removed in 2.0; `on_bad_lines='warn'` is its equivalent (malformed rows
# are skipped and reported instead of aborting the load).
books = pd.read_csv('Books.csv', sep=',', on_bad_lines='warn', encoding="latin-1")

In [4]:
# Discard the first data row and keep everything after it.
# NOTE(review): presumably that row is a stray header/placeholder record --
# confirm against Books.csv.
books = books.iloc[1:]

In [5]:
# Give the catalogue columns the names used by the rest of the notebook
# (`book_id` is the join key against the rating events).
books.columns = [
    'Id',
    'book_id',
    'bookTitle',
    'bookAuthor',
    'yearOfPublication',
    'publisher',
    'urlId',
]

In [6]:
# Load the user table. `error_bad_lines=False` was deprecated in pandas 1.3 and
# removed in 2.0; `on_bad_lines='warn'` is its equivalent (malformed rows are
# skipped and reported instead of aborting the load).
users = pd.read_csv('Users.csv', sep=',', on_bad_lines='warn', encoding="latin-1")
# Rename the columns positionally to the names used in this notebook.
users.columns = ['Id','userID', 'Location', 'Age']

In [7]:
# Load the user/book interaction events. `error_bad_lines=False` was deprecated
# in pandas 1.3 and removed in 2.0; `on_bad_lines='warn'` is its equivalent
# (malformed rows are skipped and reported instead of aborting the load).
ratings = pd.read_csv('UserEvents.csv', sep=',', on_bad_lines='warn', encoding="latin-1")
# Rename the columns positionally; `userID` and `book_id` match the keys used
# by the users and books tables.
ratings.columns = ['ratingId','userID', 'book_id', 'bookRating']

In [10]:
# Store the event column as a pandas categorical.
ratings['bookRating'] = ratings['bookRating'].astype('category')

In [11]:
# Explicit event-name -> numeric score mapping (0 through 5), used instead of
# the automatically assigned category codes.
ratingDic = {
    event: score
    for score, event in enumerate(
        ("dislike", "view", "interact", "like", "add to cart", "checkout")
    )
}

In [12]:
# Swap each event name for its numeric score from `ratingDic`.
ratings['bookRating'] = ratings['bookRating'].replace(ratingDic)
# Preview the first rows (notebook display).
ratings.head()

Unnamed: 0,ratingId,userID,book_id,bookRating
0,523113,126736,843946806,0
1,861298,208406,345353145,3
2,37104,8890,2020213508,4
3,328497,78553,451402383,4
4,121368,27875,307129659,4


In [201]:
# plt.rc("font", size=15)
# ratings.bookRating.value_counts(sort=False).plot(kind='bar')
# plt.title('Rating Distribution\n')
# plt.xlabel('Rating')
# plt.ylabel('Count')
# plt.savefig('system1.png', bbox_inches='tight')
# plt.show()

In [202]:
# books.head()

In [203]:
# users.head()

In [204]:
# users.Age.hist(bins=[0, 10, 20, 30, 40, 50, 100])
# plt.title('Age Distribution\n')
# plt.xlabel('Age')
# plt.ylabel('Count')
# plt.savefig('system2.png', bbox_inches='tight')
# plt.show()

In [36]:
# Attach the catalogue record to every rating event via the shared `book_id`.
combine_book_rating = ratings.merge(books, on="book_id")

# Remove the columns that play no part in the recommender.
columns = ['ratingId','Id','bookAuthor','yearOfPublication','publisher','urlId','book_id']
combine_book_rating = combine_book_rating.drop(columns=columns)

# Fix the column order to (user, item, rating) and switch to the shorter
# names used from here on.
combine_book_rating = combine_book_rating[['userID','bookTitle','bookRating']].rename(
    columns={'userID': 'user_id', 'bookTitle': 'title', 'bookRating': 'rating'}
)
combine_book_rating.head()

Unnamed: 0,user_id,title,rating
0,126736,Sweet Revenge,0
1,208406,Sphere,3
2,211231,Sphere,4
3,16538,Sphere,4
4,232449,Sphere,4


In [27]:
# Ratings run from 0 to 5 (the values of `ratingDic`).
reader = Reader(rating_scale=(0, 5))
# Build the Surprise dataset from the (user, item, rating) columns, in that order.
data = Dataset.load_from_df(
    combine_book_rating[['user_id', 'title', 'rating']],
    reader,
)
# Hold out a quarter of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.25)

In [28]:
# Train a new SVD model with 100 latent features per user/item
# (the number was chosen arbitrarily, not tuned).
model = SVD(n_factors=100)
# Fit on the training split only; `testset` is kept for the RMSE evaluation.
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1bf39f0e908>

In [29]:

# Normalization: rescale every row of the item-factor matrix to unit
# Euclidean length.
# Squared length of the first row before rescaling.
pd.DataFrame(model.qi).iloc[0].pow(2).sum()
row_norms = np.linalg.norm(model.qi, ord=2, axis=1, keepdims=True)
model.qi /= row_norms
# Squared length of the first row after rescaling (expected to be ~1).
pd.DataFrame(model.qi).iloc[0].pow(2).sum()

0.9999999999999997

In [30]:
# Inspect the dimensions of the item-factor matrix (notebook display only).
model.qi.shape

(60829, 100)

In [31]:

from scipy.spatial.distance import cosine


def get_vector_by_book_title(book_title: str, trained_model: SVD) -> np.ndarray:
    """Return the latent-feature vector of a book.

    Args:
        book_title: Title of the book, used as the raw item id of the model's
            training set.
        trained_model: Fitted model exposing `trainset` and the item-factor
            matrix `qi`.

    Returns:
        The row of `trained_model.qi` that belongs to `book_title`.

    Raises:
        KeyError: Presumably raised by the mapping lookup when `book_title`
            is not part of the training set -- verify against the mapping type.
    """
    # NOTE(review): `_raw2inner_id_items` is a private attribute of the
    # trainset; a public accessor may exist -- confirm before relying on it.
    book_row_idx = trained_model.trainset._raw2inner_id_items[book_title]
    return trained_model.qi[book_row_idx]


def cosine_distance(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Return the cosine distance between two vectors.

    This is a distance, not a similarity: it is 0 for vectors pointing in the
    same direction, 1 for orthogonal vectors and 2 for opposite vectors, so
    smaller values mean the vectors are more alike.

    Args:
        vector_a: First 1-D vector.
        vector_b: Second 1-D vector, same length as `vector_a`.

    Returns:
        The cosine distance computed by `scipy.spatial.distance.cosine`.
    """
    return cosine(vector_a, vector_b)

In [32]:

# Fetch the latent vectors of the books "Die vierte Hand." and "Strange Beauty".
# NOTE: the variable names are left over from a movie example; they hold
# book vectors here.
toy_story_vec = get_vector_by_book_title('Die vierte Hand.', model)
wizard_of_oz_vec = get_vector_by_book_title('Strange Beauty', model)

# Calculate the cosine distance between the vectors. The smaller the number,
# the more similar the two books are
similarity_score = cosine_distance(toy_story_vec, wizard_of_oz_vec)
similarity_score

1.1690824286873354

In [34]:
def get_top_similarities(book_title: str, model: SVD) -> None:
    """Display every book of the training set ranked by closeness to `book_title`.

    The cosine distance between the vector of `book_title` and the vector of
    each book known to the model is computed, and the complete list of
    `(distance, title)` tuples is displayed in ascending order of distance
    (most similar first). The queried book itself is part of the list.

    Args:
        book_title: Title of the reference book; it is looked up through
            `get_vector_by_book_title`.
        model: Fitted model exposing `trainset` and the item-factor matrix `qi`.

    Returns:
        None. The ranking is shown via `display`; nothing is returned.
    """
    # Vector of the reference book
    book_vector = get_vector_by_book_title(book_title, model)

    # The title -> row mapping already holds each row index, so index the
    # factor matrix directly instead of looking every title up a second time.
    item_factors = model.qi
    similarity_table = [
        (cosine_distance(item_factors[inner_id], book_vector), other_book_title)
        for other_book_title, inner_id in model.trainset._raw2inner_id_items.items()
    ]

    # Ascending distance == descending similarity; equal distances are
    # ordered by title because the tuples are compared element-wise.
    display(sorted(similarity_table))

In [35]:
# Display all books ordered by cosine distance to "Strange Beauty"
# (smallest distance, i.e. most similar, first).
get_top_similarities('Strange Beauty', model)

[(0.0, 'Strange Beauty'),
 (0.55788817475107322, 'Tales of Mystery and Imagination'),
 (0.61895735412310282, 'Death Trilogy'),
 (0.61986640317497144, 'Autumn'),
 (0.63447258930456274,
  'The Lost Ones (Star Wars: Young Jedi Knights, Book 3)'),
 (0.64190460743499778,
  "The Botany of Desire: A Plant's-Eye View of the World"),
 (0.64353411669443128, 'Star Wars: A Storybook (Star Wars Series)'),
 (0.64489898134715085,
  'The Faber Book of Contemporary Australian Short Stories'),
 (0.64791776320715022, 'Il Coraggio Del Pettirosso'),
 (0.65251990389047276, "Rehearsal's Off!"),
 (0.65330025638618761, 'Opening to Channel: How to Connect With Your Guide'),
 (0.65483782886489172, 'Leyendas'),
 (0.65574780055994153, 'Autopsja czyli dziennik kryzysu'),
 (0.65577656548834518, 'Der Elefant verschwindet.'),
 (0.65598726489509041, "La gÃ\xa0bia d'or (Narrativa)"),
 (0.65813005497841326, 'Log from the Sea of Cortez'),
 (0.65898354210792265, 'Secret Strength: For Those Who Search'),
 (0.659329257124509

RMSE Evaluation

In [37]:
# Predict the held-out ratings with the fitted model.
# NOTE(review): `model.qi` was rescaled to unit-length rows in an earlier cell,
# so these predictions use the normalized item factors -- confirm that is intended.
predictions = model.test(testset)
# Root-mean-square error of the predictions on the test split.
accuracy.rmse(predictions)

RMSE: 1.2383


1.2383082667003071