In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyjarowinkler import distance

import warnings
warnings.filterwarnings('ignore')
import os

os.chdir('data/')

In [2]:
items_ = pd.read_csv('items.csv',delimiter='\|',engine='python')
items_cleansed = pd.read_csv('items_cleansed.csv',delimiter='\|',engine='python')
items_deduplicated_cleansed = pd.read_csv('items_deduplicated_cleansed.csv',delimiter='\|',engine='python')
eval_ = pd.read_csv('evaluation.csv',delimiter='\|',engine='python')

In [3]:
items_[items_['itemID']==np.int64(28922)]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
61202,28922,A Christmas Carol,Charles Dickens,1st World Library - Literary Society,YFD,[]


In [4]:
items_deduplicated_cleansed[items_deduplicated_cleansed['itemID']==np.int64(21179)]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
20856,21179,a heat wave in the hellers,deborah j ross,trowbridge ross,fm,


In [5]:
def jaro_winkler_sim(string_1 : str, string_2: str) -> float:
    return distance.get_jaro_distance(string_1, string_2, winkler=True)

In [6]:
def get_recommendations(item_id : np.int64) -> tuple:
    similarities_=[]

    book_entry = items_.loc[items_['itemID'] == item_id].reset_index()
    book_entry_cleansed = items_cleansed.loc[items_cleansed['itemID'] == item_id].reset_index()

    author = str(book_entry_cleansed.at[0,'author']) # string
    title = str(book_entry_cleansed.at[0,'title']) # string
    publisher = str(book_entry_cleansed.at[0,'publisher']) # string
    topic = str(book_entry_cleansed.at[0,'main topic']) # string
    # iteration over items
    for i in items_deduplicated_cleansed.index:
        book_compare_id = items_deduplicated_cleansed.at[i, 'itemID'] # np.int64
        ##
        author_compare = str(items_deduplicated_cleansed.at[i, 'author']) # string
        title_compare = str(items_deduplicated_cleansed.at[i, 'title']) # string
        publisher_compare = str(items_deduplicated_cleansed.at[i, 'publisher']) # string
        topic_compare = str(items_deduplicated_cleansed.at[i, 'main topic']) # string
        ##
        author_sim = jaro_winkler_sim(author, author_compare)
        title_sim = jaro_winkler_sim(title, title_compare)
        publisher_sim = jaro_winkler_sim(publisher, publisher_compare)
        topic_sim = jaro_winkler_sim(topic, topic_compare)
        book_sim = ((author_sim + title_sim + publisher_sim+topic_sim) / 4)
        if book_sim == float(1):
            continue
        if len(similarities_) < 5:
            similarities_.append(tuple([book_compare_id, book_sim]))
        else:
            book_similarities = [x[1] for x in similarities_]
            if all(book_sim >= s[1] for s in similarities_):
                similarities_.append(tuple([book_compare_id, book_sim]))
            similarities_.sort(key=lambda s:s[1], reverse=True)
            similarities_ = similarities_[:5]
    recommendations = items_[items_['itemID'].isin([s[0] for s in similarities_])]
    recommendations['sim'] = np.nan
    for entry_ in [s for s in similarities_]:
        recommendations.loc[(recommendations['itemID'] == entry_[0]), 'sim'] = entry_[1]
    return book_entry, recommendations.sort_values(by=['sim'],ascending=False)

In [7]:
test_eval_ = eval_.at[420,'itemID']

In [8]:
recommendations_ = get_recommendations(test_eval_)
display(recommendations_[0])
display(recommendations_[1])

Unnamed: 0,index,itemID,title,author,publisher,main topic,subtopics
0,43455,75539,Memory Hunter,Frank Morin,Whipsaw Press,FMX,[]


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,sim
40657,54476,Rune Warrior,Frank Morin,Whipsaw Press,FMH,[],0.82
40654,39518,A Stone's Throw,Frank Morin,Whipsaw Press,YFH,[],0.785
16421,2985,Meow,Skye Mackinnon,Peryton Press,FMX,[],0.7525
61401,1967,Der Ruf des Jaguars,Franz A. Koch,Edition Spuren,FM,[],0.7225
29946,1074,The Warrior of Elyon,Aaron W. Baldwin,Westbow Press,FM,[],0.69
