This notebook includes the official implementation of the paper [**Book recommendation system using TF-IDF and cosine similarity**](https://doi.org/10.1063/5.0212477) (InCASA 2023).

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import re
# Fetch the NLTK stop-word corpus; NLTK reports "already up-to-date" when it is present.
nltk.download('stopwords')
# English Snowball stemmer, used by clean() to reduce each word to its stem.
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
# English stop words as a set, which clean() filters out of every title.
stopword=set(stopwords.words('english'))

from tabulate import tabulate

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Import Data
# Read the first 67,500 rows of the Kaggle book catalogue, keep only the
# bibliographic columns, and give them shorter names.
data = (
    pd.read_csv("/kaggle/input/book-recommendation-dataset/Books.csv", nrows=67500)
    .loc[:, ["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"]]
    .rename(
        columns={
            "Book-Title": "Title",
            "Book-Author": "Author",
            "Year-Of-Publication": "Year",
        }
    )
)

In [3]:
# Data Cleaning
# Remove Missing Value Data
# Show the per-column count of missing values, drop every incomplete row,
# then show the counts again to confirm nothing is missing.
print(data.isna().sum())
data = data.dropna(axis=0, how="any")
print(data.isna().sum())

ISBN         0
Title        0
Author       0
Year         0
Publisher    0
dtype: int64
ISBN         0
Title        0
Author       0
Year         0
Publisher    0
dtype: int64


In [4]:
# Data Preprocessing
def clean(text):
    """Normalise a piece of text (here: a book title) for TF-IDF matching.

    Steps, in order: lower-case; drop ``[...]`` spans, URLs and ``<...>`` tags;
    strip ASCII punctuation; delete newlines; drop every token that contains a
    digit; remove English stop words; stem the remaining words.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The stemmed words joined by single spaces (empty string if none remain).
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid string escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument discards the empty tokens left behind by
    # removed punctuation/numbers, so the result never has doubled or
    # leading/trailing spaces.
    words = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(words)
# Store the cleaned form of every title; this column feeds both the
# title lookup and the TF-IDF vectoriser.
data["cleaned_Title"] = data["Title"].apply(clean)

In [5]:
# Lookup table: cleaned title -> row label (in `data.index`) of its first occurrence.
indices = pd.Series(data.index, index=data["cleaned_Title"])
# De-duplicate on the title (the index), not on the values:
# Series.drop_duplicates() compares the row labels, so repeated titles would
# survive and `indices[title]` would return a Series for some titles and a
# scalar for others.
indices = indices[~indices.index.duplicated(keep="first")]

In [6]:
# Generate Similarity
def get_similarity_cosine(title):
    """Return the cosine similarity between one title and every title in ``data``.

    Parameters
    ----------
    title : str
        A cleaned title (output of ``clean``) that exists in ``indices``.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per row of ``data``, in positional order.

    Raises
    ------
    KeyError
        If ``title`` is not a key of ``indices``.
    """
    # `indices[title]` is a scalar for a unique title and a Series when the
    # title occurs several times; wrap it so the first match can be taken
    # positionally in both cases.
    row_label = pd.Series(indices[title]).iloc[0]
    # `indices` stores labels from `data.index`, while the TF-IDF matrix is
    # addressed by position; the two differ once rows have been dropped.
    row_position = data.index.get_loc(row_label)

    tfidf_matrix = text.TfidfVectorizer().fit_transform(data["cleaned_Title"].tolist())
    # Compare only the query row against all rows. Building the full N x N
    # matrix just to read one row of it needs memory quadratic in the number
    # of books.
    return cosine_similarity(tfidf_matrix[row_position], tfidf_matrix)[0]

In [7]:
# Rank and Display Recommendations
def similarity(ISBN, title, similarity_scores, reverse, threshold=0.5, top_n=20):
    """Print the books most similar to the queried one as two tables.

    The first table holds books whose score is above ``threshold``, the second
    those at or below it. Books with a score of zero and the queried ISBN
    itself are left out.

    Parameters
    ----------
    ISBN : str
        ISBN of the queried book; rows with this ISBN are excluded.
    title : str
        Title of the queried book, echoed in the printed header.
    similarity_scores : sequence of float
        One score per row of ``data``, in positional (``iloc``) order.
    reverse : bool
        ``True`` lists the highest scores first, ``False`` the lowest first.
    threshold : float, default 0.5
        Boundary between the first and the second table.
    top_n : int, default 20
        Maximum number of rows printed per table.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Scores beyond the last row of `data` cannot be mapped to a book.
    scores = np.asarray(similarity_scores, dtype=float)[:len(data)]

    # A stable sort keeps books with equal scores in dataset order.
    order = np.argsort(-scores if reverse else scores, kind="stable")
    order = order[scores[order] > 0]

    # A single positional lookup keeps the column layout even when no book
    # has a positive score, so the selection below cannot fail on an empty frame.
    result = data.iloc[order][["ISBN", "Title"]].copy()
    result["Similarity"] = scores[order]
    result = result[result["ISBN"] != ISBN]

    tables = []
    for mask in (result["Similarity"] > threshold, result["Similarity"] <= threshold):
        table = result[mask].head(top_n)
        # Number the rows 1..n for display.
        tables.append(table.set_axis(np.arange(1, len(table) + 1), axis=0))

    print("ISBN: ", ISBN)
    print("Title: ", title)
    print("Book Recommendation:")

    for table in tables:
        print(tabulate(table, headers='keys', tablefmt = 'psql'))

In [8]:
# Generate Recommendation
def book_recommendation(ISBN, title):
    """Print recommendations for a book given its ISBN and raw title.

    The title is cleaned, scored against every title in the dataset, and the
    scores are handed to ``similarity`` for printing, highest first.
    """
    scores = get_similarity_cosine(clean(title))
    similarity(ISBN, title, scores, True)

In [9]:
def Book(ISBN):
    """Print book recommendations for the book with the given ISBN.

    Parameters
    ----------
    ISBN : str
        ISBN to look up in ``data["ISBN"]``.

    Raises
    ------
    ValueError
        If no row of ``data`` has this ISBN.
    """
    titles = data.loc[data["ISBN"] == ISBN, "Title"]
    if titles.empty:
        raise ValueError(f"ISBN {ISBN!r} was not found in the dataset")
    # Should the ISBN appear on more than one row, use the first match
    # instead of failing.
    book_recommendation(ISBN, titles.iloc[0])

# Example: recommendations for ISBN 051513290X.
Book("051513290X")

ISBN:  051513290X
Title:  Summer of Storms
Book Recommendation:
+----+------------+--------------------------------------------+--------------+
|    | ISBN       | Title                                      |   Similarity |
|----+------------+--------------------------------------------+--------------|
|  1 | 0399146741 | Summer of Storms                           |     1        |
|  2 | 0373114087 | Summer Storm (Harlequin Presents, No 1408) |     0.794218 |
|  3 | 0451201906 | Storm                                      |     0.740812 |
|  4 | 084393672X | Storm                                      |     0.740812 |
|  5 | 044040892X | Against the Storm                          |     0.740812 |
|  6 | 0553214225 | Summer                                     |     0.671713 |
|  7 | 1551662809 | Just For The Summer                        |     0.671713 |
|  8 | 0552995649 | Just for the Summer                        |     0.671713 |
|  9 | 0425064506 | Summer                              

In [10]:
# Example: recommendations for ISBN 0821722999.
Book("0821722999")

ISBN:  0821722999
Title:  Heart of the Country
Book Recommendation:
+----+------------+-------------------------+--------------+
|    | ISBN       | Title                   |   Similarity |
|----+------------+-------------------------+--------------|
|  1 | 0099533405 | Heart of the Country    |     1        |
|  2 | 0446611913 | Up Country              |     0.767146 |
|  3 | 0060809590 | In Country              |     0.767146 |
|  4 | 0446516570 | Up Country: A Novel     |     0.65195  |
|  5 | 0446605891 | With Heart              |     0.641473 |
|  6 | 0380767635 | Where the Heart Is      |     0.641473 |
|  7 | 0446603651 | Where the Heart Is      |     0.641473 |
|  8 | 1561002690 | Where the Heart Is      |     0.641473 |
|  9 | 0312252803 | The Country Life        |     0.624087 |
| 10 | 1841150371 | In the New Country      |     0.605442 |
| 11 | 0449908615 | Star Country            |     0.593234 |
| 12 | 0812522486 | The Little Country      |     0.590608 |
| 13 | 0312876491

In [11]:
# Example: recommendations for ISBN 0517123207.
Book("0517123207")

ISBN:  0517123207
Title:  Origin of Species
Book Recommendation:
+----+------------+------------------------------------------------------------------------+--------------+
|    |       ISBN | Title                                                                  |   Similarity |
|----+------------+------------------------------------------------------------------------+--------------|
|  1 | 1853267805 | THE ORIGIN OF SPECIES                                                  |     1        |
|  2 | 0553574043 | Species: A Novel                                                       |     0.718172 |
|  3 | 0375501037 | Darwin's Ghost: The Origin of Species Updated                          |     0.645249 |
|  4 | 0812570758 | Species II: A Novel                                                    |     0.594928 |
|  5 | 0674637526 | On the Origin of Species a Facsimile of the First (Harvard Paperbacks) |     0.585922 |
+----+------------+----------------------------------------------------

If you found our implementation useful, please consider citing our paper:

Christopher Gavra Reswara, Josua Nicolas, I. Made Danendra Widyatama, David David, Panji Arisaputra; Book recommendation system using TF-IDF and cosine similarity. AIP Conf. Proc. 10 May 2024; 3135 (1): 020003. [10.1063/5.0212477](https://doi.org/10.1063/5.0212477)