Use TF IDF Vectoriser to transform text into vectors

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from bibrec.server.Utils import *



books = get_books("./data/BX-Books.csv")
book_data = pd.read_csv("./data/editions_dump.csv")
CHUNK_QUANTITY = 50
tfidf = TfidfVectorizer(stop_words='english')
book_data = pd.DataFrame(book_data).drop_duplicates(subset=["isbn_10"], ignore_index=True)
similarities = []
mappings = []



  df = pd.read_csv(path, sep=";", encoding="latin-1", index_col=False)
  book_data = pd.read_csv("./data/editions_dump.csv")


Preprocessing data to remove unnessecary spaces and characters

In [27]:
def book_data_preprocessing():
    global book_data
    book_data["book_info"] = book_data["book_info"].astype("string")
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.lower())
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.replace("(", ''))
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.replace(")", ''))
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.replace(',', ''))
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.replace('.', ''))
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.replace("'", ''))
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.replace("--", ''))
    book_data["book_info"] = book_data["book_info"].apply(lambda x: x.replace("  ", ' '))
    book_data["book_info"] = book_data["book_info"].apply(lambda x: ' '.join(list(set(x.split()))))

Create a Similarity Matrix for all books using cosine similarity

In order to avoid running out of memory while calculating the Matrix we decided to split the data into smaller chunks
- This has the effect that only similarities for books in the same chunk are calculated

In [28]:
def calculate_similarities():
    global similarities
    global mappings
    global book_data
    # data cleaning
    book_data.rename(columns={'isbn_10': 'isbn', 'first_sentence.value': 'first_sentence', 'notes.value': 'notes'}, inplace=True)
    book_data.isbn = book_data.isbn.map(lambda x: str(x).strip()[2:-2])
    book_data = books.merge(book_data, on="isbn", how="inner")
    book_data = book_data.drop(columns=["Unnamed: 0", "isbn_13", "publishers", "title", "first_sentence.type", "notes.type"])
    book_data.subjects = book_data.subjects.map(lambda x: str(x).strip()[2:-2])
    book_data.genres = book_data.genres.map(lambda x: str(x).strip()[2:-2])
    book_data = book_data.fillna(" ")

    # add book_info column and append relevant data
    book_data["book_info"] = book_data["book_title"] + " " + book_data["subjects"] + " " + book_data["genres"]
    book_data_preprocessing()

    similarities = []
    mappings = []
    for i in range(0, CHUNK_QUANTITY):
        len_books = book_data.shape[0] / CHUNK_QUANTITY
        chunk_data = book_data[int(i * len_books):int((i + 1) * len_books)]
        chunk_data = chunk_data.reset_index(drop=True)
        similarity_matrix = get_similarity_matrix(chunk_data)
        similarities.append(similarity_matrix)
        mappings.append(pd.Series(chunk_data.index, index=chunk_data["isbn13"]))

def get_similarity_matrix(book_data):
    tfidf_matrix = tfidf.fit_transform(book_data["book_info"])
    similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)
    return similarity_matrix

All similar items are gathered for the given isbn and sorted in a descending order.
The number of returned items is split at n

In [29]:
def recommend_tf_idf(isbn13, n=15):
    global similarities
    global mappings
    global book_data
    book_indices = []
    for i in range(0, CHUNK_QUANTITY):
        if isbn13 in mappings[i].index:
            similarity_matrix = similarities[i]
            book_index = mappings[i][isbn13]

            if isinstance(book_index, pd.Series):
                book_index = book_index[0]
            similarity_score = list(enumerate(similarity_matrix[book_index]))
            # Sort the books based on the similarity scores
            similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
            similarity_score = similarity_score[1:n + 1]

            # Check if all similarities are 0
            if all(i[1] == 0 for i in similarity_score):
                return None

            book_indices.append([i[0] for i in similarity_score])

    if len(book_indices) == 0:
        return None
    return book_data[["book_title", "isbn13", "isbn"]].iloc[book_indices[0]]

In [30]:
calculate_similarities()
print(recommend_tf_idf("9780771074677"))

                                             book_title         isbn13  \
3648                             Perdido Street Station  9780345443021   
4002  The Arabian Nights: More Marvels and Wonders o...  9780451527493   
4003  Arabian Nights: The Marvels and Wonders of the...  9780451525420   
2635                           Night's Pawn (Shadowrun)  9780451452382   
4292                       On the Street Where You Live  9780671004538   
45                                    The Street Lawyer  9780440225706   
2736                                      Eureka Street  9782264027757   
2871    Scarabian Nights:Sabrina, The Teenage Witch #24  9780671028046   
502                                   The Street Lawyer  9780385490993   
3610                           The Cater Street Hangman  9780449208670   
2226  On Basilisk Station (Honor Harrington Series, ...  9780671577728   
523                               Her Mother's Daughter  9780345353627   
2645                           The Thi