In [185]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [186]:
books_df = pd.read_csv('book600k-700k.csv', delimiter= ",")
books_df.head()

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description
0,600000,"Lessons Learned (Great Chefs, #2)",Nora Roberts,037351025X,3.74,1993,15,2,Silhouette,5:947,4:1016,3:1061,2:287,1:63,total:3374,86,eng,250,LESSONS LEARNED...<br /><br />Coordinating the...
1,600001,Walking by Faith: Lessons Learned in the Dark,Jennifer Rothschild,0633099325,4.27,2003,1,1,Lifeway Church Resources,5:367,4:246,3:109,2:22,1:5,total:749,7,,112,"At the age of fifteen, Jennifer Rothschild con..."
2,600003,Better Health in Africa: Experience and Lesson...,World Bank Group,0821328174,5.0,1994,1,1,World Bank Publications,5:1,4:0,3:0,2:0,1:0,total:1,1,,240,
3,600004,"The Blended Learning Book: Best Practices, Pro...",Josh Bersin,0787972967,4.1,2004,1,10,Pfeiffer,5:8,4:6,3:6,2:0,1:0,total:20,3,,319,<i>The Blended Learning Book</i> is your user'...
4,600005,Lessons Learned: Successes Achieved: Be Prepar...,Robert G. Gillio,0595417566,3.0,2006,30,11,iUniverse,5:0,4:0,3:1,2:0,1:0,total:1,0,,168,"""<b>Lessons Learned: Successes Achieved</b>"" w..."


In [187]:
ratings_df = pd.read_csv('user_rating_6000_to_11000.csv', delimiter= ",")
ratings_df.head()

Unnamed: 0,ID,Name,Rating
0,6675,"Baxter, the Pig Who Wanted to Be Kosher",it was amazing
1,6675,Set This House in Order,really liked it
2,6675,Paradise Park,really liked it
3,7027,Paradise Park,liked it
4,6675,The Dead Fish Museum,it was amazing


In [188]:
books_df.columns

Index(['Id', 'Name', 'Authors', 'ISBN', 'Rating', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'pagesNumber', 'Description'],
      dtype='object')

In [189]:
merged_df = pd.merge(books_df, ratings_df, on='Name')
merged_df.head()

Unnamed: 0,Id,Name,Authors,ISBN,Rating_x,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description,ID,Rating_y
0,600044,Interrogations: The Nazi Elite in Allied Hands...,Richard Overy,142001589,3.95,2002,24,9,Penguin Books,5:46,...,3:34,2:7,1:2,total:166,8,,672,While the trial of Hitler's fallen elite at Nu...,9859,really liked it
1,600045,Russia's War: A History of the Soviet Effort: ...,Richard Overy,140271694,4.19,1998,1,8,Penguin Books,5:879,...,3:303,2:59,1:28,total:1977,56,,394,"<b>""A penetrating and compassionate book on th...",10627,really liked it
2,600051,Time of the Butcherbird,Alex La Guma,435907581,3.8,1987,1,1,Heinemann Educational Books,5:16,...,3:21,2:1,1:2,total:66,4,,128,"Out on the flat, featureless South African vel...",10378,it was amazing
3,600057,Justice at Nuremberg,Robert E. Conot,881840327,4.15,1993,28,1,Basic Books,5:88,...,3:40,2:6,1:1,total:225,16,eng,624,"Here, for the first time in one volume, is the...",9859,liked it
4,600061,Infinite in All Directions,Freeman Dyson,60915692,3.94,1989,1,2,Harper & Row,5:91,...,3:72,2:11,1:6,total:330,5,,319,Infinite In All Directions is a book on a wide...,10674,really liked it


In [190]:
merged_df.columns

Index(['Id', 'Name', 'Authors', 'ISBN', 'Rating_x', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'pagesNumber', 'Description', 'ID',
       'Rating_y'],
      dtype='object')

In [191]:
# Renaming the IDs for better interpretation of data as one is referring to the book's ID and the other one to the user rating it
merged_df.rename(columns={'Id': 'bookID'}, inplace=True)
merged_df.rename(columns={'ID': 'userID'}, inplace=True)
merged_df.columns

Index(['bookID', 'Name', 'Authors', 'ISBN', 'Rating_x', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'pagesNumber', 'Description', 'userID',
       'Rating_y'],
      dtype='object')

In [192]:
merged_df['bookID'].value_counts()

651603    323
662347    290
620121    234
608945    233
667456    220
         ... 
637047      1
636941      1
636915      1
636880      1
699839      1
Name: bookID, Length: 1651, dtype: int64

In [193]:
# Dropping duplicate book IDs
merged_df = merged_df.drop_duplicates(subset='bookID', keep='first')

In [194]:
merged_df.shape

(1651, 21)

In [195]:
# Number of missing values per column 
missing_per_column = merged_df.isnull().sum()
print(missing_per_column)

bookID               0
Name                 0
Authors              0
ISBN                25
Rating_x             0
PublishYear          0
PublishMonth         0
PublishDay           0
Publisher           12
RatingDist5          0
RatingDist4          0
RatingDist3          0
RatingDist2          0
RatingDist1          0
RatingDistTotal      0
CountsOfReview       0
Language           938
pagesNumber          0
Description         92
userID               0
Rating_y             0
dtype: int64


In [196]:
# Removing the rows that have no description
merged_df = merged_df.dropna(subset=['Description'])
# Removing the rows that are not English
merged_df = merged_df[merged_df['Language'].str.startswith('en', na=False)]

In [197]:
merged_df.shape

(679, 21)

In [198]:
merged_df['Language'].value_counts()

eng      552
en-US     99
en-GB     26
en-CA      2
Name: Language, dtype: int64

In [199]:
# # For ISBN and Publisher we can ignore these missing values as they are not really relevant to our analysis. 
# # Language, however, can be quite important for a book recommendation system, as it directly affects user preferences 
# # and accessibility. 

# Mode imputation for missing values:
mode_language = merged_df['Language'].mode()[0]
merged_df['Language'].fillna(mode_language, inplace=True)

In [200]:
# Tokenizing and lemmatizing columns:
def tokenize_and_lemmatize(col):
    # ensures col is a string
    col = str(col)
    # Load English stopwords
    stop_words = set(stopwords.words('english'))
    # Initialize the Lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Tokenize the column
    tokens = word_tokenize(col.lower())
    # Filter out stopwords and non-alphabetic characters
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Lemmatize each filtered token
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return lemmatized_tokens

In [201]:
# Apply the tokenizing and lemmatization function to the "text" cols:
merged_df['Processed_Name'] = merged_df['Name'].apply(tokenize_and_lemmatize)
merged_df['Processed_Authors'] = merged_df['Authors'].apply(tokenize_and_lemmatize)
# Not that relevant for our analysis but just in case we use it later on:
merged_df['Processed_Publisher'] = merged_df['Publisher'].apply(tokenize_and_lemmatize)
merged_df['Processed_Rating'] = merged_df['Rating_y'].apply(tokenize_and_lemmatize)
merged_df['Processed_Description'] = merged_df['Description'].apply(tokenize_and_lemmatize)

# Transforming the list
merged_df['Processed_Description'] = merged_df['Processed_Description'].apply(lambda x: ' '.join(x))

In [202]:
merged_df.shape

(679, 26)

In [203]:
# Splitting into a training and validation set:
df_train_set, df_valid_set = train_test_split(merged_df, test_size=0.2, random_state=42, shuffle=True) # random state for reproducibility

# Description Model (Anne Marijn & Clarice)

In [204]:
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [205]:
print(df_train_set.shape)
print(df_valid_set.shape)

(543, 26)
(136, 26)


In [206]:
tfidf_matrix.shape

(543, 10330)

In [207]:
def calculate_description_similarity(df):
    # Initialising tfidf vectorisation
    tf = TfidfVectorizer()
    
    # Fitting and transforming
    tfidf_matrix = tf.fit_transform(df['Processed_Description'])
    
    # Calculating cosine similarities of tfidf vectors
    description_similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    description_similarity_matrix = pd.DataFrame(description_similarity_matrix, 
                                                 index=df['Name'], 
                                                 columns=df['Name'])
    
    return description_similarity_matrix

In [208]:
description_matrix = calculate_description_similarity(df_train_set)

In [209]:
description_matrix.head()

Name,Orange Pear Apple Bear,Nobody's Angel,"Stupid History: Tales of Stupidity, Strangeness, and Mythconceptions Throughout the Ages",Give and Take,"The House of the Vestals (Roma Sub Rosa, #6)",Chronicle of a Death Foretold,The Great Bridge: The Epic Story of the Building of the Brooklyn Bridge,We,"Prince of Dogs (Crown of Stars, #2)",The Stand,...,The Leopard,The Pump House Gang,Trouble and Her Friends,"The Mageborn Traitor (Exiles, #2)",A Walk on the Wild Side,The Metamorphosis,Into the Forest,The Man Who Ate Everything,Nobody's Fool,Volcano: A Memoir of Hawai'i
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Orange Pear Apple Bear,1.0,0.014711,0.0,0.0,0.0,0.0,0.012786,0.0,0.003375,0.018281,...,0.0,0.008886,0.006251,0.049019,0.0,0.0,0.006035,0.03055,0.0,0.0
Nobody's Angel,0.014711,1.0,0.052087,0.04252,0.015332,0.048762,0.039863,0.046056,0.065747,0.013858,...,0.006552,0.0,0.059248,0.031543,0.07529,0.025742,0.028534,0.029033,0.03299,0.012218
"Stupid History: Tales of Stupidity, Strangeness, and Mythconceptions Throughout the Ages",0.0,0.052087,1.0,0.018487,0.023674,0.09791,0.055268,0.0,0.064679,0.002286,...,0.003161,0.010521,0.078934,0.055091,0.081268,0.058052,0.060694,0.027882,0.031464,0.0
Give and Take,0.0,0.04252,0.018487,1.0,0.0,0.046961,0.00343,0.0,0.023687,0.017456,...,0.0,0.0,0.016709,0.00485,0.038876,0.004181,0.013615,0.048568,0.002576,0.0
"The House of the Vestals (Roma Sub Rosa, #6)",0.0,0.015332,0.023674,0.0,1.0,0.014801,0.028614,0.016795,0.024704,0.011351,...,0.005855,0.04323,0.045025,0.022904,0.013825,0.023907,0.01518,0.0,0.018492,0.022972


In [236]:
def description_recommendations(book_title, similarity_matrix, num_recommendations=5):
    # Searching for corresponding similarity scores
    book_similarity_scores = description_matrix[book_title]
    # Sorting similarity scores
    similar_books_indices = book_similarity_scores.argsort()[::-1][1:5+1]
    similar_books_titles = df_train_set.iloc[similar_books_indices]['Name'].tolist()
    # Printing top 5 similar books
    for i in range(len(similar_books_titles)):
        print(f"Book {i+1}: {similar_books_titles[i]} - Similarity score: {book_similarity_scores[similar_books_titles[i]]:.2f}")

In [237]:
description_recommendations("The House of the Vestals (Roma Sub Rosa, #6)", description_matrix)

Book 1: The Venus Throw (Roma Sub Rosa, #4) - Similarity score: 0.32
Book 2: A Murder on the Appian Way (Roma Sub Rosa, #5) - Similarity score: 0.26
Book 3: Vox - Similarity score: 0.10
Book 4: The Return of Rafe MacKade (The MacKade Brothers, #1) - Similarity score: 0.09
Book 5: The Satyricon - Similarity score: 0.07


# Rating Model (Tereza)

Model 2: Calculation of similarity scores for “book rating” feature + implementing number of “written reviews” feature as weight to the vectors

In [110]:
from sklearn.metrics.pairwise import cosine_similarity

This function calculates the cosine similarity matrix between books based on their ratings.
It first converts the textual ratings ('amazing', 'really liked it', etc.) to numeric values (1-5).
Then it creates a pivot table with users as rows and books as columns, filling missing values with 0.
Finally, it calculates the cosine similarity between the transpose of the ratings matrix and returns the similarity matrix.

In [111]:
# Calculate cosine similarity between book ratings
def calculate_similarity(merged_df):
    # Convert rating strings to numeric values
    merged_df['Rating_numeric'] = merged_df['Processed_Rating'].apply(lambda x: 5 if 'amazing' in x else (4 if 'really, liked' in x else (3 if 'liked' in x else (2 if 'ok' in x else 1))))
    # Calculate cosine similarity matrix
    ratings_matrix = merged_df.pivot_table(index='userID', columns='bookID', values='Rating_numeric').fillna(0)
    similarity_matrix = cosine_similarity(ratings_matrix.T)
    return similarity_matrix


This function calculates weighted similarity scores based on the number of written reviews for each book.
It first calculates the total number of written reviews for each book.
Then it normalizes the number of written reviews to a range between 0 and 1.
Finally, it applies these weights to the similarity matrix and returns the weighted similarity matrix.


In [112]:
# Weighted similarity scores based on number of written reviews
def calculate_weighted_similarity(similarity_matrix, merged_df):
    # Get number of written reviews for each book
    written_reviews = merged_df.groupby('bookID')['CountsOfReview'].sum()
    # Normalize written reviews
    max_reviews = written_reviews.max()
    written_reviews_norm = written_reviews / max_reviews
    # Apply weights to similarity matrix
    weighted_similarity_matrix = similarity_matrix * written_reviews_norm.values.reshape(-1, 1)
    return weighted_similarity_matrix

In [113]:
# Calculate similarity matrix
similarity_matrix = calculate_similarity(df_train_set)

# Calculate weighted similarity matrix
weighted_similarity_matrix = calculate_weighted_similarity(similarity_matrix, df_train_set)

Recommend Books

In [117]:
def recommend_books(book_title, weighted_similarity_matrix, num_recommendations=5):
    # Find the index of the given book title
    book_index = df_train_set[df_train_set['Name'] == book_title].index[0]
    
    # Get the similarity scores for the given book
    book_similarity_scores = weighted_similarity_matrix[book_index]
    
    # Sort the similarity scores in descending order and get top N similar books
    similar_books_indices = book_similarity_scores.argsort()[::-1][1:num_recommendations+1]
    similar_books_titles = df_train_set.iloc[similar_books_indices]['Name'].tolist()
    
    return similar_books_titles

In [115]:
# Example: Recommend 5 books similar to "A Short History of Nearly Everything"
recommendations = recommend_books("A Short History of Nearly Everything", weighted_similarity_matrix)
print("Recommended Books:")
for i, book in enumerate(recommendations, 1):
    print(f"{i}. {book}")

IndexError: index 12788 is out of bounds for axis 0 with size 1467

References used:

https://www.kaggle.com/code/olih1029/book-rating 

https://www.kaggle.com/code/klaragerlei/book-recommendations-from-movies

https://www.kaggle.com/code/donkratas/best-book-of-every-year

In [118]:
recommend_books("A Short History of Nearly Everything", description_matrix)

KeyError: 12777