In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [2]:
books_df = pd.read_csv('book1-100k.csv', delimiter= ",")
books_df.head()

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3
0,1,Harry Potter and the Half-Blood Prince (Harry ...,6.913888889,652,4:556485,total:2298124,16,9,Scholastic Inc.,28062,2006,eng,J.K. Rowling,4.57,2:25317,5:1546466,,3:159960
1,2,Harry Potter and the Order of the Phoenix (Har...,1:12455,870,4:604283,total:2358637,1,9,Scholastic Inc.,29770,2004,eng,J.K. Rowling,4.5,2:37005,5:1493113,439358078,3:211781
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,1:108202,309,4:1513191,total:6587388,1,11,Scholastic Inc,75911,2003,eng,J.K. Rowling,4.47,2:130310,5:4268227,,3:567458
3,4,Harry Potter and the Chamber of Secrets (Harry...,1:11896,352,4:706082,total:2560657,1,11,Scholastic,244,2003,eng,J.K. Rowling,4.42,2:49353,5:1504505,439554896,3:288821
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,1:10128,435,4:630534,total:2610317,1,5,Scholastic Inc.,37093,2004,eng,J.K. Rowling,4.57,2:24849,5:1749958,043965548X,3:194848


In [4]:
ratings_df = pd.read_csv('user_rating_0_to_1000.csv', delimiter= ",")
ratings_df.head()

Unnamed: 0,ID,Name,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing
2,1,Siddhartha,it was amazing
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it
4,1,"Ready Player One (Ready Player One, #1)",really liked it


In [5]:
books_df.columns

Index(['Id', 'Name', 'RatingDist1', 'pagesNumber', 'RatingDist4',
       'RatingDistTotal', 'PublishMonth', 'PublishDay', 'Publisher',
       'CountsOfReview', 'PublishYear', 'Language', 'Authors', 'Rating',
       'RatingDist2', 'RatingDist5', 'ISBN', 'RatingDist3'],
      dtype='object')

In [6]:
merged_df = pd.merge(books_df, ratings_df, on='Name')
merged_df.head()

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating_x,RatingDist2,RatingDist5,ISBN,RatingDist3,ID,Rating_y
0,18,The Ultimate Hitchhiker's Guide (Hitchhiker's ...,2.432638889,815,4:75683,total:274268,17,1,Wings Books,194,1996,eng,Douglas Adams,4.37,5.370138889,5:157499,517149257,3:30030,332,it was amazing
1,21,A Short History of Nearly Everything,3.611111111,544,4:94040,total:276166,14,9,Broadway Books,9545,2004,eng,Bill Bryson,4.21,6.588194444,5:129555,076790818X,3:38064,1,it was amazing
2,21,A Short History of Nearly Everything,3.611111111,544,4:94040,total:276166,14,9,Broadway Books,9545,2004,eng,Bill Bryson,4.21,6.588194444,5:129555,076790818X,3:38064,18,it was ok
3,21,A Short History of Nearly Everything,3.611111111,544,4:94040,total:276166,14,9,Broadway Books,9545,2004,eng,Bill Bryson,4.21,6.588194444,5:129555,076790818X,3:38064,56,liked it
4,21,A Short History of Nearly Everything,3.611111111,544,4:94040,total:276166,14,9,Broadway Books,9545,2004,eng,Bill Bryson,4.21,6.588194444,5:129555,076790818X,3:38064,141,really liked it


In [7]:
merged_df.columns

Index(['Id', 'Name', 'RatingDist1', 'pagesNumber', 'RatingDist4',
       'RatingDistTotal', 'PublishMonth', 'PublishDay', 'Publisher',
       'CountsOfReview', 'PublishYear', 'Language', 'Authors', 'Rating_x',
       'RatingDist2', 'RatingDist5', 'ISBN', 'RatingDist3', 'ID', 'Rating_y'],
      dtype='object')

In [8]:
# Renaming the IDs for better interpretation of data as one is referring to the book's ID and the other one to the user rating it
merged_df.rename(columns={'Id': 'bookID'}, inplace=True)
merged_df.rename(columns={'ID': 'userID'}, inplace=True)
merged_df.columns

Index(['bookID', 'Name', 'RatingDist1', 'pagesNumber', 'RatingDist4',
       'RatingDistTotal', 'PublishMonth', 'PublishDay', 'Publisher',
       'CountsOfReview', 'PublishYear', 'Language', 'Authors', 'Rating_x',
       'RatingDist2', 'RatingDist5', 'ISBN', 'RatingDist3', 'userID',
       'Rating_y'],
      dtype='object')

In [9]:
# Number of missing values per column 
missing_per_column = merged_df.isnull().sum()
print(missing_per_column)

bookID                0
Name                  0
RatingDist1           0
pagesNumber           0
RatingDist4           0
RatingDistTotal       0
PublishMonth          0
PublishDay            0
Publisher            94
CountsOfReview        0
PublishYear           0
Language           4472
Authors               0
Rating_x              0
RatingDist2           0
RatingDist5           0
ISBN               1114
RatingDist3           0
userID                0
Rating_y              0
dtype: int64


In [10]:
# For ISBN and Publisher we can ignore these missing values as they are not really relevant to our analysis. 
# Language, however, can be quite important for a book recommendation system, as it directly affects user preferences 
# and accessibility. 

# Mode imputation for missing values:
mode_language = merged_df['Language'].mode()[0]
merged_df['Language'].fillna(mode_language, inplace=True)

In [11]:
# Tokenizing and lemmatizing columns:
def tokenize_and_lemmatize(col):
    # ensures col is a string
    col = str(col)
    # Load English stopwords
    stop_words = set(stopwords.words('english'))
    # Initialize the Lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Tokenize the column
    tokens = word_tokenize(col.lower())
    # Filter out stopwords and non-alphabetic characters
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Lemmatize each filtered token
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return lemmatized_tokens

In [12]:
# Apply the tokenizing and lemmatization function to the "text" cols:
merged_df['Processed_Name'] = merged_df['Name'].apply(tokenize_and_lemmatize)
merged_df['Processed_Authors'] = merged_df['Authors'].apply(tokenize_and_lemmatize)
# Not that relevant for our analysis but just in case we use it later on:
merged_df['Processed_Publisher'] = merged_df['Publisher'].apply(tokenize_and_lemmatize)
merged_df['Processed_Rating'] = merged_df['Rating_y'].apply(tokenize_and_lemmatize)

In [13]:
print(merged_df[['Rating_y', 'Processed_Rating']].head())

          Rating_y Processed_Rating
0   it was amazing        [amazing]
1   it was amazing        [amazing]
2        it was ok             [ok]
3         liked it          [liked]
4  really liked it  [really, liked]


In [14]:
# Splitting into a training and validation set:
df_train_set, df_valid_set = train_test_split(merged_df, test_size=0.2, random_state=42, shuffle=True) # random state for reproducibility

Model 2: Calculation of similarity scores for “book rating” feature + implementing number of “written reviews” feature as weight to the vectors

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

This function calculates the cosine similarity matrix between books based on their ratings.
It first converts the textual ratings ('amazing', 'really liked it', etc.) to numeric values (1-5).
Then it creates a pivot table with users as rows and books as columns, filling missing values with 0.
Finally, it calculates the cosine similarity between the transpose of the ratings matrix and returns the similarity matrix.

In [27]:
# Calculate cosine similarity between book ratings
def calculate_similarity(merged_df):
    # Convert rating strings to numeric values
    merged_df['Rating_numeric'] = merged_df['Processed_Rating'].apply(lambda x: 5 if 'amazing' in x else (4 if 'really, liked' in x else (3 if 'liked' in x else (2 if 'ok' in x else 1))))
    # Calculate cosine similarity matrix
    ratings_matrix = merged_df.pivot_table(index='userID', columns='bookID', values='Rating_numeric').fillna(0)
    similarity_matrix = cosine_similarity(ratings_matrix.T)
    return similarity_matrix


This function calculates weighted similarity scores based on the number of written reviews for each book.
It first calculates the total number of written reviews for each book.
Then it normalizes the number of written reviews to a range between 0 and 1.
Finally, it applies these weights to the similarity matrix and returns the weighted similarity matrix.


In [28]:
# Weighted similarity scores based on number of written reviews
def calculate_weighted_similarity(similarity_matrix, merged_df):
    # Get number of written reviews for each book
    written_reviews = merged_df.groupby('bookID')['CountsOfReview'].sum()
    # Normalize written reviews
    max_reviews = written_reviews.max()
    written_reviews_norm = written_reviews / max_reviews
    # Apply weights to similarity matrix
    weighted_similarity_matrix = similarity_matrix * written_reviews_norm.values.reshape(-1, 1)
    return weighted_similarity_matrix

In [29]:
# Calculate similarity matrix
similarity_matrix = calculate_similarity(df_train_set)

# Calculate weighted similarity matrix
weighted_similarity_matrix = calculate_weighted_similarity(similarity_matrix, df_train_set)

Recommend Books

In [30]:
def recommend_books(book_title, weighted_similarity_matrix, num_recommendations=5):
    # Find the index of the given book title
    book_index = df_train_set[df_train_set['Name'] == book_title].index[0]
    
    # Get the similarity scores for the given book
    book_similarity_scores = weighted_similarity_matrix[book_index]
    
    # Sort the similarity scores in descending order and get top N similar books
    similar_books_indices = book_similarity_scores.argsort()[::-1][1:num_recommendations+1]
    similar_books_titles = df_train_set.iloc[similar_books_indices]['Name'].tolist()
    
    return similar_books_titles

In [31]:
# Example: Recommend 5 books similar to "A Short History of Nearly Everything"
recommendations = recommend_books("A Short History of Nearly Everything", weighted_similarity_matrix)
print("Recommended Books:")
for i, book in enumerate(recommendations, 1):
    print(f"{i}. {book}")

Recommended Books:
1. Naked
2. Wuthering Heights
3. Pet Sematary
4. Mother Night
5. V for Vendetta


References used:

https://www.kaggle.com/code/olih1029/book-rating 

https://www.kaggle.com/code/klaragerlei/book-recommendations-from-movies

https://www.kaggle.com/code/donkratas/best-book-of-every-year