In [34]:
import pandas as pd

df = pd.read_parquet("/kaggle/working/goodreads_itemcf_small.parquet")

In [35]:
df.head()

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
788,5,7071,0,0,0
789,5,7061,1,5,0
790,5,7057,1,5,0
791,5,7051,0,0,0
792,5,1611,0,0,0


In [36]:
df['user_idx'] = df['user_id'].astype('category').cat.codes
df['book_idx'] = df['book_id'].astype('category').cat.codes

user_to_id = dict(zip(df['user_idx'], df['user_id']))
book_to_id = dict(zip(df['book_idx'], df['book_id']))

In [37]:
from scipy.sparse import csr_matrix

num_users = df['user_idx'].nunique()
num_books = df['book_idx'].nunique()

R = csr_matrix(
    (df['rating'], (df['book_idx'], df['user_idx'])),
    shape=(num_books, num_users)
)


In [38]:
# from sklearn.metrics.pairwise import cosine_similarity
# item_sim = cosine_similarity(R, dense_output=False)

from sklearn.metrics.pairwise import cosine_similarity

#raw full similarity (sparse)
sim = cosine_similarity(R, dense_output=True)

#keep only top-k per item
top_k = 50
item_sim_k = np.zeros_like(sim)

for i in range(sim.shape[0]):
    row = sim[i]
    top_idx = np.argpartition(row, -top_k)[-top_k:]
    item_sim_k[i, top_idx] = row[top_idx]

item_sim = csr_matrix(item_sim_k)


In [39]:
import pickle

with open("/kaggle/working/bookid_to_title.pkl", "rb") as f:
    bookid_to_title = pickle.load(f)

In [40]:
def get_title(book_idx):
    real_id = book_to_id[book_idx]
    return bookid_to_title.get(real_id, f"Unknown Title ({real_id})")


In [41]:
import numpy as np

def hybrid_recommend(uidx, liked_book_idx, alpha=0.7, top_n=10):
    rated_books = R[:, uidx].nonzero()[0]

    #user-based (via item-item aggregation)
    if len(rated_books) > 0:
        user_scores = np.asarray(item_sim[rated_books].sum(axis=0)).ravel()
    else:
        user_scores = np.zeros(num_books)

    #item-item similarity to liked book
    feedback_scores = item_sim[liked_book_idx].toarray().ravel()

    hybrid_score = alpha * user_scores + (1 - alpha) * feedback_scores

    hybrid_score[rated_books] = -np.inf
    hybrid_score[liked_book_idx] = -np.inf

    top_books = np.argsort(hybrid_score)[-top_n:][::-1]
    return top_books, hybrid_score


In [42]:
def recommend(uidx, liked_book_idx, top_n=10):
    top_books, scores = hybrid_recommend(uidx, liked_book_idx)
    
    results = []
    for b in top_books:
        results.append({
            "book_idx": b,
            "book_id": book_to_id[b],
            "title": get_title(b),
            "score": float(scores[b])
        })
    return results


In [43]:
csv_to_real = dict(zip(book_id_map['book_id_csv'], book_id_map['book_id']))

with open("/kaggle/working/bookid_to_title.pkl", "rb") as f:
    bookid_to_title = pickle.load(f)

book_idx_to_bookid_csv = dict(zip(df['book_idx'], df['book_id']))

def get_title(book_idx):
    book_id_csv = book_idx_to_bookid_csv[book_idx]
    real_id = csv_to_real.get(book_id_csv)
    if real_id is None:
        return f"Unknown Book (csv:{book_id_csv})"
    return bookid_to_title.get(real_id, f"Unknown Title ({real_id})")

In [46]:
uidx = 25
liked_book_idx = 130
print(get_title(liked_book_idx))
recs = recommend(uidx, liked_book_idx, top_n=5)
print('\n')
for r in recs:
    print(r['title'], " | score:", r['score'])

The Adventures of Sherlock Holmes


City of Glass (The Mortal Instruments, #3)  | score: 27.455144687334688
The Giver (The Giver, #1)  | score: 21.37113604565483
Shadow and Bone (The Grisha, #1)  | score: 20.966757752139266
Matched (Matched, #1)  | score: 20.096500225616342
To Kill a Mockingbird  | score: 19.263914731157207
Twilight (Twilight, #1)  | score: 19.126329776486667
Graceling (Graceling Realm, #1)  | score: 18.97639339589158
The Great Gatsby  | score: 17.76288580834507
City of Fallen Angels (The Mortal Instruments, #4)  | score: 17.741288548382617
Eleanor & Park  | score: 15.413617652803815


In [49]:
uidx = 26
liked_book_idx = 133
print(get_title(liked_book_idx))
recs = recommend(uidx, liked_book_idx, top_n=5)
print('\n')
for r in recs:
    print(r['title'], " | score:", r['score'])

Harry Potter Boxset (Harry Potter, #1-7)


To Kill a Mockingbird  | score: 40.51039960954237
Catching Fire (The Hunger Games, #2)  | score: 39.2651214712732
Mockingjay (The Hunger Games, #3)  | score: 35.46727422993823
Harry Potter and the Deathly Hallows (Harry Potter, #7)  | score: 32.88674751062113
Harry Potter and the Goblet of Fire (Harry Potter, #4)  | score: 32.40602128219616
Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)  | score: 31.975083981305833
Harry Potter and the Chamber of Secrets (Harry Potter, #2)  | score: 31.884699494198514
Harry Potter and the Half-Blood Prince (Harry Potter, #6)  | score: 31.520161818486386
Harry Potter and the Order of the Phoenix (Harry Potter, #5)  | score: 31.294979620451077
The Diary of a Young Girl  | score: 27.449934957772733


In [50]:
def find_book_idx_by_title(title):
    for book_id, t in bookid_to_title.items():
        if title.lower() in t.lower():
            idxs = [bidx for bidx, rid in book_to_id.items() if rid == book_id]
            if idxs:
                return idxs[0]
    return None

---------------------------------------

In [3]:
import pandas as pd
import json
from itertools import islice

def read_json_head(path, n=5):
    try:
        df = pd.read_json(path, lines=True, chunksize=n)
        return next(df).head()
    except:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            return pd.DataFrame(data[:n])


In [4]:
import numpy as np
import pandas as pd
import math
import pickle
import os
from collections import defaultdict


In [5]:
INTERACTIONS_PATH = "/kaggle/input/interactions/goodreads_interactions.csv"
BOOK_ID_MAP_PATH  = "/kaggle/input/goodreads/book_id_map.csv"     
USER_ID_MAP_PATH  = "/kaggle/input/goodreads/user_id_map.csv"      
GOODREADS_BOOKS_PATH = "/kaggle/input/goodreads/goodreads_books.json"  

SIM_MATRIX_PATH = "similarity_matrix_books.pkl"


In [6]:
interactions = pd.read_csv(INTERACTIONS_PATH)
book_id_map  = pd.read_csv(BOOK_ID_MAP_PATH)
# user_id_map  = pd.read_csv(USER_ID_MAP_PATH)

print("Interations head:")
print(interactions.head())

print("book_id_map head:")
print(book_id_map.head())

# print("user_id_map head:")
# print(user_id_map.head())

Interations head:
   user_id  book_id  is_read  rating  is_reviewed
0        0      948        1       5            0
1        0      947        1       5            1
2        0      946        1       5            0
3        0      945        1       5            0
4        0      944        1       5            0
book_id_map head:
   book_id_csv   book_id
0            0  34684622
1            1  34536488
2            2  34017076
3            3     71730
4            4  30422361


In [5]:
read_json_head('/kaggle/input/goodreads/goodreads_books.json')

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,312853122.0,1,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,,...,9.0,,1984.0,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,743509986.0,6,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,B000FC0PBC,...,10.0,Abridged,2001.0,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10,1323437,Good Harbor,Good Harbor
2,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,Book Club Edition,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,743294297.0,3282,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",,False,3.49,B002ENBLOK,...,7.0,,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,6243154,Best Friends Forever,Best Friends Forever
4,850308712.0,5,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,3.4,,...,,,,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,287140,15,278577,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...


In [2]:
import numpy as np
import pandas as pd
import json
import math
import pickle
import os
from collections import defaultdict


In [24]:
book_id_map = pd.read_csv("/kaggle/input/goodreads/book_id_map.csv")     
user_id_map = pd.read_csv("/kaggle/input/goodreads/user_id_map.csv")      

# print("Interactions head:")
# print(interactions.head())
print("book_id_map head:")
print(book_id_map.head())
print("user_id_map head:")
print(user_id_map.head())
# print("goodreads_books head:")
# print(goodreads_books[['book_id', 'title']].head())


book_id_map head:
   book_id_csv   book_id
0            0  34684622
1            1  34536488
2            2  34017076
3            3     71730
4            4  30422361
user_id_map head:
   user_id_csv                           user_id
0            0  8842281e1d1347389f2ab93d60773d4d
1            1  72fb0d0087d28c832f15776b0d936598
2            2  ab2923b738ea3082f5f3efcbbfacb218
3            3  d986f354a045ffb91234e4af4d1b12fd
4            4  7504b2aee1ecb5b2872d3da381c6c91e


In [7]:
import pandas as pd

GOODREADS_BOOKS_PATH = "/kaggle/input/goodreads/goodreads_books.json"
chunks = pd.read_json(GOODREADS_BOOKS_PATH, lines=True, chunksize=50_000)

book_meta_list = []

for chunk in chunks:
    small = chunk[['book_id', 'title']].copy()
    book_meta_list.append(small)
goodreads_books = pd.concat(book_meta_list, ignore_index=True)
print(goodreads_books.head())

   book_id                                              title
0  5333265                        W.C. Fields: A Life on Film
1  1333909                                        Good Harbor
2  7327624  The Unschooled Wizard (Sun Wolf and Starhawk, ...
3  6066819                               Best Friends Forever
4   287140  Runic Astrology: Starcraft and Timekeeping in ...


In [12]:
bookid_to_title = dict(zip(goodreads_books['book_id'], goodreads_books['title']))

In [13]:
import pickle

with open("bookid_to_title.pkl", "wb") as f:
    pickle.dump(bookid_to_title, f)

print("Saved bookid_to_title.pkl")

Saved bookid_to_title.pkl


In [14]:
print("Unique books:", interactions['book_id'].nunique())
print("Unique users:", interactions['user_id'].nunique())

Unique books: 2360650
Unique users: 876145


In [15]:
def density_filter(df, min_user=20, min_book=20, max_iters=5):
    for _ in range(max_iters):
        before = df.shape[0]

        #to filter users
        user_counts = df['user_id'].value_counts()
        good_users = user_counts[user_counts >= min_user].index
        df = df[df['user_id'].isin(good_users)]

        #t9 filter books
        book_counts = df['book_id'].value_counts()
        good_books = book_counts[book_counts >= min_book].index
        df = df[df['book_id'].isin(good_books)]

        after = df.shape[0]
        if before == after:
            break

    return df


#interactions that have a rating
df = interactions[interactions['rating'] > 0].copy()
print("Initial interactions:", df.shape)

#density filtering
df_filtered = density_filter(df, min_user=20, min_book=20)
print("\nAfter density filtering:", df_filtered.shape)

#sample to 100k
TARGET = 100_000
if df_filtered.shape[0] > TARGET:
    df_filtered = df_filtered.sample(n=TARGET, random_state=42)
    print("After final sampling:", df_filtered.shape)

#final stats
print("\nFinal unique users:", df_filtered['user_id'].nunique())
print("Final unique books:", df_filtered['book_id'].nunique())
print("Final interactions:", df_filtered.shape[0])


Initial interactions: (104551549, 5)

After density filtering: (93995919, 5)
After final sampling: (100000, 5)

Final unique users: 81629
Final unique books: 48357
Final interactions: 100000


In [16]:
df_filtered.head(5)

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
189859465,399226,4461,1,3,0
50597542,100895,2210,1,4,0
105980604,216054,271276,1,3,0
35242794,70069,240617,1,2,0
165850995,346123,86648,1,3,0


In [17]:
df_filtered.to_csv("filtered_interactions.csv", index=False)
print("Saved filtered_interactions.csv")

Saved filtered_interactions.csv


In [18]:
import pandas as pd
df_filtered = pd.read_csv("/kaggle/input/filt-int/filtered_interactions.csv")
print("Loaded filtered dataset:", df_filtered.shape)


Loaded filtered dataset: (100000, 5)


In [19]:
# df_filtered must have: user_id, book_id, rating

print("Filtered dataset:", df_filtered.shape)

user_list = df_filtered['user_id'].unique()
user_to_idx = {u: i for i, u in enumerate(user_list)}
idx_to_user = {i: u for u, i in user_to_idx.items()}

book_list = df_filtered['book_id'].unique()
book_to_idx = {b: i for i, b in enumerate(book_list)}
idx_to_book = {i: b for b, i in book_to_idx.items()}

df_filtered['user_idx'] = df_filtered['user_id'].map(user_to_idx)
df_filtered['book_idx'] = df_filtered['book_id'].map(book_to_idx)

num_users = len(user_list)
num_books = len(book_list)

print("Users:", num_users, "| Books:", num_books)


Filtered dataset: (100000, 5)
Users: 81629 | Books: 48357


In [20]:
df_filtered

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed,user_idx,book_idx
0,399226,4461,1,3,0,0,0
1,100895,2210,1,4,0,1,1
2,216054,271276,1,3,0,2,2
3,70069,240617,1,2,0,3,3
4,346123,86648,1,3,0,4,4
...,...,...,...,...,...,...,...
99995,413815,13699,1,4,0,81625,6489
99996,306023,77534,1,5,0,81626,44884
99997,119926,6868,1,5,0,81627,7455
99998,214823,270029,1,2,0,81628,48355


In [21]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pickle

def prepare_data(df):
    #create mappings
    user_to_book = df.groupby('user_idx')['book_idx'].apply(list).to_dict()
    book_to_user = df.groupby('book_idx')['user_idx'].apply(list).to_dict()
    
    #create rating lookup - using tuples as keys
    user_book_to_rating = df.set_index(['user_idx', 'book_idx'])['rating'].to_dict()
    
    #calculate user means vectorized
    user_mean = df.groupby('user_idx')['rating'].mean().to_dict()
    
    #calculate book means vectorized
    book_mean = df.groupby('book_idx')['rating'].mean().to_dict()
    
    return user_to_book, book_to_user, user_book_to_rating, user_mean, book_mean


def compute_similarity_matrix_vectorized(df, min_common=5):
    #get user means
    user_mean = df.groupby('user_idx')['rating'].mean()
    
    #center ratings by subtracting user mean
    df_centered = df.copy()
    df_centered['rating_centered'] = df_centered.apply(
        lambda row: row['rating'] - user_mean[row['user_idx']], axis=1
    )
    
    #create user-book matrix with centered ratings
    n_users = df['user_idx'].max() + 1
    n_books = df['book_idx'].max() + 1
    
    #create sparse matrix (books x users)
    book_user_matrix = csr_matrix(
        (df_centered['rating_centered'].values, 
         (df_centered['book_idx'].values, df_centered['user_idx'].values)),
        shape=(n_books, n_users)
    )
    
    #compute cosine similarity between books
    #this is adjusted cosine since we centered by user means
    similarity_matrix = cosine_similarity(book_user_matrix, dense_output=False)
    
    #apply min_common constraint
    #count common users for each book pair
    book_user_binary = csr_matrix(
        (np.ones(len(df)), 
         (df['book_idx'].values, df['user_idx'].values)),
        shape=(n_books, n_users)
    )
    common_users_count = book_user_binary.dot(book_user_binary.T)
    
    #zero out similarities where common users < min_common
    similarity_matrix = similarity_matrix.multiply(common_users_count >= min_common)
    
    #set diagonal to 1
    similarity_matrix.setdiag(1)
    
    return similarity_matrix.toarray()


def predict_rating_vectorized(user_idx, target_book, user_to_book, 
                              user_book_to_rating, book_mean, 
                              similarity_matrix, k=25):
    #get books rated by user
    rated_books = np.array(user_to_book.get(user_idx, []))
    
    if len(rated_books) == 0:
        return book_mean.get(target_book, 3.0)  # Default rating
    
    #get similarities for target book with all rated books
    similarities = similarity_matrix[target_book, rated_books]
    
    # srt by absolute similarity and take top k
    top_k_indices = np.argsort(np.abs(similarities))[-k:][::-1]
    neighbor_books = rated_books[top_k_indices]
    neighbor_sims = similarities[top_k_indices]
    
    # Vectorized computation of weighted prediction
    ratings = np.array([user_book_to_rating.get((user_idx, book), 0) 
                       for book in neighbor_books])
    base_ratings = np.array([book_mean.get(book, 0) for book in neighbor_books])
    deviations = ratings - base_ratings
    
    numerator = np.sum(neighbor_sims * deviations)
    denominator = np.sum(np.abs(neighbor_sims))
    
    if denominator == 0:
        return book_mean.get(target_book, 3.0)
    
    prediction = book_mean.get(target_book, 3.0) + numerator / denominator
    return prediction


def generate_recommendations_vectorized(target_user_idx, df, similarity_matrix, 
                                       user_to_book, book_to_user, 
                                       user_book_to_rating, book_mean, 
                                       k=25, top_n=5):
    #get all books
    all_books = set(book_to_user.keys())
    rated_by_user = set(user_to_book.get(target_user_idx, []))
    candidate_books = list(all_books - rated_by_user)
    
    #predict ratings for all candidate books
    predictions = []
    for book_idx in candidate_books:
        pred_rating = predict_rating_vectorized(
            target_user_idx, book_idx, user_to_book, 
            user_book_to_rating, book_mean, similarity_matrix, k
        )
        predictions.append((book_idx, pred_rating))
    
    #sort and return top N
    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:top_n]

In [32]:
user_to_book, book_to_user, user_book_to_rating, user_mean, book_mean = prepare_data(df_filtered)
#compute similarity matrix
print("Computing similarity matrix...")
similarity_matrix = compute_similarity_matrix_vectorized(df_filtered,
min_common=5)

target_user_idx = 25 
print(f"\nGenerating recommendations for user {target_user_idx}...")
top_5 = generate_recommendations_vectorized(
 target_user_idx, df_filtered, similarity_matrix,
 user_to_book, book_to_user, user_book_to_rating, book_mean,
 k=25, top_n=5
)
print("\nTop 5 Recommendations for User", target_user_idx, ":")
for book_idx, pred_rating in top_5:
 title = get_book_title(book_idx)
 print(f"Title: {title} | Predicted Rating: {pred_rating:.2f}")

Computing similarity matrix...

Generating recommendations for user 25...

Top 5 Recommendations for User 25 :
Title: Alas, Babylon | Predicted Rating: 5.00
Title: The Strange Case of Dr. Jekyll and Mr. Hyde | Predicted Rating: 5.00
Title: Island Beneath the Sea | Predicted Rating: 5.00
Title: Φρικαντέλα: Η μάγισσα που μισούσε τα κάλαντα | Predicted Rating: 5.00
Title: The Sky Is Everywhere | Predicted Rating: 5.00


In [33]:
target_user_idx = 125 
print(f"\nGenerating recommendations for user {target_user_idx}...")
top_5 = generate_recommendations_vectorized(
 target_user_idx, df_filtered, similarity_matrix,
 user_to_book, book_to_user, user_book_to_rating, book_mean,
 k=25, top_n=5
)
print("\nTop 5 Recommendations for User", target_user_idx, ":")
for book_idx, pred_rating in top_5:
 title = get_book_title(book_idx)
 print(f"Title: {title} | Predicted Rating: {pred_rating:.2f}")


Generating recommendations for user 125...

Top 5 Recommendations for User 125 :
Title: Alas, Babylon | Predicted Rating: 5.00
Title: The Strange Case of Dr. Jekyll and Mr. Hyde | Predicted Rating: 5.00
Title: Island Beneath the Sea | Predicted Rating: 5.00
Title: Φρικαντέλα: Η μάγισσα που μισούσε τα κάλαντα | Predicted Rating: 5.00
Title: The Sky Is Everywhere | Predicted Rating: 5.00


In [34]:
#book_id_csv to book_id (real Goodreads ID)
csv_to_real = dict(zip(book_id_map['book_id_csv'], book_id_map['book_id']))

#book_id to title
realid_to_title = dict(zip(goodreads_books['book_id'], goodreads_books['title']))

def get_book_title(internal_idx):
    #internal index to book_id_csv
    book_id_csv = idx_to_book[internal_idx]

    #book_id_csv to real Goodreads book_id
    real_book_id = csv_to_real.get(book_id_csv)
    if real_book_id is None:
        return f"Unknown Book (csv:{book_id_csv})"

    #book_id to title lookup
    title = realid_to_title.get(real_book_id)
    if title is None:
        return f"Unknown Title ({real_book_id})"

    return title

In [35]:
print(book_id_map.head())
book_id_map.tail()

   book_id_csv   book_id
0            0  34684622
1            1  34536488
2            2  34017076
3            3     71730
4            4  30422361


Unnamed: 0,book_id_csv,book_id
2360645,2360645,19517100
2360646,2360646,18597299
2360647,2360647,18584882
2360648,2360648,18518801
2360649,2360649,18518607


In [36]:
book_id_map[book_id_map['book_id_csv'] == 59071]

Unnamed: 0,book_id_csv,book_id
59071,59071,38169


In [37]:
bookid_to_title[38169]

'Alas, Babylon'

In [38]:
def recommend_user_books(user_id, top_n=100):
    uidx = user_to_idx.get(user_id)
    if uidx is None:
        return []

    rated = set(R[uidx].nonzero()[1])
    candidates = [b for b in range(num_books) if b not in rated]
    
    preds = []
    for b in candidates:
        pr = predict_rating_fast(uidx, b, k=25)
        if pr is not None:
            preds.append((b, pr))
    
    preds.sort(key=lambda x: -x[1])
    return preds[:top_n]
    
def precision_at_k(recommended, relevant, k=10):
    rec_k = [r[0] for r in recommended[:k]]
    hits = sum(1 for x in rec_k if x in relevant)
    return hits / k
def recall_at_k(recommended, relevant, k=10):
    rec_k = [r[0] for r in recommended[:k]]
    hits = sum(1 for x in rec_k if x in relevant)
    return hits / len(relevant) if relevant else 0
def f1_at_k(recommended, relevant, k=10):
    p = precision_at_k(recommended, relevant, k)
    r = recall_at_k(recommended, relevant, k)
    return 2 * p * r / (p + r) if (p+r) > 0 else 0
def average_precision(recommended, relevant):
    hits = 0
    precisions = []
    for i, (b, _) in enumerate(recommended, start=1):
        if b in relevant:
            hits += 1
            precisions.append(hits / i)
    return sum(precisions) / len(relevant) if relevant else 0
def reciprocal_rank(recommended, relevant):
    for i, (b, _) in enumerate(recommended, start=1):
        if b in relevant:
            return 1 / i
    return 0
def reciprocal_rank(recommended, relevant):
    for i, (b, _) in enumerate(recommended, start=1):
        if b in relevant:
            return 1 / i
    return 0


In [39]:
example_user_idx = df_filtered['user_idx'].iloc[0]

recommended = generate_recommendations_vectorized(
    example_user_idx,
    df_filtered,
    similarity_matrix,
    user_to_book,
    book_to_user,
    user_book_to_rating,
    book_mean,
    k=25,
    top_n=50
)

user_rows = df_filtered[df_filtered['user_idx'] == example_user_idx]
relevant = set(user_rows[user_rows['rating'] >= 4]['book_idx'].values)

print("Precision@5:", precision_at_k(recommended, relevant, k=5))
print("Recall@5:", recall_at_k(recommended, relevant, k=5))
print("F1@5:", f1_at_k(recommended, relevant, k=5))
print("MAP:", average_precision(recommended, relevant))
print("MRR:", reciprocal_rank(recommended, relevant))


Precision@5: 0.0
Recall@5: 0
F1@5: 0
MAP: 0
MRR: 0


In [40]:
for b, p in recommended[:10]:
    print("Pred:", p, "| book_idx:", b, "| title:", get_book_title(b))


Pred: 5.0 | book_idx: 7 | title: Alas, Babylon
Pred: 5.0 | book_idx: 11 | title: The Strange Case of Dr. Jekyll and Mr. Hyde
Pred: 5.0 | book_idx: 14 | title: Island Beneath the Sea
Pred: 5.0 | book_idx: 19 | title: Φρικαντέλα: Η μάγισσα που μισούσε τα κάλαντα
Pred: 5.0 | book_idx: 23 | title: The Sky Is Everywhere
Pred: 5.0 | book_idx: 28 | title: Blood Hunter (The Vampire's Mage #3)
Pred: 5.0 | book_idx: 38 | title: Shrinking Violet
Pred: 5.0 | book_idx: 42 | title: Rock Chick Regret (Rock Chick, #7)
Pred: 5.0 | book_idx: 44 | title: Saving Evangeline
Pred: 5.0 | book_idx: 51 | title: Drantos (VLG, #1)


In [41]:
print("User idx:", example_user_idx)

user_rows = df_filtered[df_filtered['user_idx'] == example_user_idx]
print("User rated books:", len(user_rows))

relevant = set(user_rows[user_rows['rating'] >= 4]['book_idx'])

print("Relevant books (#rating>=4):", len(relevant))
print("Relevant book_idx values:", list(relevant)[:10])


User idx: 0
User rated books: 1
Relevant books (#rating>=4): 0
Relevant book_idx values: []


In [42]:
dense_users = df_filtered['user_idx'].value_counts()
print(dense_users.head(20))


user_idx
4205     24
4952     13
10632    12
11164    11
4158     11
3969     10
2455     10
7122     10
3472     10
3669     10
12139     9
11878     9
4017      9
4692      9
12065     9
16132     9
481       9
3452      8
9060      8
10808     8
Name: count, dtype: int64


In [43]:
example_user_idx = dense_users.index[0]   #the densest user
print("Evaluating user:", example_user_idx)


Evaluating user: 4205


In [44]:
user_rows = df_filtered[df_filtered['user_idx'] == example_user_idx]

relevant = set(user_rows[user_rows['rating'] >= 4]['book_idx'])

print("Relevant count:", len(relevant))

Relevant count: 24


In [45]:
example_user_idx = 4205

user_rows = df_filtered[df_filtered['user_idx'] == example_user_idx]
relevant = set(user_rows[user_rows['rating'] >= 4]['book_idx'])

print("User:", example_user_idx)
print("Total ratings:", len(user_rows))
print("Relevant (rating >= 4):", len(relevant))
print("Relevant books:", list(relevant)[:10])

recommended = recommend_user_books(example_user_idx, top_n=50)

print("\nTop recommendations with titles:")
for book_idx, score in recommended[:10]:
    print(book_idx, score, "|", get_book_title(book_idx))

# Convert recommendations list into ONLY book_idx
recommended_books = [b for b, s in recommended]

print("Precision@5:", precision_at_k(recommended_books, relevant, k=5))
print("Recall@5:", recall_at_k(recommended_books, relevant, k=5))
print("F1@5:", f1_at_k(recommended_books, relevant, k=5))
print("MAP:", average_precision(recommended_books, relevant))
print("MRR:", reciprocal_rank(recommended_books, relevant))


User: 4205
Total ratings: 24
Relevant (rating >= 4): 24
Relevant books: [3589, 17285, 35596, 13710, 16655, 1939, 17942, 25623, 2593, 32929]

Top recommendations with titles:
Precision@5: 0.0
Recall@5: 0.0
F1@5: 0
MAP: 0.0
MRR: 0


In [46]:
print("Non-zero similarities:", (similarity_matrix != 0).sum())
print("Matrix shape:", similarity_matrix.shape)

Non-zero similarities: 48357
Matrix shape: (48357, 48357)


In [47]:
interactions.head()

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
0,0,948,1,5,0
1,0,947,1,5,1
2,0,946,1,5,0
3,0,945,1,5,0
4,0,944,1,5,0


In [48]:
interactions.shape

(228648342, 5)

In [49]:
import pandas as pd

path = "/kaggle/input/interactions/goodreads_interactions.csv"

chunksize = 5_000_000

user_counts = {}
book_counts = {}

for chunk in pd.read_csv(path, chunksize=chunksize):
    for uid, count in chunk['user_id'].value_counts().items():
        user_counts[uid] = user_counts.get(uid, 0) + count
    for bid, count in chunk['book_id'].value_counts().items():
        book_counts[bid] = book_counts.get(bid, 0) + count

print("Users counted:", len(user_counts))
print("Books counted:", len(book_counts))


Users counted: 876145
Books counted: 2360650


In [50]:
interactions.to_parquet("interactions_fast.parquet")

print("Saved as fast parquet file!")

Saved as fast parquet file!


In [2]:
import pandas as pd
interactions = pd.read_parquet("interactions_fast.parquet")

In [3]:
import pandas as pd
import pyarrow.parquet as pq

path = "/kaggle/working/interactions_fast.parquet"   

pq_file = pq.ParquetFile(path)

user_counts = {}
book_counts = {}

print("Counting interactions...")

for i in range(pq_file.num_row_groups):
    chunk = pq_file.read_row_group(i).to_pandas()

    #count user interactions
    uc = chunk['user_id'].value_counts()
    for uid, count in uc.items():
        user_counts[uid] = user_counts.get(uid, 0) + count

    #count book interactions
    bc = chunk['book_id'].value_counts()
    for bid, count in bc.items():
        book_counts[bid] = book_counts.get(bid, 0) + count

    print(f"Processed row group {i+1}/{pq_file.num_row_groups}")

print("\nUsers counted:", len(user_counts))
print("Books counted:", len(book_counts))


Counting interactions...
Processed row group 1/219
Processed row group 2/219
Processed row group 3/219
Processed row group 4/219
Processed row group 5/219
Processed row group 6/219
Processed row group 7/219
Processed row group 8/219
Processed row group 9/219
Processed row group 10/219
Processed row group 11/219
Processed row group 12/219
Processed row group 13/219
Processed row group 14/219
Processed row group 15/219
Processed row group 16/219
Processed row group 17/219
Processed row group 18/219
Processed row group 19/219
Processed row group 20/219
Processed row group 21/219
Processed row group 22/219
Processed row group 23/219
Processed row group 24/219
Processed row group 25/219
Processed row group 26/219
Processed row group 27/219
Processed row group 28/219
Processed row group 29/219
Processed row group 30/219
Processed row group 31/219
Processed row group 32/219
Processed row group 33/219
Processed row group 34/219
Processed row group 35/219
Processed row group 36/219
Processed ro

In [4]:
MIN_USER = 300
MIN_BOOK = 300

good_users = {u for u, c in user_counts.items() if c >= MIN_USER}
good_books = {b for b, c in book_counts.items() if c >= MIN_BOOK}

print("Selected dense users:", len(good_users))
print("Selected dense books:", len(good_books))


Selected dense users: 195534
Selected dense books: 102859


In [5]:
import pyarrow as pa
import pyarrow.parquet as pq

output_path = "goodreads_dense_subset.parquet"

writer = None

print("\nBuilding dense subset...")

for i in range(pq_file.num_row_groups):
    chunk = pq_file.read_row_group(i).to_pandas()
    #filter
    dense_chunk = chunk[
        chunk['user_id'].isin(good_users) &
        chunk['book_id'].isin(good_books)
    ]
    #convert to arrow table
    table = pa.Table.from_pandas(dense_chunk, preserve_index=False)
    #create writer on first batch
    if writer is None:
        writer = pq.ParquetWriter(output_path, table.schema)
    writer.write_table(table)

    print(f"Added dense rows from row group {i+1}/{pq_file.num_row_groups}")

if writer:
    writer.close()

print("\nDense subset saved to:", output_path)


Building dense subset...
Added dense rows from row group 1/219
Added dense rows from row group 2/219
Added dense rows from row group 3/219
Added dense rows from row group 4/219
Added dense rows from row group 5/219
Added dense rows from row group 6/219
Added dense rows from row group 7/219
Added dense rows from row group 8/219
Added dense rows from row group 9/219
Added dense rows from row group 10/219
Added dense rows from row group 11/219
Added dense rows from row group 12/219
Added dense rows from row group 13/219
Added dense rows from row group 14/219
Added dense rows from row group 15/219
Added dense rows from row group 16/219
Added dense rows from row group 17/219
Added dense rows from row group 18/219
Added dense rows from row group 19/219
Added dense rows from row group 20/219
Added dense rows from row group 21/219
Added dense rows from row group 22/219
Added dense rows from row group 23/219
Added dense rows from row group 24/219
Added dense rows from row group 25/219
Added de

In [6]:
df_dense = pd.read_parquet("/kaggle/working/goodreads_dense_subset.parquet")

print("Final Dense Subset Stats:")
print("Rows:", len(df_dense))
print("Users:", df_dense['user_id'].nunique())
print("Books:", df_dense['book_id'].nunique())


Final Dense Subset Stats:
Rows: 117874915
Users: 195528
Books: 102859


In [7]:
import pandas as pd
import pyarrow.parquet as pq

INTERACTIONS_PARQUET = "/kaggle/working/interactions_fast.parquet"

pq_file = pq.ParquetFile(INTERACTIONS_PARQUET)

book_counts = {}

print("Counting book frequencies...")

for i in range(pq_file.num_row_groups):
    chunk = pq_file.read_row_group(i).to_pandas()
    vc = chunk['book_id'].value_counts()
    for bid, cnt in vc.items():
        book_counts[bid] = book_counts.get(bid, 0) + cnt
    print(f"Processed row group {i+1}/{pq_file.num_row_groups}")

print("Total distinct books:", len(book_counts))


Counting book frequencies...
Processed row group 1/219
Processed row group 2/219
Processed row group 3/219
Processed row group 4/219
Processed row group 5/219
Processed row group 6/219
Processed row group 7/219
Processed row group 8/219
Processed row group 9/219
Processed row group 10/219
Processed row group 11/219
Processed row group 12/219
Processed row group 13/219
Processed row group 14/219
Processed row group 15/219
Processed row group 16/219
Processed row group 17/219
Processed row group 18/219
Processed row group 19/219
Processed row group 20/219
Processed row group 21/219
Processed row group 22/219
Processed row group 23/219
Processed row group 24/219
Processed row group 25/219
Processed row group 26/219
Processed row group 27/219
Processed row group 28/219
Processed row group 29/219
Processed row group 30/219
Processed row group 31/219
Processed row group 32/219
Processed row group 33/219
Processed row group 34/219
Processed row group 35/219
Processed row group 36/219
Processe

In [8]:
TOP_N_BOOKS = 5000 

top_books = sorted(book_counts.items(), key=lambda x: x[1], reverse=True)[:TOP_N_BOOKS]
top_book_ids = {bid for bid, cnt in top_books}

print("Top-N books selected:", len(top_book_ids))


Top-N books selected: 5000


In [9]:
import pyarrow as pa
import pyarrow.parquet as pq

output_path_top = "interactions_top_books.parquet"

writer = None

print("Filtering interactions to only top books...")

for i in range(pq_file.num_row_groups):
    chunk = pq_file.read_row_group(i).to_pandas()
    sub = chunk[chunk['book_id'].isin(top_book_ids)]

    if sub.empty:
        continue

    table = pa.Table.from_pandas(sub, preserve_index=False)
    if writer is None:
        writer = pq.ParquetWriter(output_path_top, table.schema)
    writer.write_table(table)

    print(f"Added rows from group {i+1}/{pq_file.num_row_groups}")

if writer is not None:
    writer.close()

print("Saved top-book subset to:", output_path_top)


Filtering interactions to only top books...
Added rows from group 1/219
Added rows from group 2/219
Added rows from group 3/219
Added rows from group 4/219
Added rows from group 5/219
Added rows from group 6/219
Added rows from group 7/219
Added rows from group 8/219
Added rows from group 9/219
Added rows from group 10/219
Added rows from group 11/219
Added rows from group 12/219
Added rows from group 13/219
Added rows from group 14/219
Added rows from group 15/219
Added rows from group 16/219
Added rows from group 17/219
Added rows from group 18/219
Added rows from group 19/219
Added rows from group 20/219
Added rows from group 21/219
Added rows from group 22/219
Added rows from group 23/219
Added rows from group 24/219
Added rows from group 25/219
Added rows from group 26/219
Added rows from group 27/219
Added rows from group 28/219
Added rows from group 29/219
Added rows from group 30/219
Added rows from group 31/219
Added rows from group 32/219
Added rows from group 33/219
Added ro

In [10]:
import pandas as pd
df_small = pd.read_parquet('/kaggle/working/interactions_top_books.parquet')
print("Shape after book filter:", df_small.shape)
print("Users:", df_small['user_id'].nunique())
print("Books:", df_small['book_id'].nunique())

MIN_USER_RATINGS = 500

# Keep only users with at least MIN_USER_RATINGS in this subset
good_users = df_small['user_id'].value_counts()
good_users = good_users[good_users >= MIN_USER_RATINGS].index

df_small = df_small[df_small['user_id'].isin(good_users)].copy()

print("Final CF dataset:")
print("Rows:", len(df_small))
print("Users:", df_small['user_id'].nunique())
print("Books:", df_small['book_id'].nunique())

# Save for future runs
df_small.to_parquet("goodreads_itemcf_small.parquet")


Shape after book filter: (76886953, 5)
Users: 831359
Books: 5000
Final CF dataset:
Rows: 10268687
Users: 14477
Books: 5000


In [11]:
import pandas as pd

df = pd.read_parquet("/kaggle/working/goodreads_itemcf_small.parquet")

print(df.shape)
df.head()

(10268687, 5)


Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
788,5,7071,0,0,0
789,5,7061,1,5,0
790,5,7057,1,5,0
791,5,7051,0,0,0
792,5,1611,0,0,0


In [12]:
df['user_idx'] = df['user_id'].astype('category').cat.codes
df['book_idx'] = df['book_id'].astype('category').cat.codes

user_to_id = dict(zip(df['user_idx'], df['user_id']))
book_to_id = dict(zip(df['book_idx'], df['book_id']))


In [13]:
from scipy.sparse import csr_matrix

num_users = df['user_idx'].nunique()
num_books = df['book_idx'].nunique()

R = csr_matrix(
    (df['rating'], (df['book_idx'], df['user_idx'])),
    shape=(num_books, num_users)
)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

print("Computing item-item similarity...")
user_means = df.groupby('user_idx')['rating'].mean()
df['rating_centered'] = df['rating'] - df['user_idx'].map(user_means)

R_centered = csr_matrix(
    (df['rating_centered'], (df['book_idx'], df['user_idx'])),
    shape=(num_books, num_users)
)

item_sim_dense = cosine_similarity(R_centered, dense_output=True)

import pickle
pickle.dump(item_sim_dense, open("item_similarity.pkl", "wb"))


Computing item-item similarity...


In [15]:
import numpy as np

def recommend_for_user(uidx, top_k=20, top_n=10):
    # Books user interacted with
    rated_books = R[:, uidx].nonzero()[0]

    if len(rated_books) == 0:
        return []
    scores = item_sim_dense[rated_books].sum(axis=0).A1
    scores[rated_books] = -np.inf
    top_books = np.argsort(scores)[-top_n:][::-1]

    return list(top_books)


In [16]:
def recommend_similar_books(book_idx, top_n=10):
    sims = np.asarray(item_sim_dense[book_idx]).ravel()
    sims[book_idx] = -np.inf
    top_books = np.argsort(sims)[-top_n:][::-1]
    return list(top_books)


In [17]:
df.head()

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed,user_idx,book_idx,rating_centered
788,5,7071,0,0,0,0,1504,-0.201111
789,5,7061,1,5,0,0,1503,4.798889
790,5,7057,1,5,0,0,1502,4.798889
791,5,7051,0,0,0,0,1501,-0.201111
792,5,1611,0,0,0,0,628,-0.201111


In [18]:
book_idx = df[df['book_id'] == 7071]['book_idx'].iloc[0]  
similar = recommend_similar_books(book_idx, 10)
similar


[1289, 1229, 973, 1709, 972, 1196, 1502, 1703, 601, 3392]

In [19]:
def hybrid_recommend(uidx, liked_book_idx, alpha=0.7, top_n=10):
    rated_books = R[:, uidx].nonzero()[0]
    
    if len(rated_books) > 0:
        user_scores = item_sim_dense[rated_books].sum(axis=0)
        user_scores = np.asarray(user_scores).ravel()   # <-- fix
    else:
        user_scores = np.zeros(num_books)

    #feedback CF (similar books to liked_book_idx)
    feedback_scores = item_sim_dense[liked_book_idx].ravel()
    #Combine
    combined = alpha * user_scores + (1 - alpha) * feedback_scores

    #Remove rated books + the liked book itself
    combined[rated_books] = -np.inf
    combined[liked_book_idx] = -np.inf

    #Top-N
    top_books = np.argsort(combined)[-top_n:][::-1]
    return list(top_books)


In [20]:
uidx = 25
liked_book_idx = 133  # user entered "I enjoyed book X"

hybrid = hybrid_recommend(uidx, liked_book_idx, alpha=0.7)
hybrid


[3499, 1499, 3646, 1512, 604, 1803, 1511, 1451, 4539, 1498]

In [21]:
def get_title_from_bookidx(book_idx):
    real_id = book_to_id[book_idx]
    return bookid_to_title.get(real_id, f"Unknown Title ({real_id})")


In [22]:
import pickle

TITLE_PATH = "/kaggle/working/bookid_to_title.pkl"

with open(TITLE_PATH, "rb") as f:
    bookid_to_title = pickle.load(f)

print("Loaded title dictionary with", len(bookid_to_title), "entries.")


Loaded title dictionary with 2360655 entries.


In [25]:
book_idx_to_bookid = dict(zip(book_id_map['book_id_csv'], book_id_map['book_id']))


In [26]:
def get_title_from_idx(book_idx):
    bookid = book_idx_to_bookid.get(book_idx)
    if bookid in bookid_to_title:
        return bookid_to_title[bookid]

    return f"Unknown Title ({book_idx})"


In [27]:
def hybrid_recommend_titles(uidx, liked_book_idx, alpha=0.7, top_n=10):
    # CF base recommendation
    cf_scores = item_sim_dense[uidx].ravel()

    feedback_scores = item_sim_dense[liked_book_idx].ravel()
    hybrid_score = alpha * cf_scores + (1 - alpha) * feedback_scores
    top_books = hybrid_score.argsort()[::-1][:top_n]

    results = []
    for bidx in top_books:
        title = get_title_from_idx(bidx)
        score = hybrid_score[bidx]
        results.append((bidx, title, score))

    return results


In [28]:
for b in [10, 50, 100, 500, 1000]:
    print(b, np.sum(item_sim_dense[b]))


10 -23.758877326166903
50 22.751487196038568
100 21.681079529804826
500 -21.659996569159123
1000 14.101559512086425


In [29]:
uidx = 30
liked_book_idx = 132  # book user clicked “I like this”

recommendations = hybrid_recommend_titles(uidx, liked_book_idx, alpha=0.7, top_n=5)

print("\nHybrid Recommendations WITH Titles:\n")
for bidx, title, score in recommendations:
    print(f"{bidx}: {title} (Score: {score:.4f})")



Hybrid Recommendations WITH Titles:

30: The Sympathizer (Score: 0.6908)
132: The Intelligent Investor (Score: 0.2785)
1748: Burnt Sugar (Firebug, #0.5) (Score: 0.1324)
1813: Combustion Hour (Score: 0.1090)
1761: Shade (Jumper, #2.5) (Score: 0.1055)


In [30]:
def precision_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    hits = sum(1 for item in recommended_k if item in relevant)
    return hits / k
def recall_at_k(recommended, relevant, k):
    if len(relevant) == 0:
        return 0
    recommended_k = recommended[:k]
    hits = sum(1 for item in recommended_k if item in relevant)
    return hits / len(relevant)
def f1_at_k(recommended, relevant, k):
    p = precision_at_k(recommended, relevant, k)
    r = recall_at_k(recommended, relevant, k)
    if p + r == 0:
        return 0
    return 2 * p * r / (p + r)
def average_precision(recommended, relevant):
    if len(relevant) == 0:
        return 0

    score = 0
    hits = 0

    for i, item in enumerate(recommended, 1):
        if item in relevant:
            hits += 1
            score += hits / i

    return score / len(relevant)
def reciprocal_rank(recommended, relevant):
    for i, item in enumerate(recommended, 1):
        if item in relevant:
            return 1 / i
    return 0
from scipy.stats import kendalltau

def kendall_tau(recommended, ground_truth_ranked):
    # Convert both lists into ranks over same items
    if len(recommended) != len(ground_truth_ranked):
        L = min(len(recommended), len(ground_truth_ranked))
        recommended = recommended[:L]
        ground_truth_ranked = ground_truth_ranked[:L]
    return kendalltau(recommended, ground_truth_ranked).correlation


In [33]:
uidx = 4205  
liked_book_idx = 133
hybrid_recs = hybrid_recommend(uidx, liked_book_idx, alpha=0.7, top_n=50)

# extract only book_idx list
recommended = [b for b, _, _ in hybrid_recommend_titles(uidx, liked_book_idx, 0.7, 50)]
recommended

[4205,
 133,
 307,
 182,
 309,
 304,
 1927,
 305,
 318,
 308,
 258,
 1674,
 139,
 235,
 241,
 433,
 1660,
 324,
 110,
 2723,
 3817,
 234,
 2218,
 200,
 159,
 357,
 1621,
 2726,
 310,
 1656,
 3276,
 245,
 2724,
 2725,
 505,
 339,
 3383,
 264,
 2202,
 2211,
 1658,
 113,
 351,
 198,
 127,
 4822,
 246,
 199,
 175,
 3464]

In [37]:
print("Precision@5:", precision_at_k(recommended, relevant, 5))
print("Recall@5:", recall_at_k(recommended, relevant, 5))
print("F1@5:", f1_at_k(recommended, relevant, 5))

print("MAP:", average_precision(recommended, relevant))
print("MRR:", reciprocal_rank(recommended, relevant))


Precision@5: 0.0
Recall@5: 0
F1@5: 0
MAP: 0
MRR: 0
