In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [4]:
# Load data
df = pd.read_csv("../datasets/joined_complete_info.csv")
df = df.drop_duplicates(subset=['join_title', 'book_desc'], keep='first')
print('columns', df.columns)

df = df[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication',
         'Publisher', 'book_desc', 'book_edition', 'book_format', 'book_pages', 
         'book_rating_count', 'genres', 'book_price', 'language_code', 'join_title']]

df.set_index('join_title', inplace=True)

print(df['book_desc'].head(10))
print("NAs:", df['book_desc'].isna().sum()) # drop? 
print("Empty rows:", (df['book_desc'].str.strip() == "").sum())

df.to_csv('../datasets/book_info_with_tfidf.csv', index=True)

columns Index(['User-ID', 'ISBN', 'Book-Rating', 'User_Location', 'User_Age',
       'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'join_title', 'book_desc', 'book_edition', 'book_format', 'book_pages',
       'book_rating', 'book_rating_count', 'book_review_count', 'genres',
       'book_price', 'language_code'],
      dtype='object')
join_title
a painted house                                    Until that September of 1952, Luke Chandler ha...
lightning                                           "A gripping novel" from the #1 New York Times...
the dark half                                      Thad Beaumont would like to say he is innocent...
anna karenina                                      Acclaimed by many as the world's greatest nove...
the picture of dorian gray                         ï»¿Written in his distinctively dazzling manner,...
alaska                                             In this sweeping epic of the northernmost Amer...
the wonderful sto

In [7]:
desc_col = "book_desc" 
df[desc_col] = df[desc_col].fillna("")

# Fit TF-IDF on ALL descriptions
vectorizer = TfidfVectorizer(
    max_features=8000,
    stop_words="english",
    ngram_range=(1, 1),
)

tfidf_matrix = vectorizer.fit_transform(df[desc_col])

df["TFIDF_Embeddings_Vocab_8000"] = [
    tfidf_matrix[i].toarray().flatten()
    for i in range(tfidf_matrix.shape[0])
]

for i in range(10):
    print(tfidf_matrix[i].count_nonzero())

57
34
27
58
52
95
42
67
50
50


In [8]:
unmasked_df = pd.read_csv("../datasets/train_edges.csv")

user_books = unmasked_df.groupby('User-ID').apply(
    lambda x: list(zip(x['join_title'], x['Book-Rating']))
).to_dict()

  user_books = unmasked_df.groupby('User-ID').apply(


In [9]:
user_to_embed = {}
for user_id, books_ratings in user_books.items():

    high_rated_unmasked = [(join_title, rating) for join_title, rating in books_ratings if rating > 7]

    if len(high_rated_unmasked) == 0:
        continue

    print(f"User {user_id} has rated {len(high_rated_unmasked)} books highly.")

    unmasked_indices = [df.index.get_loc(join_title) for join_title, _ in high_rated_unmasked]
    user_embedding = tfidf_matrix[unmasked_indices].mean(axis=0)
    user_to_embed[user_id] = np.asarray(user_embedding).flatten()

user_embeddings_df = pd.DataFrame.from_dict(
    user_to_embed, 
    orient='index'
)
user_embeddings_df.index.name = 'User-ID'
user_embeddings_df.to_csv('../datasets/user_tfidf_embeddings.csv', index=True)

User 26 has rated 1 books highly.
User 51 has rated 1 books highly.
User 53 has rated 1 books highly.
User 91 has rated 1 books highly.
User 114 has rated 1 books highly.
User 183 has rated 1 books highly.
User 185 has rated 1 books highly.
User 242 has rated 1 books highly.
User 243 has rated 1 books highly.
User 244 has rated 2 books highly.
User 254 has rated 5 books highly.
User 256 has rated 1 books highly.
User 387 has rated 1 books highly.
User 388 has rated 1 books highly.
User 413 has rated 1 books highly.
User 493 has rated 1 books highly.
User 500 has rated 1 books highly.
User 507 has rated 1 books highly.
User 595 has rated 1 books highly.
User 638 has rated 6 books highly.
User 651 has rated 1 books highly.
User 685 has rated 1 books highly.
User 735 has rated 1 books highly.
User 746 has rated 1 books highly.
User 763 has rated 1 books highly.
User 769 has rated 1 books highly.
User 777 has rated 1 books highly.
User 805 has rated 4 books highly.
User 828 has rated 1 boo

In [10]:
recommendations = {}
k = 50

for user_id, user_embedding in user_to_embed.items():
    # reshape user embedding for cosine_similarity (needs 2D array)
    user_emb_2d = user_embedding.reshape(1, -1)
    
    # compute cosine similarity between user and all books
    similarities = cosine_similarity(user_emb_2d, tfidf_matrix).flatten()
    
    # get the unmasked books for this user
    unmasked_titles = {title for title, _ in user_books[user_id]}
    
    # create a list of (similarity, index, title) for books NOT in unmasked
    candidate_books = [
        (similarities[idx], idx, title)
        for idx, title in enumerate(df.index)
        if title not in unmasked_titles
    ] # because unmasked contains books that we know the user has read/rated
    
    # sort by similarity (descending) and get top k
    candidate_books.sort(reverse=True, key=lambda x: x[0])
    top_k = candidate_books[:k]
    
    # Extract just the titles
    recommendations[user_id] = [title for _, _, title in top_k]

# view sample recommendations
sample_user = list(recommendations.keys())[0]
print(f"Top {k} recommendations for user {sample_user}:")
for i, book in enumerate(recommendations[sample_user], 1):
    print(f"{i}. {book}")

Top 50 recommendations for user 26:
1. follow your heart
2. an american childhood
3. three tales
4. savage inequalities: children in america's schools
5. journey to the end of the night
6. hiroshima
7. a tree grows in brooklyn
8. the grapes of wrath
9. the amateur marriage
10. the prophet
11. black boy
12. the art of loving
13. the moviegoer
14. the house on mango street
15. wild swans: three daughters of china
16. autobiography of a yogi
17. endymion
18. the rainmaker
19. the secret garden
20. shopgirl
21. the color purple
22. different seasons
23. a portrait of the artist as a young man
24. how to win friends and influence people
25. youngblood hawke
26. the feminine mystique
27. ferdydurke
28. all about love: new visions
29. black elk speaks: being the life story of a holy man of the oglala sioux
30. katherine
31. angela's ashes
32. never let me go
33. sula
34. arctic dreams
35. cancer ward
36. of mice and men
37. the adolescent
38. main street
39. a personal matter
40. suzanne's di

In [11]:
recommendations

{26: ['follow your heart',
  'an american childhood',
  'three tales',
  "savage inequalities: children in america's schools",
  'journey to the end of the night',
  'hiroshima',
  'a tree grows in brooklyn',
  'the grapes of wrath',
  'the amateur marriage',
  'the prophet',
  'black boy',
  'the art of loving',
  'the moviegoer',
  'the house on mango street',
  'wild swans: three daughters of china',
  'autobiography of a yogi',
  'endymion',
  'the rainmaker',
  'the secret garden',
  'shopgirl',
  'the color purple',
  'different seasons',
  'a portrait of the artist as a young man',
  'how to win friends and influence people',
  'youngblood hawke',
  'the feminine mystique',
  'ferdydurke',
  'all about love: new visions',
  'black elk speaks: being the life story of a holy man of the oglala sioux',
  'katherine',
  "angela's ashes",
  'never let me go',
  'sula',
  'arctic dreams',
  'cancer ward',
  'of mice and men',
  'the adolescent',
  'main street',
  'a personal matter',


In [12]:
recommendations = {str(k): v for k, v in recommendations.items()}
with open('../results/content_based_filtering_recommendations.pkl', 'wb') as f:
    pickle.dump(recommendations, f)

In [13]:
masked_books = pd.read_csv("../datasets/test_edges.csv")

In [14]:
# quick sanity check eval
user_precisions = []
user_recalls = []
total_hits = 0
total_users = 0
total_relevant = 0

for user_id, recommended_books in recommendations.items():

    if int(user_id) not in masked_books['User-ID'].values:
        continue
    
    user_masked_books = masked_books[masked_books['User-ID'] == int(user_id)]
    relevant_books = {row['join_title'] for _, row in user_masked_books.iterrows() if row['Book-Rating'] > 7}
    
    if len(relevant_books) == 0:
        continue
    
    # count how many recommended books are relevant
    user_hits = len(set(recommended_books) & relevant_books)
    total_hits += user_hits
    total_users += 1
    total_relevant += len(relevant_books)
    
    # calculate per-user metrics
    user_precision = user_hits / 20
    user_recall = user_hits / len(relevant_books)
    
    user_precisions.append(user_precision)
    user_recalls.append(user_recall)

# Calculate overall averages
avg_precision = sum(user_precisions) / len(user_precisions) if user_precisions else 0
avg_recall = sum(user_recalls) / len(user_recalls) if user_recalls else 0

print(f"\n=== Overall Results ===")
print(f"Average Precision@20: {avg_precision:.4f}")
print(f"Average Recall@20: {avg_recall:.4f}")
print(f"Total hits: {total_hits}")
print(f"Total users evaluated: {total_users}")
print(f"Total relevant books: {total_relevant}")


=== Overall Results ===
Average Precision@20: 0.0097
Average Recall@20: 0.0886
Total hits: 548
Total users evaluated: 2819
Total relevant books: 6231
