In [1]:
import pandas as pd

# Load the book table from the CSV under ../data. The path is relative, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv("../data/clean_openlibrary_books.csv")
# Display the first rows as this cell's output for a quick visual check.
df.head()

Unnamed: 0,work_key,title,authors,subjects,first_publish_year,cover_url,main_author,clean_subjects
0,/works/OL66554W,Pride and Prejudice,Jane Austen,"Fiction, Romance, Historical, Regency, British...",1813,https://covers.openlibrary.org/b/id/14348537-L...,Jane Austen,"fiction, romance, historical"
1,/works/OL138052W,Alice's Adventures in Wonderland,Lewis Carroll,"Alice (fictitious character : carroll), fictio...",1865,https://covers.openlibrary.org/b/id/10527843-L...,Lewis Carroll,"alice (fictitious character : carroll), fictio..."
2,/works/OL8193416W,The Picture of Dorian Gray,Oscar Wilde,British and irish fiction (fictional works by ...,1890,https://covers.openlibrary.org/b/id/14314858-L...,Oscar Wilde,british and irish fiction (fictional works by ...
3,/works/OL21177W,Wuthering Heights,Emily Brontë,British and irish fiction (fictional works by ...,1846,https://covers.openlibrary.org/b/id/12818862-L...,Emily Brontë,british and irish fiction (fictional works by ...
4,/works/OL8193497W,A Christmas Carol,Charles Dickens,"Ghost stories, Readers, Ebenzer Scrooge (Ficti...",1843,https://covers.openlibrary.org/b/id/13299222-L...,Charles Dickens,"ghost stories, readers, ebenzer scrooge (ficti..."


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build the content-based recommender: one TF-IDF vector per book.

# Each book's document is "<title> <main_author> <subjects>"; a missing field
# contributes an empty string instead of NaN.
df["text"] = (
    df["title"].fillna("")
    .add(" ")
    .add(df["main_author"].fillna(""))
    .add(" ")
    .add(df["subjects"].fillna(""))
)

# Drop English stop words and any term that appears in fewer than two documents.
tfidf = TfidfVectorizer(stop_words="english", min_df=2)
X = tfidf.fit_transform(df["text"])

# Lower-cased title -> row position in df / X. When several rows share a title,
# the last one wins.
title_to_idx = dict(zip(map(str.lower, df["title"].astype(str)), range(len(df))))

def recommend_by_title(title, k=10):
    """Return the books whose TF-IDF text is most similar to a given title.

    Parameters
    ----------
    title : str
        Title of the seed book. It is lower-cased and looked up in
        ``title_to_idx``.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``main_author`` and ``subjects`` columns of the
        recommended rows, ordered by decreasing cosine similarity. The seed
        row itself is never included.

    Raises
    ------
    KeyError
        If the lower-cased ``title`` is not a key of ``title_to_idx``.
    """
    key = title.lower()
    if key not in title_to_idx:
        # Same exception type as a plain dict lookup, but with a readable message.
        raise KeyError(f"Unknown title: {title!r}")
    idx = title_to_idx[key]

    # Cosine similarity between the seed row and every row of X.
    sims = cosine_similarity(X[idx], X).ravel()
    order = sims.argsort()[::-1]

    # Drop the seed by position rather than assuming it sorts first: another
    # row with identical text would tie with it.
    recs = order[order != idx][:k]
    return df.iloc[recs][["title", "main_author", "subjects"]]

In [3]:
# Personalized content — “tell me books like the ones I like”

def recommend_for_likes_content(liked_titles, k=10):
    """Recommend books similar to the average of several liked books.

    The TF-IDF rows of the liked books are averaged into a single profile
    vector, every row of ``X`` is scored by its dot product with that
    profile, and the highest-scoring rows that are not in the liked set are
    returned.

    Parameters
    ----------
    liked_titles : iterable of str
        Titles the user likes. Each is lower-cased and looked up in
        ``title_to_idx``.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``main_author`` and ``subjects`` columns of the
        recommended rows, best score first. Empty when ``liked_titles`` is
        empty. Liked rows are never included, even when ``k`` is larger than
        the number of remaining rows.

    Raises
    ------
    KeyError
        If any title is not a key of ``title_to_idx``.
    """
    # Imported locally because this cell sits above the notebook cell that
    # imports NumPy at module level.
    import numpy as np

    columns = ["title", "main_author", "subjects"]

    liked_titles = list(liked_titles)
    unknown = [t for t in liked_titles if t.lower() not in title_to_idx]
    if unknown:
        raise KeyError(f"Unknown title(s): {unknown}")
    idxs = [title_to_idx[t.lower()] for t in liked_titles]

    if not idxs:
        # No profile can be built from zero rows; averaging them would only
        # produce NaN scores and an arbitrary ranking.
        return df.iloc[[]][columns]

    # Average the liked rows into a dense column vector. np.asarray handles
    # both an np.matrix and a plain ndarray result from the sparse mean.
    profile = np.asarray(X[idxs].mean(axis=0)).reshape(-1, 1)
    scores = np.asarray(X @ profile).ravel()

    liked_mask = np.zeros(scores.shape[0], dtype=bool)
    liked_mask[idxs] = True
    scores[liked_mask] = -np.inf  # push liked books to the end of the ranking

    order = scores.argsort()[::-1]
    # Filter the liked rows out explicitly so they cannot reappear when k
    # exceeds the number of other rows.
    recs = order[~liked_mask[order]][:k]
    return df.iloc[recs][columns]

In [7]:
# Collaborative filtering (item‑item, with synthetic users now)

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Size of the synthetic interaction data set.
n_users = 5
n_items = len(df)
rng = np.random.default_rng(42)  # fixed seed for the fake likes

# Fake user-item interactions: every user likes 20 distinct, randomly drawn
# books. The likes are collected as (row, col, value) triplets.
rows, cols, data = [], [], []
for u in range(n_users):
    liked = rng.choice(n_items, size=20, replace=False)
    rows.extend([u] * len(liked))
    cols.extend(liked)
    data.extend([1] * len(liked))

# Sparse users x items matrix with a 1 wherever a user liked a book.
UI_train = csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

# Item-item cosine similarity: each item is represented by its column of user likes.
item_sim_cf = cosine_similarity(UI_train.T)

def recommend_cf_for_user(u, k=10):
    """Recommend items to a user with item-item collaborative filtering.

    Each item's score is the sum of its similarities (from ``item_sim_cf``)
    to the items the user has already interacted with in ``UI_train``.

    Parameters
    ----------
    u : int
        Row index of the user in ``UI_train``.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``main_author`` and ``subjects`` columns of the
        recommended rows, best score first. Items the user already has are
        never included, even when ``k`` is larger than the number of unseen
        items.
    """
    user_vec = UI_train[u].toarray().ravel()
    seen = user_vec > 0

    scores = item_sim_cf.dot(user_vec)
    scores[seen] = -np.inf  # push already-seen items to the end of the ranking

    order = scores.argsort()[::-1]
    # Filter the seen items out explicitly so they cannot reappear when k
    # exceeds the number of unseen items.
    recs = order[~seen[order]][:k]
    return df.iloc[recs][["title", "main_author", "subjects"]]

# Smoke test: show the top-5 collaborative-filtering recommendations for synthetic user 0.
recommend_cf_for_user(0, k=5)

Unnamed: 0,title,main_author,subjects
1131,Columbus,Washington Irving,"Exploring expeditions, Discovery and explorati..."
396,Cards on the Table,Agatha Christie,"Fiction, Hercule Poirot (Fictitious character)..."
1368,The Theory of the Leisure Class,Thorstein Veblen,"Leisure class, Philosophy, Economics, Business..."
477,Player Piano,Kurt Vonnegut,"Fiction, Classic Literature, Mystery, Science ..."
717,Black Canaan,Robert E. Howard,Science fiction


In [8]:
# Evaluate (accuracy & relevance)

import numpy as np

def precision_recall_at_k(recs, test_set, k=10):
    """Compute precision@k and recall@k for one ranked recommendation list.

    Parameters
    ----------
    recs : sequence
        Recommended items, best first. Only the first ``k`` are considered.
    test_set : iterable
        Held-out relevant items. Duplicates are counted once.
    k : int, default 10
        Cut-off rank.

    Returns
    -------
    tuple of (float, float)
        ``(precision, recall)``. Precision is hits divided by ``k``, even if
        ``recs`` has fewer than ``k`` entries. Recall is hits divided by the
        number of distinct relevant items, or 0.0 when there are none. A
        non-positive ``k`` yields ``(0.0, 0.0)``.
    """
    if k <= 0:
        # Nothing is recommended at a non-positive cut-off; this also avoids
        # dividing by zero.
        return 0.0, 0.0

    relevant = set(test_set)
    hits = len(relevant.intersection(recs[:k]))

    precision = hits / k
    # Hits are counted over unique items, so the denominator is unique too.
    recall = hits / max(1, len(relevant))
    return precision, recall

def ndcg_at_k(recs, test_set, k=10):
    """Compute binary-relevance NDCG@k for one ranked recommendation list.

    Parameters
    ----------
    recs : sequence
        Recommended items, best first. Only the first ``k`` are considered.
    test_set : iterable
        Held-out relevant items. Duplicates are counted once.
    k : int, default 10
        Cut-off rank.

    Returns
    -------
    float
        DCG of the top-``k`` list divided by the DCG of an ideal list, with a
        gain of 1 for each relevant item and a ``1 / log2(rank + 1)``
        discount (rank starting at 1). Returns 0.0 when there are no relevant
        items or ``k`` is not positive.
    """
    relevant = set(test_set)

    dcg = 0.0
    credited = set()
    for pos, item in enumerate(recs[:k]):
        # Credit each relevant item only at its first occurrence; otherwise a
        # list that repeats one hit could score above the ideal ranking.
        if item in relevant and item not in credited:
            dcg += 1.0 / np.log2(pos + 2)
            credited.add(item)

    # The ideal list places every relevant item (up to k of them) at the top.
    ideal = min(k, len(relevant))
    idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal))
    return float(dcg / idcg) if idcg else 0.0

In [9]:
# Inspect the working frame's dimensions and its list of column names.
df.shape, df.columns.tolist()

((1658, 9),
 ['work_key',
  'title',
  'authors',
  'subjects',
  'first_publish_year',
  'cover_url',
  'main_author',
  'clean_subjects',
  'text'])