In [1]:
import os, time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Load the cleaned books catalogue. Later cells read the title, main_author,
# subjects, cover_url and work_key columns from this frame.
df = pd.read_csv("../data/clean_openlibrary_books.csv")


In [2]:
# One free-text "document" per book: title, author and subjects joined by
# single spaces, with missing fields treated as empty strings.
df["text"] = (
    df["title"].fillna("")
    .add(" ").add(df["main_author"].fillna(""))
    .add(" ").add(df["subjects"].fillna(""))
)

# Fit TF-IDF on those documents (English stop words removed, min_df=2).
tfidf = TfidfVectorizer(stop_words="english", min_df=2)
X = tfidf.fit_transform(df["text"])   # one row per book, one column per term

# Lower-cased title -> row position; when two books share a title, the later
# row overwrites the earlier one.
title_to_idx = dict(zip(map(str.lower, df["title"].astype(str)), range(len(df))))

def recommend_by_title(title, k=10):
    """Return up to k books whose TF-IDF vectors are closest to the given title.

    The title is looked up case-insensitively in ``title_to_idx``; a title
    that is not present raises ``KeyError``. The result carries the title,
    author, subjects and cover URL of each neighbour plus a ``content_score``
    column holding its cosine similarity to the query book, best first.
    """
    query_row = title_to_idx[title.lower()]
    similarity = cosine_similarity(X[query_row], X).ravel()
    ranked = similarity.argsort()[::-1]
    # Drop the query book itself before keeping the first k neighbours.
    top = ranked[ranked != query_row][:k]
    result = df.iloc[top][["title","main_author","subjects","cover_url"]].copy()
    result["content_score"] = similarity[top]
    return result

# Demo: pick one book reproducibly (fixed random_state) and display its five
# nearest neighbours as the cell output.
recommend_by_title(df.sample(1, random_state=0)["title"].iloc[0], k=5)

Unnamed: 0,title,main_author,subjects,cover_url,content_score
1322,Tales of Mystery and Imagination [46 stories],Edgar Allan Poe,"short stories, aristocracy, satire, American H...",https://covers.openlibrary.org/b/id/8245290-L.jpg,0.95369
1235,The Works of Edgar Allan Poe in Five Volumes,Edgar Allan Poe,"short stories, aristocracy, satire, American H...",https://covers.openlibrary.org/b/id/8311524-L.jpg,0.865273
1292,The Fall of the House of Usher and Other Tales...,Edgar Allan Poe,"American Horror tales, American literature, Ch...",https://covers.openlibrary.org/b/id/11695570-L...,0.865
1240,Great Short Stories of the World -- a collecti...,Barrett Harper Clark,"Short stories, American Horror tales, American...",https://covers.openlibrary.org/b/id/14540542-L...,0.740632
1274,The Tell-Tale Heart,Edgar Allan Poe,"Asesinato, Cuentos de terror estadounidenses, ...",https://covers.openlibrary.org/b/id/11851436-L...,0.693547


In [3]:
# Cold‑start: user picks favorite titles → content profile

def recommend_from_favorites(favorite_titles, k=10):
    """Build a content profile from 3–5 favorite titles and recommend.

    Parameters
    ----------
    favorite_titles : iterable of str (or a single str)
        Titles the user likes, matched case-insensitively against
        ``title_to_idx``. Titles that are not in the catalogue are ignored.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        title / main_author / subjects / cover_url of the recommended books
        plus ``content_score``: the dot product between the mean TF-IDF vector
        of the favorites and each book's vector, best first. The favorites
        themselves are never returned, so fewer than ``k`` rows come back when
        the catalogue has fewer than ``k`` other books.

    Raises
    ------
    ValueError
        If none of the given titles is found in the dataset.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(favorite_titles, str):
        favorite_titles = [favorite_titles]

    idxs = [title_to_idx[t.lower()] for t in favorite_titles if t.lower() in title_to_idx]
    if not idxs:
        raise ValueError("None of the provided titles found in the dataset.")

    # Mean of the favorite rows as a plain 1-D array; np.asarray keeps this
    # working whether the sparse mean comes back as np.matrix or ndarray.
    profile = np.asarray(X[idxs].mean(axis=0)).ravel()
    scores = np.asarray(X @ profile).ravel()        # one score per book

    is_favorite = np.zeros(scores.shape[0], dtype=bool)
    is_favorite[idxs] = True
    # Push the favorites to the bottom of the ranking, then drop them
    # explicitly so they cannot leak back in when k is larger than the number
    # of remaining books.
    scores[is_favorite] = -1e9
    order = scores.argsort()[::-1]
    recs = order[~is_favorite[order]][:k]

    out = df.iloc[recs][["title","main_author","subjects","cover_url"]].copy()
    out["content_score"] = scores[recs]
    return out

# Try it: sample three titles reproducibly (fixed random_state) as stand-in
# favorites; the cell displays the (favorites, recommendations) tuple.
my_favs = df.sample(3, random_state=1)["title"].tolist()
my_favs, recommend_from_favorites(my_favs, k=5)

(['Beauvallet (Beauvallet Dynasty #2)', 'The Age of Innocence', 'Prey'],
                                            title      main_author  \
 825  Simon the Coldheart (Beauvallet Dynasty #1)  Georgette Heyer   
 857                                 The Nonesuch  Georgette Heyer   
 801                                 Royal Escape  Georgette Heyer   
 879                                  Black Sheep  Georgette Heyer   
 813                                The Conqueror  Georgette Heyer   
 
                                               subjects  \
 825  Fiction, Man-woman relationships, Knights and ...   
 857  Open Library Staff Picks, Fiction, Romance, Hi...   
 801  1930s, Fiction, Historical, Historical Fiction...   
 879  1960s, Historical, History, Fiction, Romance, ...   
 813  1930s, Fiction, Kings and rulers, Historical F...   
 
                                              cover_url  content_score  
 825  https://covers.openlibrary.org/b/id/6920126-L.jpg       0.271811  
 85

In [4]:
# Feedback logging (click/like) and building a user–item matrix
# A small CSV to store feedback (append-only)

# Relative path, so the log lives in the notebook's working directory.
INTERACTIONS_CSV = "interactions.csv"
# On first use, create the log with only its header row so that later reads
# and header-less appends see the expected four columns.
if not os.path.exists(INTERACTIONS_CSV):
    pd.DataFrame(columns=["user_id","work_key","event_type","timestamp"]).to_csv(INTERACTIONS_CSV, index=False)

def append_feedback(user_id, work_key, event_type="click", path=INTERACTIONS_CSV):
    """Append one feedback event to the interactions CSV.

    Parameters
    ----------
    user_id, work_key :
        Identifiers of the user and the book; written as given.
    event_type : str, default "click"
        Kind of interaction being logged.
    path : str, default INTERACTIONS_CSV
        CSV file to append to. It is created if it does not exist.

    The row is stamped with the current Unix time in whole seconds.
    """
    row = pd.DataFrame([{
        "user_id": user_id,
        "work_key": work_key,
        "event_type": event_type,
        "timestamp": int(time.time()),
    }])
    # A missing *or zero-byte* file has no header yet. Appending a bare data
    # row to it would make the next read_csv treat that row as the header.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    row.to_csv(path, mode="a", header=needs_header, index=False)

def build_ui_matrix(df, interactions_path=INTERACTIONS_CSV, positive_events=("click","like")):
    """
    Build a user–item matrix from the feedback CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue with a ``work_key`` column; row position i becomes item
        column i of the matrix.
    interactions_path : str
        CSV with ``user_id``, ``work_key`` and ``event_type`` columns.
    positive_events : collection of str (or a single str)
        Event types that count as positive feedback.

    Returns: UI_train (users x items sparse), user_to_idx, item_to_idx

    Each logged positive event adds 1 to its (user, item) cell, so repeated
    events accumulate. Users are taken from every logged row and sorted by
    their string id. When the log is missing or empty, both mappings are empty
    and the matrix has zero rows. When the log has rows but none is a positive
    event on a known item, the matrix still has zero rows while the mappings
    are populated.
    """
    n_items = len(df)
    if not os.path.exists(interactions_path):
        return csr_matrix((0, n_items)), {}, {}
    try:
        logs = pd.read_csv(interactions_path)
    except pd.errors.EmptyDataError:
        # Zero-byte file: treat it like a log without any feedback.
        return csr_matrix((0, n_items)), {}, {}
    if logs.empty:
        return csr_matrix((0, n_items)), {}, {}

    # A bare string would otherwise be matched by substring / rejected by isin.
    if isinstance(positive_events, str):
        positive_events = (positive_events,)

    user_ids = logs["user_id"].astype(str)
    work_keys = logs["work_key"].astype(str)
    users = sorted(user_ids.unique().tolist())
    user_to_idx = {u:i for i,u in enumerate(users)}
    item_to_idx = {k:i for i,k in enumerate(df["work_key"].astype(str))}

    # Keep positive events that refer to a book present in the catalogue.
    keep = logs["event_type"].isin(list(positive_events)) & work_keys.isin(list(item_to_idx))
    if not keep.any():  # still no positives
        return csr_matrix((0, n_items)), user_to_idx, item_to_idx

    rows = user_ids[keep].map(user_to_idx).to_numpy()
    cols = work_keys[keep].map(item_to_idx).to_numpy()
    data = np.ones(len(rows), dtype=int)

    # csr_matrix sums duplicate (row, col) pairs, giving per-pair event counts.
    UI = csr_matrix((data, (rows, cols)), shape=(len(user_to_idx), n_items))
    return UI, user_to_idx, item_to_idx

# build (initially empty unless you've logged something)
# The displayed shape is (users, books); zero rows means no usable positive
# feedback was found in the log.
UI_train, user_to_idx, item_to_idx = build_ui_matrix(df)
UI_train.shape

(0, 1658)

In [6]:
# Item–item collaborative filtering (CF) helpers, used by the optional Precision@k / Recall@k / nDCG@k evaluation in the next cell

# --- CF helper definitions & rebuild, to ensure they exist in this session ---

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Rebuild the user–item matrix from the feedback log so this cell does not
# depend on an earlier cell having been run in the current session.
UI_train, user_to_idx, item_to_idx = build_ui_matrix(df)

# Item–item cosine similarity over the item columns of the matrix; stays None
# while the matrix has no user rows.
if UI_train.shape[0] > 0:
    item_sim_cf = cosine_similarity(UI_train.T)
else:
    item_sim_cf = None

def recommend_cf_for_user(user_id, k=10):
    """
    Recommend using CF (item-item with cosine similarity) based on a user's past clicks.

    Parameters
    ----------
    user_id :
        User identifier. ``user_to_idx`` is keyed by the *string* form of the
        ids found in the feedback log, so a non-string id (e.g. an int) is
        also looked up via ``str(user_id)``.
    k : int, default 10
        Maximum number of recommendations.

    Returns a DataFrame with work_key, title, author, subjects, cover_url, cf_score,
    best first. It is empty (same columns) when there is no feedback yet or the
    user is unknown. Items the user already interacted with are never returned,
    so fewer than ``k`` rows come back when few unseen items exist.
    """
    no_recs = pd.DataFrame(columns=["work_key","title","main_author","subjects","cover_url","cf_score"])
    if item_sim_cf is None:
        # No feedback yet
        return no_recs

    # Prefer the id exactly as given, then fall back to its string form.
    key = user_id if user_id in user_to_idx else str(user_id)
    if key not in user_to_idx:
        # Unknown user
        return no_recs

    u = user_to_idx[key]
    user_vec = UI_train[u].toarray().ravel()
    scores = item_sim_cf.dot(user_vec)

    seen = user_vec > 0
    # Sink already-seen items to the bottom of the ranking, then drop them
    # explicitly so they cannot reappear when k exceeds the unseen items.
    scores[seen] = -1e9
    order = scores.argsort()[::-1]
    recs = order[~seen[order]][:k]

    out = df.iloc[recs][["work_key","title","main_author","subjects","cover_url"]].copy()
    out["cf_score"] = scores[recs]
    return out


In [7]:
# Proxy evaluation of the CF recommender for one hard-coded user.
# NOTE(review): precision_recall_at_k, ndcg_at_k and `test` are not defined in
# any cell visible here (the execution counts jump from In [4] to In [6]) —
# presumably they came from a removed cell; confirm they exist before running
# this on a fresh kernel.
if UI_train.shape[0] == 0:
    print("Not enough feedback to compute CF metrics yet.")
else:
    cf_out = recommend_cf_for_user("test_user", k=10)
    # Translate the recommended work keys back into row positions of df.
    idx_from_wk = dict(zip(df["work_key"], range(len(df))))
    rec_indices = [idx_from_wk[wk] for wk in cf_out["work_key"]]
    p, r = precision_recall_at_k(rec_indices, test, k=10)
    n = ndcg_at_k(rec_indices, test, k=10)
    print("Synthetic CF — Precision@10:", round(p,3), "Recall@10:", round(r,3), "nDCG@10:", round(n,3))


Synthetic CF — Precision@10: 0.0 Recall@10: 0.0 nDCG@10: 0.0
