In [1]:
from faker import Faker
import random
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize Faker and seed every random source used below.
# Faker draws from its own generator, which `random.seed` and
# `np.random.seed` do not touch, so it must be seeded explicitly for the
# generated profiles, reviews, titles and descriptions to be reproducible.
faker = Faker()
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# ---------------------
# 1. Generate Users
# ---------------------
def generate_users(num_users=50):
    """Build a DataFrame of synthetic users.

    Each row holds a zero-padded ``user_id`` ("u000", "u001", ...), a one-line
    ``profile``, a list of 2-4 generated review sentences and one 1-5 rating
    per review. Text comes from the module-level ``faker``; the review count
    and the ratings come from the ``random`` module.
    """
    def _make_user(index):
        # Draw order is fixed: profile, review count, review texts, ratings.
        bio = " who enjoys ".join((faker.job(), faker.word()))
        review_count = random.randint(2, 4)
        texts = [faker.sentence(nb_words=10) for _ in range(review_count)]
        scores = [random.randint(1, 5) for _ in texts]
        return {
            "user_id": f"u{index:03d}",
            "profile": bio,
            "reviews": texts,
            "ratings": scores,
        }

    return pd.DataFrame([_make_user(n) for n in range(num_users)])

# ---------------------
# 2. Generate Books
# ---------------------
def generate_books(num_books=100):
    """Build a DataFrame of synthetic books.

    Columns: ``book_id`` ("b000", "b001", ...), ``title`` (a generated
    sentence requested with ``nb_words=4``, trailing period stripped) and
    ``description`` (a generated paragraph requested with ``nb_sentences=3``).
    All text comes from the module-level ``faker``.
    """
    # Dict values are evaluated in order, so the title is drawn before the
    # description for every book.
    records = [
        {
            "book_id": f"b{n:03d}",
            "title": faker.sentence(nb_words=4).rstrip('.'),
            "description": faker.paragraph(nb_sentences=3),
        }
        for n in range(num_books)
    ]
    return pd.DataFrame(records)

# ---------------------
# 3. Compute Embeddings
# ---------------------
def compute_embeddings(model, texts):
    """Encode ``texts`` with ``model``, requesting tensor output.

    Thin wrapper over ``model.encode`` that always passes
    ``convert_to_tensor=True`` and returns whatever the model returns.
    """
    encode_options = {"convert_to_tensor": True}
    return model.encode(texts, **encode_options)

# ---------------------
# 4. Recommend Books
# ---------------------
def recommend_books(user_df, book_df, model, top_k=3):
    """Recommend the ``top_k`` most semantically similar books for each user.

    A user is represented by their profile joined with all of their review
    texts; a book by its title joined with its description. Both sides are
    embedded via ``compute_embeddings`` and ranked by cosine similarity.

    Note: no books are filtered out. The generated reviews are not linked to
    specific books, so there is no "already reviewed" set to exclude.

    Args:
        user_df: DataFrame with ``user_id``, ``profile`` (str) and ``reviews``
            (list of str) columns.
        book_df: DataFrame with ``book_id``, ``title`` and ``description``
            columns.
        model: Embedding model handed to ``compute_embeddings``.
        top_k: Maximum number of books per user. Values <= 0 yield empty
            lists; values above the catalogue size are capped.

    Returns:
        dict mapping each ``user_id`` to a list of ``book_id`` values ordered
        from most to least similar.
    """
    if len(user_df) == 0:
        return {}

    user_ids = user_df["user_id"].tolist()
    limit = max(0, min(top_k, len(book_df)))
    if limit == 0:
        # Nothing to rank: empty catalogue or non-positive top_k.
        return {user_id: [] for user_id in user_ids}

    book_ids = book_df["book_id"].tolist()
    book_texts = (book_df["title"] + " " + book_df["description"]).tolist()
    user_texts = [
        profile + " " + " ".join(reviews)
        for profile, reviews in zip(user_df["profile"], user_df["reviews"])
    ]

    # Encode each side once as a batch instead of one model call per user.
    # .cpu() moves tensors off any accelerator so sklearn can consume them.
    book_embeddings = compute_embeddings(model, book_texts).cpu().numpy()
    user_embeddings = compute_embeddings(model, user_texts).cpu().numpy()

    # Row u of the matrix holds user u's similarity to every book.
    similarities = cosine_similarity(user_embeddings, book_embeddings)

    # Sort descending; the stable sort keeps ties in catalogue order.
    ranking = np.argsort(-similarities, axis=1, kind="stable")[:, :limit]

    return {
        user_id: [book_ids[i] for i in row]
        for user_id, row in zip(user_ids, ranking)
    }


# ---------------------
# 5. Full Run
# ---------------------
def main():
    """Run the whole demo: build data, save it, embed it and print a sample.

    Returns:
        (users, books, recs): the two generated DataFrames and the dict of
        per-user recommended book ids.
    """
    print("Generating datasets...")
    user_frame = generate_users()
    print(user_frame.head(10))
    book_frame = generate_books()

    # Write both tables as CSV files in the current working directory.
    for frame, filename in ((user_frame, "users.csv"), (book_frame, "books.csv")):
        frame.to_csv(filename, index=False)

    print("Loading embedding model...")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    print("Recommending books...")
    picks = recommend_books(user_frame, book_frame, encoder)

    print("\nSample Recommendations:")
    preview = list(picks.items())[:5]
    for user_id, book_ids in preview:
        print(f"User {user_id} → Books: {book_ids}")

    return user_frame, book_frame, picks

# Run the pipeline only when this code is executed directly (script or
# notebook cell) rather than imported, and keep the three results in
# module-level names for later inspection.
if __name__ == "__main__":
    users_df, books_df, recommendations = main()


  from .autonotebook import tqdm as notebook_tqdm


Generating datasets...
  user_id                                            profile  \
0    u000  Radiation protection practitioner who enjoys s...   
1    u001                    Town planner who enjoys college   
2    u002                      Art therapist who enjoys land   
3    u003               Recycling officer who enjoys usually   
4    u004  Training and development officer who enjoys truth   
5    u005                     Comptroller who enjoys already   
6    u006           Clinical embryologist who enjoys western   
7    u007  Armed forces training and education officer wh...   
8    u008                         Dealer who enjoys election   
9    u009             Occupational therapist who enjoys bill   

                                             reviews       ratings  
0  [Mr why myself still value investment finish f...  [1, 1, 3, 2]  
1  [Between too one according challenge identify ...        [2, 1]  
2  [Here become contain keep hard may direction c...  [5, 1, 5, 4

# exam

In [2]:
# Book Recommender - Student Version
# Final Project — Embeddings and Semantic Understanding

from faker import Faker
import random
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
# Optional: from sklearn.metrics.pairwise import cosine_similarity

# === NOTES FOR ALL PLATFORMS ===
# - If you are on Windows, make sure you install dependencies with:
#     pip install pandas faker sentence-transformers scikit-learn
#
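# - If you're on Mac and using Apple Silicon (M1/M2/M3), you may face issues
#   if your model runs on 'mps' (Metal Performance Shaders).
#   This can break compatibility with sklearn, which cannot consume tensors
#   that live on 'mps' or 'cuda'. If that happens, convert embeddings with
#   .cpu().numpy(), or just force the model to run on CPU when loading it:
#       model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
#   (Note: `import torch; torch.device("cpu")` on its own only creates a
#   device object; it does not move the model or any tensor.)
#   Also avoid `.to("mps")` or `.to("cuda")`.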

# ---------------------
# STEP 1. Generate Users
# ---------------------

def generate_users(num_users=50, seed=42):
    """Generate a reproducible DataFrame of synthetic users.

    Args:
        num_users: Number of rows to generate.
        seed: Seed applied to both the ``random`` module and the Faker
            instance, so that ratings *and* generated text repeat between
            runs. Pass ``None`` for non-deterministic data.

    Returns:
        DataFrame with columns ``user_id`` ("u000", ...), ``profile`` (str),
        ``reviews`` (list of 2-4 sentences) and ``ratings`` (list of ints in
        1-5, one per review).
    """
    faker = Faker()
    # Faker uses its own random generator, so seeding `random` alone would
    # leave profiles and reviews different on every run.
    faker.seed_instance(seed)
    random.seed(seed)

    users = []
    for i in range(num_users):
        user_id = f"u{i:03d}"
        profile = faker.job() + " who enjoys " + faker.word()
        reviews = [faker.sentence(nb_words=10) for _ in range(random.randint(2, 4))]
        ratings = [random.randint(1, 5) for _ in range(len(reviews))]
        users.append({
            "user_id": user_id,
            "profile": profile,
            "reviews": reviews,
            "ratings": ratings
        })
    return pd.DataFrame(users)

# ---------------------
# STEP 2. Generate Books
# ---------------------

def generate_books(num_books=100, seed=42):
    """Generate a reproducible DataFrame of synthetic books.

    Args:
        num_books: Number of rows to generate.
        seed: Seed for the Faker instance so the same catalogue is produced
            on every run. Pass ``None`` for non-deterministic data.

    Returns:
        DataFrame with columns ``book_id`` ("b000", ...), ``title`` (a
        generated sentence with its trailing period stripped) and
        ``description`` (a generated paragraph).
    """
    faker = Faker()
    # Without this the catalogue would change on every run, so saved CSVs
    # and printed recommendations could never be reproduced.
    faker.seed_instance(seed)

    books = []
    for i in range(num_books):
        book_id = f"b{i:03d}"
        title = faker.sentence(nb_words=4).rstrip('.')
        description = faker.paragraph(nb_sentences=3)
        books.append({
            "book_id": book_id,
            "title": title,
            "description": description
        })
    return pd.DataFrame(books)

# ---------------------
# STEP 3. Save Data to CSV (Optional for viewing)
# ---------------------

def save_datasets(users, books):
    """Write ``users`` to "users.csv" and ``books`` to "books.csv".

    Both files are written to the current working directory without the
    index column; users are written first.
    """
    targets = ((users, "users.csv"), (books, "books.csv"))
    for frame, filename in targets:
        frame.to_csv(filename, index=False)

# ---------------------
# STEP 4. Recommender Logic (TO COMPLETE)
# ---------------------

def recommend_books(user_df, book_df, model, top_k=3):
    """Recommend the ``top_k`` most semantically similar books for each user.

    A user is represented by their profile joined with all of their review
    texts; a book by its title joined with its description. Both sides are
    embedded with ``model.encode`` and ranked by cosine similarity.

    Args:
        user_df: DataFrame with ``user_id``, ``profile`` (str) and ``reviews``
            (list of str) columns.
        book_df: DataFrame with ``book_id``, ``title`` and ``description``
            columns.
        model: Object exposing ``encode(texts, convert_to_tensor=True)``.
        top_k: Maximum number of books per user. Values <= 0 yield empty
            lists; values above the catalogue size are capped.

    Returns:
        dict mapping each ``user_id`` to a list of ``book_id`` values ordered
        from most to least similar.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    if len(user_df) == 0:
        return {}

    user_ids = user_df["user_id"].tolist()
    # Clamp explicitly: a slice such as [-top_k:] would return *every* book
    # for top_k == 0 and a wrong count for negative values.
    limit = max(0, min(top_k, len(book_df)))
    if limit == 0:
        return {user_id: [] for user_id in user_ids}

    book_ids = book_df["book_id"].tolist()

    # Prepare input text: title + description per book, profile + reviews per user
    book_texts = (book_df["title"] + " " + book_df["description"]).tolist()
    user_texts = [
        profile + " " + " ".join(reviews)
        for profile, reviews in zip(user_df["profile"], user_df["reviews"])
    ]

    # Encode each side once as a batch instead of one model call per user.
    # .cpu().numpy() moves tensors off 'mps'/'cuda' for sklearn compatibility.
    book_embeddings = model.encode(book_texts, convert_to_tensor=True).cpu().numpy()
    user_embeddings = model.encode(user_texts, convert_to_tensor=True).cpu().numpy()

    # Row u of the matrix holds user u's similarity to every book.
    similarities = cosine_similarity(user_embeddings, book_embeddings)

    # Sort descending; the stable sort keeps ties in catalogue order.
    ranking = np.argsort(-similarities, axis=1, kind="stable")[:, :limit]

    return {
        user_id: [book_ids[i] for i in row]
        for user_id, row in zip(user_ids, ranking)
    }

# ---------------------
# STEP 5. Main Run
# ---------------------

def main():
    """Generate the data, save it, embed it and print five sample results.

    Prints progress messages along the way and returns nothing.
    """
    print("Generating synthetic datasets...")
    # Users are generated before books, then both are written to CSV.
    user_frame, book_frame = generate_users(), generate_books()
    save_datasets(user_frame, book_frame)

    print("Loading sentence embedding model...")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    print("Generating recommendations...\n")
    picks = recommend_books(user_frame, book_frame, encoder)

    print("Sample output:")
    preview = list(picks.items())[:5]
    for user_id, book_ids in preview:
        print(f"User {user_id} → Recommended Books: {book_ids}")

# Run the pipeline only when this code is executed directly (script or
# notebook cell) rather than imported; main() returns nothing here.
if __name__ == "__main__":
    main()


Generating synthetic datasets...
Loading sentence embedding model...
Generating recommendations...

Sample output:
User u000 → Recommended Books: ['b096', 'b034', 'b015']
User u001 → Recommended Books: ['b028', 'b053', 'b044']
User u002 → Recommended Books: ['b096', 'b028', 'b070']
User u003 → Recommended Books: ['b050', 'b042', 'b038']
User u004 → Recommended Books: ['b099', 'b051', 'b047']
