In [None]:
import warnings
import random

def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass
import warnings
# Rebind the module attribute so any later `warnings.warn(...)` call resolves to the
# no-op above for the rest of the session.
warnings.warn = warn
# Additionally install a blanket "ignore" filter for warnings that do not go through
# the rebound attribute.
warnings.filterwarnings('ignore')

In [None]:
from kaggle_secrets import UserSecretsClient
# Fetch the secret stored under the name "hf_token" from Kaggle's secret store, so the
# credential itself never appears in the notebook source.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hf_token")

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub using the token in secret_value_0.
login(token=secret_value_0)

In [None]:
from datasets import load_dataset, Dataset
from pprint import pp

In [None]:
# Load the "books" configuration of the liyucheng/goodreads dataset (train split only).
ds = load_dataset("liyucheng/goodreads", "books", split='train')
# Print the dataset object as a quick summary of what was loaded.
print(ds)

In [None]:
# Shelf/tag names that describe reading status, ownership or format rather than a genre.
# NOTE(review): "ficton" is kept exactly as written; presumably it matches a misspelled
# tag that occurs in the data — confirm before "fixing" it.
EXCLUDE_GENRES_EXACT = {
    "to-read", "currently-reading", "my-library", "books-i-own", "library-book",
    "ebook", "audiobook", "audio-book", "paperback", "hardback", "wish-list",
    "unfinished", "have", "my-books", "home", "books", "favorites", "default",
    "ficton", "bookclub", "book-club", "tbi-book-club", "book-group",
    "listened", "audio", "audio-books", "library", "gave-up"
}

# Tags starting with any of these prefixes are skipped as well.
# ("read-" already covers "read-in-"; both are kept to preserve the configuration.)
EXCLUDE_STARTS_WITH = {"read-", "read-in-", "own"}

# Synonyms / variants collapsed onto one canonical genre label.
GENRE_NORMALIZATION = {
    "general": "fiction",
    "novel": "fiction",
    "novels": "fiction",
    "general-fiction": "fiction",
    "adult-fiction": "fiction",
    "historical": "historical-fiction",
    "history": "historical-fiction",
    "nonfiction": "non-fiction",
    "ya": "young-adult",
    "sci-fi": "science-fiction"
}

# Whitelist: only these canonical labels can appear in the cleaned output.
APPROVED_GENRES = {
    "fiction", "romance", "fantasy", "mystery", "historical-fiction",
    "young-adult", "literature", "adventure", "thriller",
    "science-fiction", "suspense", "paranormal", "crime",
    "drama", "humor", "contemporary", "contemporary-fiction",
    "contemporary-romance", "classics", "non-fiction", "adult"
}


def clean_genres_list(genres_list):
    """Reduce a list of raw shelf tags to a comma-separated string of approved genres.

    Each tag is stripped and lower-cased, dropped if it is in EXCLUDE_GENRES_EXACT or
    starts with a prefix from EXCLUDE_STARTS_WITH, mapped through GENRE_NORMALIZATION,
    and kept only if the result is in APPROVED_GENRES. First-seen order is preserved
    and duplicates are removed.

    Args:
        genres_list: Iterable of tag strings. ``None`` or an empty iterable is accepted,
            and entries that are not strings are ignored instead of raising.

    Returns:
        str: The surviving genres joined with ", ", or "unknown" when none survive.
    """
    # Missing/empty genre data yields the same placeholder as "nothing approved".
    if not genres_list:
        return "unknown"

    # str.startswith accepts a tuple, so build it once instead of looping per tag.
    prefixes = tuple(EXCLUDE_STARTS_WITH)

    cleaned = []
    for raw in genres_list:
        # Skip None / non-string entries rather than failing on .strip().
        if not isinstance(raw, str):
            continue

        g = raw.strip().lower()

        if g in EXCLUDE_GENRES_EXACT or g.startswith(prefixes):
            continue

        unified = GENRE_NORMALIZATION.get(g, g)

        # The list stays tiny (bounded by APPROVED_GENRES), so a linear
        # membership test is fine and keeps insertion order.
        if unified in APPROVED_GENRES and unified not in cleaned:
            cleaned.append(unified)

    return ", ".join(cleaned) if cleaned else "unknown"


In [None]:
def prepare_goodreads(sample):
    """Normalise one raw Goodreads record; used as the function for ``ds.map``.

    Args:
        sample: Mapping that provides the keys ``author_names``, ``genres``,
            ``description``, ``average_rating`` and ``ratings_count``.
            ``author_names`` is joined with ", ", so it is presumably a list of
            strings — confirm against the dataset schema.

    Returns:
        dict with these keys:
            author_names: authors joined into one string, or "unknown author".
            genres: output of ``clean_genres_list``.
            description: the description, or "no description" when empty/missing.
            ratings_count: non-negative int, 0 when the raw value cannot be parsed.
            average_rating: float, 3.0 when the raw value cannot be parsed.
    """
    authors = ", ".join(sample["author_names"]) if sample["author_names"] else "unknown author"

    # Pass an empty list for missing genres so the helper never iterates over None.
    # clean_genres_list always returns a non-empty string ("unknown" at worst), so no
    # further fallback is needed here.
    genres = clean_genres_list(sample["genres"] or [])

    desc = sample['description'] if sample['description'] else "no description"

    # Only conversion failures are expected; anything else should surface.
    try:
        average_rating = float(sample['average_rating'])
    except (TypeError, ValueError):
        average_rating = 3.0  # neutral placeholder for a missing/unparseable rating

    # str.isnumeric() accepts characters such as "²" that int() rejects and fails on
    # None / int input, so attempt the conversion directly and clamp negatives to 0.
    try:
        ratings_count = max(int(sample['ratings_count']), 0)
    except (TypeError, ValueError, OverflowError):
        ratings_count = 0

    return {
        "author_names": authors,
        "genres": genres,
        "description": desc,
        "ratings_count": ratings_count,
        "average_rating": average_rating
    }

In [None]:
# Columns passed as `remove_columns` to the preprocessing map, in their original order.
# NOTE: 'book_id' is among them, so later cells cannot key on it.
cols_to_remove = [
    # identifiers
    'book_id', 'isbn13', 'isbn', 'author_ids',
    # review volume
    'text_reviews_count',
    # publication details
    'publication_month', 'publication_day', 'publisher',
    # edition details
    'language_code', 'num_pages', 'format',
    # fields about the original work
    'work_id', 'original_title', 'original_publication_year', 'original_language_id',
]

In [None]:
import os

# Run prepare_goodreads over every row, drop the columns in cols_to_remove, and use as
# many worker processes as os.cpu_count() reports.
# NOTE(review): `ds` is rebound to the processed dataset, so re-running this cell feeds
# already-processed rows back in — presumably it fails because the removed columns no
# longer exist; confirm, and consider a separate name for the processed dataset.
ds = ds.map(prepare_goodreads, remove_columns=cols_to_remove, num_proc=os.cpu_count())

In [None]:
# Check which columns remain after the map step.
print(ds.column_names)

In [None]:
# Pretty-print the first processed record as a sanity check.
pp(ds[0])

In [None]:
import re

# Compiled once at definition time. The class accepts code points U+0000–U+024F and any
# whitespace matched by \s; the explicitly listed punctuation and digits already fall
# inside that range and are kept only to leave the original pattern untouched.
_LATIN_TEXT_RE = re.compile(r'^[\u0000-\u024F\s.,!?\'\";:\-\(\)\[\]0-9]+$')


def is_english(text):
    """Return True when ``text`` is non-empty and uses only Latin-range characters.

    Despite the name this is a script check, not a language check: accented letters
    up to U+024F (e.g. "é", "ö") pass, while text containing characters outside that
    range (Cyrillic, CJK, ...) is rejected.

    Args:
        text: The string to test. Non-string values (including ``None``) return
            False instead of raising ``TypeError``.

    Returns:
        bool: True if every character is accepted by the pattern and the string is
        not empty, otherwise False.
    """
    if not isinstance(text, str):
        return False
    return _LATIN_TEXT_RE.fullmatch(text) is not None

In [None]:
# Keep only the rows whose title is accepted by is_english.
ds_fil = ds.filter(lambda ex: is_english(ex['title']))

In [None]:
def build_item_mappings(dataset):
    """Build lookup tables between item ids and human-readable book labels.

    The label has the form
    "<title> by <author>, published in <year>, Genres: <genres>, Average Rating: <r>,
    Ratings Count: <n>".

    Args:
        dataset: Iterable of dict-like rows providing ``title``, ``author_names`` and
            ``genres``; ``book_id``, ``publication_year``, ``average_rating`` and
            ``ratings_count`` are optional.

    Returns:
        tuple(dict, dict): ``(itemid_to_name, itemname_to_id)``. If two rows produce
        the same label, ``itemname_to_id`` keeps the id of the later row.
    """
    def _or_unknown(value):
        # A missing key, None and "" all mean "no data"; 0 and 0.0 are real values.
        return "Unknown" if value is None or value == "" else value

    itemid_to_name = {}
    itemname_to_id = {}

    for row_idx, ex in enumerate(dataset):
        # 'book_id' is removed by the preprocessing map in this notebook, so fall back
        # to the row position instead of raising KeyError.
        book_id = ex.get("book_id")
        if book_id is None:
            book_id = row_idx

        title = ex["title"]
        author = ex["author_names"]
        year = _or_unknown(ex.get("publication_year"))

        cleaned_genres = ex["genres"]
        avg_rating = _or_unknown(ex.get("average_rating"))
        ratings_count = _or_unknown(ex.get("ratings_count"))

        display_name = (f"{title} by {author}, published in {year}, Genres: {cleaned_genres}, "
                        f"Average Rating: {avg_rating}, Ratings Count: {ratings_count}")

        itemid_to_name[book_id] = display_name
        itemname_to_id[display_name] = book_id

    return itemid_to_name, itemname_to_id


In [None]:
# itemid_to_name, itemname_to_id = build_item_mappings(ds)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# # Setup GPT-2
# tokenizer = AutoTokenizer.from_pretrained('gpt2')
# model = AutoModelForCausalLM.from_pretrained('gpt2')
# model.eval()

In [None]:
# Pool of free-text user preferences; one of them is picked later as the query for the
# reranker. Entries that are commented out are excluded from the pool.
user_requests = [
    "I prefer fantasy and science fiction with strong world-building.",
    # "I enjoy romance novels by classic authors.",
    "Give me books that are highly popular with many ratings.",
    "I like contemporary fiction focusing on relationships.",
    "Looking for diverse genres, not just fantasy.",
    "Show me books with high average ratings, regardless of genre.",
    "I want to explore lesser-known but highly rated books.",
    "List books published after 2000 that are popular.",
    "I like historical fiction with strong female characters.",
    "Show me books with genre diversity, avoiding duplicates.",
    "Recommend books that mix mystery and thriller elements.",
    # "Suggest short story collections by well-known authors.",
    "I'm interested in books that won major literary awards.",
    "List humorous novels that are light and easy to read.",
    "I want epic fantasy series with complex plots.",
    "Show me recent bestsellers from diverse authors.",
    "Give me novels that explore philosophical themes.",
    "Suggest debut novels that received critical acclaim.",
    "I like books that combine science fiction and romance.",
    "Show me highly rated books with minimal violence."
]

In [None]:
# Number of rows that survived the title filter.
len(ds_fil)

In [None]:
# random_candidate_ids = random.sample(range(len(ds_fil)), k=5)
# candidates = []
# for cid in random_candidate_ids:
#     display_text = f"{ds_fil[cid]['title']} by {ds_fil[cid]['author_names']}"
#     # display_text = ds_fil[cid]['text']
#     candidates.append({"text": display_text})

# candidate_texts = [item['text'] for item in candidates]

# user_request = random.choice(user_requests)
# user_request = "Suggest popular science-fiction stories with historical settings."
# prompt_text = generate_gpt2_prompt(candidate_texts, user_request, top_n=5)
# print("=== Generated Prompt ===")
# print(prompt_text)

In [None]:
from transformers import pipeline

def run_pipeline_generation(prompt_text, generator, max_new_tokens=256, temperature=0.1, top_p=0.9, top_k=1):
    """Call a text-generation callable once and return its first result, stripped.

    Args:
        prompt_text: Prompt passed as the single positional argument to ``generator``.
        generator: Callable invoked as ``generator(prompt_text, **kwargs)`` that returns
            a sequence whose first element has a "generated_text" entry.
        max_new_tokens: Forwarded to ``generator`` unchanged.
        temperature: Forwarded to ``generator`` unchanged.
        top_p: Forwarded to ``generator`` unchanged.
        top_k: Forwarded to ``generator`` unchanged.

    Returns:
        str: ``output[0]["generated_text"]`` with leading/trailing whitespace removed.
        Nothing here removes the prompt, so whether the prompt is part of the returned
        text depends entirely on what ``generator`` returns.
    """
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
    }
    first_result = generator(prompt_text, **generation_kwargs)[0]
    return first_result["generated_text"].strip()

In [None]:
# model_name = "meta-llama/Llama-2-7b-chat-hf"

# model = AutoModelForCausalLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# # tokenizer.pad_token = tokenizer.eos_token

In [None]:
import random

# Seed the global `random` generator so a fresh "Restart & Run All" selects the same
# request every time; later draws from `random` in this notebook share that state and
# become reproducible as well. Change RANDOM_SEED to explore other draws.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

user_request = random.choice(user_requests)
user_request

In [None]:
# Draw `sample_size` distinct row positions from the filtered dataset and materialise
# those rows as a list for the reranker.
# random.sample raises ValueError if sample_size is larger than len(ds_fil).
sample_size = 10
sample_indices = random.sample(range(len(ds_fil)), sample_size)
sample_ds = ds_fil.select(sample_indices).to_list()

In [None]:
from book_ranker import BookReRanker

# Instantiate the project's reranker with no arguments.
ranker = BookReRanker()

In [None]:
# Echo the selected request again right before ranking.
user_request

In [None]:
# Rerank the whole sample (top_k equals its size), then keep only the titles in the
# order the reranker returned them.
ranked_records = ranker.rerank(user_request, sample_ds, top_k=len(sample_ds))
ranked_books = [record['title'] for record in ranked_records]
ranked_books

In [None]:
# Titles in their original sampled order, for comparison with ranked_books.
unranked_books = [book['title'] for book in sample_ds]
unranked_books

In [None]:
from typing import Optional, List, Dict, Union
import numpy as np

def run_ablation_study(
    reranker: "BookReRanker",
    user_pref: str,
    book_batch: List[Dict],
    scenarios: Dict[str, tuple],
    top_k: Optional[int] = None
) -> Dict[str, Dict[str, int]]:
    """
    Runs ablation study and returns final ranks as:
    results[Book Title][Scenario] = Final Rank

    The three component scores (NLL, genre similarity, rating) are computed and
    normalised once, then re-weighted per scenario. Rank 1 is the book with the
    highest combined score; ties keep batch order.

    Args:
        reranker: BookReRanker instance; its ``_compute_genre_similarity``,
            ``_compute_nll_scores``, ``_calculate_bayesian_scores`` and
            ``_normalize_scores`` methods are used.
        user_pref: User preference string
        book_batch: List of book dicts; the "title" entry labels each row
            (falls back to "Book <idx>").
        scenarios: Dict of scenario names to weight tuples, in the order
            (nll_weight, genre_weight, rating_weight)
        top_k: Optional, limit to top_k ranked books per scenario. Books ranked
            below top_k in a scenario get no entry for that scenario.

    Returns:
        Nested dictionary, ready for DataFrame creation. Ranks are plain ints.
        Books that share a title share one entry; the later book overwrites the
        earlier rank.
    """
    results: Dict[str, Dict[str, int]] = {}
    if not scenarios:
        return results

    # None of the component scores depend on the scenario weights, so the model
    # calls are made once instead of once per scenario.
    genre_scores = reranker._compute_genre_similarity(user_pref, book_batch)
    nll_scores = reranker._compute_nll_scores(user_pref, book_batch)
    rating_scores = reranker._calculate_bayesian_scores(book_batch)

    # Guard against the components returning different lengths.
    min_len = min(len(genre_scores), len(nll_scores), len(rating_scores))

    # NLL is normalised with the method's default direction, the other two with
    # higher_better=True — presumably the default treats lower NLL as better;
    # confirm in BookReRanker._normalize_scores.
    nll_norm = np.asarray(reranker._normalize_scores(nll_scores[:min_len]), dtype=float)
    genre_norm = np.asarray(
        reranker._normalize_scores(genre_scores[:min_len], higher_better=True), dtype=float
    )
    rating_norm = np.asarray(
        reranker._normalize_scores(rating_scores[:min_len], higher_better=True), dtype=float
    )

    for scenario, weights in scenarios.items():
        print(f"Running scenario: {scenario} with weights {weights}")

        combined_scores = (
            weights[0] * nll_norm +
            weights[1] * genre_norm +
            weights[2] * rating_norm
        )

        # argsort gives the book indices ordered best-first; invert that permutation
        # so final_ranks[i] is the 1-based rank of book i (not the index at position i).
        order = np.argsort(-combined_scores, kind="stable")
        final_ranks = np.empty(min_len, dtype=int)
        final_ranks[order] = np.arange(1, min_len + 1)

        for idx in range(min_len):
            rank = int(final_ranks[idx])
            if top_k is not None and rank > top_k:
                continue
            book_title = book_batch[idx].get("title", f"Book {idx}")
            results.setdefault(book_title, {})[scenario] = rank

    return results




In [None]:
# Ablation settings. Each tuple holds the weights that run_ablation_study applies to the
# normalised scores, in the order (NLL, genre similarity, rating).
scenarios = {
    # All components active
    "Full System": (0.6, 0.3, 0.1),
    # Genre influence removed
    "No Genre Similarity": (0.6, 0.0, 0.4),
    # Only NLL considered
    "NLL Only": (1.0, 0.0, 0.0),
    # Only Genre Similarity
    "Genre Similarity Only": (0.0, 1.0, 0.0),
    # Only Ratings
    "Rating Only": (0.0, 0.0, 1.0),
}

In [None]:
# Run every scenario on the sampled books; top_k is set to the full sample size.
results = run_ablation_study(ranker, user_request, sample_ds, scenarios, top_k=len(sample_ds))

In [None]:
import pandas as pd

def get_dataframe_from_ranks_dict(final_ranks_dict, sort_by="Full System"):
    """Turn ``{book title: {scenario: rank}}`` into a table sorted by one scenario.

    Args:
        final_ranks_dict: Nested dict keyed by book title, then by scenario name.
        sort_by: Scenario column used for ordering the rows (ascending). Defaults to
            "Full System".

    Returns:
        pandas.DataFrame: One row per book with a "Book Title" column followed by one
        column per scenario, sorted by ``sort_by`` and re-indexed from 0.

    Raises:
        KeyError: If ``sort_by`` is not one of the scenario columns (for example when
            the input is empty or that scenario was not run).
    """
    df_final_ranks = pd.DataFrame.from_dict(final_ranks_dict, orient="index")
    df_final_ranks.index.name = "Book Title"
    df_final_ranks = df_final_ranks.reset_index()

    # Fail with an actionable message instead of pandas' bare column-name KeyError.
    if sort_by not in df_final_ranks.columns:
        available = [col for col in df_final_ranks.columns if col != "Book Title"]
        raise KeyError(f"Cannot sort by {sort_by!r}; available scenario columns: {available}")

    return df_final_ranks.sort_values(by=sort_by).reset_index(drop=True)

In [None]:
from IPython.display import display, Latex

# Show which request produced the table, then build and display the per-scenario ranks.
print(f"User Preference: {user_request}\n")
df = get_dataframe_from_ranks_dict(results)
df