In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr


In [10]:
books = pd.read_csv('datasets/books.csv')
books

Unnamed: 0,id,title,author,description,genres,avg_rating,num_ratings,tags
0,0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['classics', 'fiction', 'historical fiction', ...",4.27,5691311,To Kill a Mockingbird Harper Lee Classics Fict...
1,1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['fantasy', 'fiction', 'young adult', 'magic',...",4.47,9278135,Harry Potter and the Philosopher’s Stone (Harr...
2,2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['classics', 'fiction', 'romance', 'historical...",4.28,3944155,Pride and Prejudice Jane Austen Classics Ficti...
3,3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['classics', 'nonfiction', 'history', 'biograp...",4.18,3488438,The Diary of a Young Girl Anne Frank Classics ...
4,4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['classics', 'fiction', 'dystopia', 'fantasy',...",3.98,3575172,Animal Farm George Orwell Classics Fiction Dys...
...,...,...,...,...,...,...,...,...
9918,9995,"Breeders (Breeders Trilogy, #1)",Ashley Quigley,How far would you go? If human society was gen...,"['dystopia', 'science fiction', 'post apocalyp...",3.44,276,"Breeders (Breeders Trilogy, #1) Ashley Quigley..."
9919,9996,Dynamo,Eleanor Gustafson,Jeth Cavanaugh is searching for a new life alo...,[],4.23,60,Dynamo Eleanor Gustafson Jeth Cavanaugh is se...
9920,9997,The Republic of Trees,Sam Taylor,This dark fable tells the story of four Englis...,"['fiction', 'horror', 'dystopia', 'coming of a...",3.29,383,The Republic of Trees Sam Taylor Fiction Horro...
9921,9998,"Waking Up (Healing Hearts, #1)",Renee Dyer,For Adriana Monroe life couldn’t get any bette...,"['new adult', 'romance', 'contemporary romance...",4.13,263,"Waking Up (Healing Hearts, #1) Renee Dyer New ..."


In [11]:
books.fillna('',inplace= True)

In [12]:
vector = TfidfVectorizer(stop_words='english')
vector_matrix = vector.fit_transform(books['tags'])

In [13]:
similarity = cosine_similarity(vector_matrix,vector_matrix)

In [14]:
def recommend(input, top_n=5):
    # Vectorize the input book title to transform it into the feature space of our trained vectorizer
    input_vect = vector.transform([input])

    # Calculate cosine similarity between the input vector and all books in the vector matrix
    similarity_score = cosine_similarity(input_vect, vector_matrix).flatten()

    # Create a DataFrame combining book info and similarity scores
    score_df = pd.DataFrame({
        'title': books['title'],
        'author': books['author'],
        'genres': books['genres'],
        'avg_rating': books['avg_rating'],
        'ratings_count': books['num_ratings'],
        'similarity_score': similarity_score
    })

    # Remove the book exactly matching the input title (to avoid recommending the same book)
    score_df = score_df[~score_df['title'].str.lower().eq(input.lower())]

    # Convert rating columns to numeric types and fill missing values with 0
    score_df['avg_rating'] = pd.to_numeric(score_df['avg_rating'], errors='coerce')
    score_df['ratings_count'] = pd.to_numeric(score_df['ratings_count'], errors='coerce')
    score_df.fillna(0, inplace=True)

    # Normalize ratings_count between 0 and 1 for fair weighting
    score_df['ratings_count'] = score_df['ratings_count'] / score_df['ratings_count'].max()

    # Group by title, author, and genres to merge duplicates and aggregate scores
    score_df = score_df.groupby(['title', 'author', 'genres'], as_index=False).agg({
        'avg_rating': 'mean',        # average rating if duplicates exist
        'ratings_count': 'sum',      # sum ratings counts (assuming counts add up)
        'similarity_score': 'mean'   # average similarity score
    })

    # === Step 1: Extract genres for the input book ===
    try:
        input_genres_raw = books[books['title'].str.lower() == input.lower()]['genres'].iloc[0]
        # Handle case where genres are stored as comma-separated string
        input_genres = [g.strip().lower() for g in input_genres_raw.split(',')] if isinstance(input_genres_raw, str) else []
    except:
        input_genres = []

    # === Step 2: Define fictionality keyword sets ===
    nonfiction_keywords = {'nonfiction', 'non-fiction', 'biography', 'memoir', 'self-help', 'education', 'how to', 'autobiography'}
    fiction_keywords = {'fiction', 'novel', 'fantasy', 'romance', 'thriller', 'mystery', 'young adult', 'ya', 'sci-fi', 'drama'}

    # Helper function to classify genres as fiction, nonfiction, or neutral
    def get_fictionality(genres):
        genres_lower = [g.strip().lower() for g in genres]
        if any(g in nonfiction_keywords for g in genres_lower):
            return 'nonfiction'
        if any(g in fiction_keywords for g in genres_lower):
            return 'fiction'
        return 'neutral'

    # Determine the fictionality of the input book
    input_fictionality = get_fictionality(input_genres)

    # === Step 3: Filter books by matching fictionality ===
    def filter_by_fictionality(book_genres):
        if not isinstance(book_genres, str):
            return False  # skip if genres are missing or not a string
        genres = [g.strip().lower() for g in book_genres.split(',')]
        return get_fictionality(genres) == input_fictionality

    # Apply the filter only if input book is classified as fiction or nonfiction
    if input_fictionality in ['fiction', 'nonfiction']:
        score_df = score_df[score_df['genres'].apply(filter_by_fictionality)]

    # === Final Step: Calculate the final combined score ===
    score_df['final_score'] = (
        (score_df['similarity_score'] * 0.5) +              # similarity weighted 50%
        (score_df['avg_rating'] / 5 * 0.3) +                # average rating (out of 5) weighted 30%
        (score_df['ratings_count'] * 0.2)                    # normalized ratings count weighted 20%
    )

    # Sort by final score descending and return the top N books with relevant columns
    top_books = score_df.sort_values('final_score', ascending=False).head(top_n)

    return top_books[['title', 'author', 'genres', 'avg_rating']]


In [15]:
def recommend_interface(user_query):
    user_query = user_query.strip()
    if not user_query:
        return "Please enter a book title or preference."
    
    try:
        results = recommend(user_query)
        if results.empty:
            return "No recommendations found."
        
        # Format output string
        output_lines = [f"📚 Top {len(results)} book recommendations for: \"{user_query}\"\n"]
        for idx, row in results.iterrows():
            output_lines.append(f"{idx + 1}. {row['title']} by {row['author']}")
            output_lines.append(f"   Genre(s): {row['genres']}")
            output_lines.append(f"   ⭐ Avg Rating: {row['avg_rating']}\n")
        
        return "\n".join(output_lines)

    except Exception as e:
        return f"❌ Error occurred: {str(e)}"

In [16]:
iface = gr.Interface(
    fn=recommend_interface,
    inputs=gr.Textbox(lines=1, placeholder="Type your book preference here..."),
    outputs="text",
    title="📚 Book Recommendation System"
)

iface.launch(inline=True, share=True)


* Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


