In [None]:
# Import necessary libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add project root to path
# The notebook is expected to run from a sub-directory of the project, so the
# parent directory is treated as the project root.
project_root = Path('..').resolve()
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import project modules
from src.data_processing import TextProcessor
from src.recommender import ContentBasedRecommender
from src.evaluation import ModelEvaluator

# Set plot style
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6;
# fall back to the pre-3.6 name so the notebook also runs on older installs.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
sns.set_palette("viridis")

# Display settings
# None removes the limit: show every column and never truncate cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


In [None]:
# Load the dataset
# Prefer the augmented dataset; if it has not been generated yet, fall back to
# the sample data shipped with the previous related works.
try:
    # Try to load the augmented data first
    data_path = project_root / 'data' / 'augmented_data.csv'
    if not data_path.exists():
        # If not available, use the sample data from previous related works
        data_path = project_root.parent / 'previous_related_works' / 'A1_Data_V2.csv'

    df = pd.read_csv(data_path)
    print(f"Loaded data from {data_path}")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print("Error: Could not find the dataset. Please run data collection first.")
    # Later cells check `df is not None` before doing any work.
    df = None

# Display the first few rows
if df is not None:
    # Jupyter only auto-renders a bare expression at the top level of a cell;
    # nested inside an `if` it must be passed to display() explicitly.
    display(df.head())


In [None]:
# Summarise missing data: null count per column and its share of all rows
if df is not None:
    missing_values = df.isna().sum()
    missing_percent = missing_values.div(len(df)).mul(100)

    missing_df = pd.DataFrame(
        {'Missing Values': missing_values, 'Percentage': missing_percent}
    )

    print("Missing values in each column:")
    # Show only the columns that actually have gaps, worst offenders first.
    display(
        missing_df
        .loc[lambda frame: frame['Missing Values'] > 0]
        .sort_values(by='Missing Values', ascending=False)
    )


In [None]:
# Plot how often each book type occurs
if df is not None and 'type' in df.columns:
    # A 'type' cell is split on ', ' so each listed type is counted on its own.
    book_types = (
        df['type']
        .dropna()
        .str.split(', ')
        .explode()
        .value_counts()
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=book_types.values, y=book_types.index, ax=ax)
    ax.set_title('Book Types Distribution')
    ax.set_xlabel('Count')
    ax.set_ylabel('Book Type')
    fig.tight_layout()
    plt.show()


In [None]:
# Process the data using our TextProcessor
if df is not None:
    processor = TextProcessor()
    df_processed = processor.preprocess(df)

    print(f"Processed data shape: {df_processed.shape}")
    print(f"New columns added: {set(df_processed.columns) - set(df.columns)}")

    # Display sample of processed data
    # Jupyter only auto-renders a bare expression at the top level of a cell;
    # nested inside an `if` it must be passed to display() explicitly.
    display(
        df_processed[['ISBN', 'title', 'author_processed', 'type_processed', 'start_year',
                      'decade', 'recency_score', 'popularity_score', 'corpus']].head()
    )


In [None]:
# Fit the content-based recommender, then score it
# Runs only when the data was loaded and the preprocessing cell has produced
# `df_processed` in this kernel session.
if df is not None and 'df_processed' in locals():
    recommender = ContentBasedRecommender()
    recommender.fit(df_processed)

    # Evaluate the fitted model; test_size=0.2 is passed straight to the evaluator.
    evaluator = ModelEvaluator()
    metrics = evaluator.evaluate(recommender, df_processed, test_size=0.2)

    # One line per metric, four decimal places.
    for metric, value in metrics.items():
        print("{}: {:.4f}".format(metric, value))


In [None]:
# Function to display recommendations
def display_recommendations(recommendations):
    """Print a numbered, human-readable list of recommended books.

    Parameters
    ----------
    recommendations : pandas.DataFrame or None
        One row per recommended book. The columns read are ``title``,
        ``similarity_score``, ``author``, ``type`` and ``ISBN``. A missing
        text column is printed as ``Unknown``; a missing or non-numeric
        similarity score is printed as ``N/A``.

    Returns
    -------
    None
        The output is written to stdout.
    """
    # Say so explicitly instead of failing on None or printing nothing at all.
    if recommendations is None or len(recommendations) == 0:
        print("No recommendations found.")
        return

    for rank, (_, book) in enumerate(recommendations.iterrows(), 1):
        # Read the score defensively so a frame without the column still prints.
        try:
            score_text = f"{float(book.get('similarity_score')):.4f}"
        except (TypeError, ValueError):
            score_text = "N/A"

        print(f"{rank}. {book.get('title', 'Unknown')} (Similarity: {score_text})")
        print(f"   Author: {book.get('author', 'Unknown')}")
        print(f"   Type: {book.get('type', 'Unknown')}")
        print(f"   ISBN: {book.get('ISBN', 'Unknown')}")
        print()

# Get recommendations for a sample book
# Runs only when the data, the processed frame and the fitted recommender all
# exist in this kernel session.
if df is not None and 'df_processed' in locals() and 'recommender' in locals():
    # Find a book with a non-null title
    titled_books = df_processed[df_processed['title'].notna()]

    if titled_books.empty:
        # .iloc[0] on an empty selection would raise IndexError.
        print("No books with a title are available to generate recommendations.")
    else:
        sample_book = titled_books.iloc[0]
        sample_title = sample_book['title']

        print(f"Generating recommendations for: {sample_title}")
        # Report lookup failures the same way the ISBN-based cell does.
        try:
            recommendations = recommender.recommend(sample_title, n=5)
            display_recommendations(recommendations)
        except ValueError as e:
            print(f"Error: {e}")


In [None]:
# Try recommendations by ISBN
# Runs only when the data, the processed frame and the fitted recommender all
# exist in this kernel session.
if df is not None and 'df_processed' in locals() and 'recommender' in locals():
    # Find a book with a non-null ISBN
    books_with_isbn = df_processed[df_processed['ISBN'].notna()]

    if books_with_isbn.empty:
        # .iloc[0] on an empty selection would raise IndexError.
        print("No books with an ISBN are available to generate recommendations.")
    else:
        sample_isbn = books_with_isbn.iloc[0]['ISBN']

        print(f"Generating recommendations for ISBN: {sample_isbn}")
        try:
            recommendations = recommender.get_recommendations_by_isbn(sample_isbn, n=5)
            display_recommendations(recommendations)
        except ValueError as e:
            print(f"Error: {e}")
