In [None]:
import pandas as pd

# De-duplicate the book dataset by title.
# The result is written back to the same file it was read from, because the
# later cells in this notebook load the catalogue from this exact path.
dataset_path = 'cleaned_dataset_with_renamed_description.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(dataset_path)

# Remove duplicate rows based on the 'title' column and keep the first occurrence
df_unique = df.drop_duplicates(subset='title', keep='first')

# Display the number of rows removed
num_duplicates = len(df) - len(df_unique)
print(f'Number of duplicate rows removed: {num_duplicates}')

# Save the DataFrame with duplicates removed (overwrites the input file)
df_unique.to_csv(dataset_path, index=False)

# Report the file that was actually written (the previous message named a
# different file that this cell never creates).
print(f'DataFrame with duplicates removed has been saved to "{dataset_path}"')

# Reload from disk and show the shape of what was saved
df = pd.read_csv(dataset_path)
print(df.shape)

In [None]:
# Necessary imports
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the 'all-MiniLM-L6-v2' Sentence Transformer and move it to that device
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

# Embedding helper built on the module-level Sentence Transformer `model`
def preprocess_and_embed(texts):
    """Encode ``texts`` with the global ``model`` and return the embeddings.

    Encoding runs on the global ``device`` with a progress bar shown, and
    ``convert_to_tensor=False`` is passed so the result is not a torch tensor.
    """
    return model.encode(
        texts,
        convert_to_tensor=False,
        show_progress_bar=True,
        device=device,
    )

# Read the catalogue and discard rows that lack a title or a description
df = pd.read_csv("cleaned_dataset_with_renamed_description.csv")
df = df.dropna(subset=['title', 'description'])

# One text per book: the title and the description separated by a space
combined_texts = df['title'] + " " + df['description']

# Embed every book once; row i of the result corresponds to df.iloc[i]
book_embeddings = preprocess_and_embed(combined_texts.tolist())

# Function to recommend books based on user input
def recommend_books(user_input, book_embeddings, df, top_n=10):
    """Print and return the books whose embeddings are closest to ``user_input``.

    Args:
        user_input: Free-text description of what the reader wants.
        book_embeddings: Book embeddings, one row per row of ``df`` and in the
            same order (results are looked up positionally with ``df.iloc``).
        df: Book table with a ``title`` column. ``author`` and ``genre`` are
            printed when present, otherwise ``'Unknown'``.
        top_n: Maximum number of books to return. Requests larger than the
            catalogue are clamped to its size; zero or a negative value gives
            an empty result.

    Returns:
        The selected rows of ``df``, most similar first.
    """
    # Clamp the request. Without this, top_n > len(df) made the print loop
    # index past the end, and top_n == 0 turned the `[-0:]` slice into
    # "every row".
    available = min(len(df), len(book_embeddings))
    k = max(0, min(int(top_n), available))
    if k == 0:
        # Nothing to rank: skip the embedding/similarity work entirely.
        print("Top 0 recommended books:")
        return df.iloc[0:0]

    # Convert user input to embedding
    user_embedding = preprocess_and_embed([user_input])[0]  # the only embedding

    # Calculate cosine similarity between user input and book embeddings
    similarities = cosine_similarity([user_embedding], book_embeddings).flatten()

    # Positions of the k most similar books, best match first
    top_indices = np.argsort(similarities)[-k:][::-1]

    # Get the top recommended books
    recommended_books = df.iloc[top_indices]

    # Print the top recommended books and their cosine similarity values
    print(f"Top {k} recommended books:")
    for position, book_idx in enumerate(top_indices):
        book = recommended_books.iloc[position]
        similarity = similarities[book_idx]
        print(f"Title: {book['title']}, Author: {book.get('author', 'Unknown')}, Genre: {book.get('genre', 'Unknown')}, Cosine Similarity: {similarity:.4f}")

    return recommended_books

# Ask the reader for a free-text description of the book they want
user_input = input("Please describe the type of book you want to read: ")

# Rank the catalogue against that description and show the 20 best matches
recommend_books(
    user_input,
    book_embeddings=book_embeddings,
    df=df,
    top_n=20,
)


In [None]:
import pandas as pd

# Load the evaluation file and list its column names
df_test = pd.read_csv('enhanced_test_data_n=2.csv')
print(df_test.columns)

In [7]:
# Necessary imports
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the 'all-MiniLM-L6-v2' Sentence Transformer and move it to that device
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

# Embedding helper for the evaluation run (progress bar switched off)
def preprocess_and_embed(texts):
    """Encode ``texts`` with the global ``model`` and return the embeddings.

    Encoding runs on the global ``device`` without a progress bar, and
    ``convert_to_tensor=False`` is passed so the result is not a torch tensor.
    """
    return model.encode(
        texts,
        convert_to_tensor=False,
        show_progress_bar=False,
        device=device,
    )

# Read the catalogue and discard rows that lack a title or a description
df = pd.read_csv("cleaned_dataset_with_renamed_description.csv")
df = df.dropna(subset=['title', 'description'])

# One text per book: the title and the description separated by a space
combined_texts = df['title'] + " " + df['description']

# Embed every book once; row i of the result corresponds to df.iloc[i]
book_embeddings = preprocess_and_embed(combined_texts.tolist())

# Read the labelled evaluation queries
test_df = pd.read_csv("enhanced_test_data_n=2.csv")
# Function to evaluate the recommendation system
def evaluate_recommendations(test_df, book_embeddings, df, top_n=20):
    """Score the recommender against labelled test queries.

    For every row of ``test_df`` the text in ``Combined_Description`` is
    embedded, the books are ranked by cosine similarity, and the row's score
    is the fraction of its expected titles found among the ``top_n`` best
    matches: ``len(expected & predicted) / len(expected)``. The mean of the
    row scores is printed and returned.

    Args:
        test_df: Frame with ``Combined_Description`` (query text) and
            ``Recommended_Books`` (expected titles as one ``', '``-separated
            string; an already-parsed set/list/tuple is accepted too).
            The frame is not modified.
        book_embeddings: Book embeddings, one row per row of ``df`` and in
            the same order.
        df: Book table with a ``title`` column.
        top_n: Number of top-ranked books counted as predictions; zero or a
            negative value means no predictions.

    Returns:
        ``{"Average Accuracy": mean_score}``. The mean is NaN when no row
        carries any expected titles.
    """
    def _as_title_set(value):
        # Parse one 'Recommended_Books' cell without touching the caller's frame.
        if isinstance(value, (set, frozenset, list, tuple)):
            return set(value)  # already parsed by an earlier step
        if isinstance(value, str):
            return set(value.split(', '))
        return set()  # NaN / None: no labels for this row

    # [::-1][:k] equals the former [-top_n:][::-1] for k >= 1, and is empty for
    # k == 0 (where `[-0:]` used to select every book).
    k = max(0, int(top_n))

    # Initialize list to store accuracy
    accuracy_list = []

    # Loop through each row in the test dataframe
    for _, row in test_df.iterrows():
        true_books = _as_title_set(row['Recommended_Books'])
        if not true_books:
            # Unlabelled row: nothing to score, and it would divide by zero.
            continue

        user_input = row['Combined_Description']

        # Get the user's embedding for the given combined description
        user_embedding = preprocess_and_embed([user_input])[0]

        # Calculate cosine similarity between the user's embedding and book embeddings
        similarities = cosine_similarity([user_embedding], book_embeddings).flatten()

        # Get indices of the k most similar books, best match first
        top_indices = np.argsort(similarities)[::-1][:k]

        # Get the top recommended books' titles
        predicted_books = set(df.iloc[top_indices]['title'])

        # Fraction of the expected titles that were actually recommended
        correct_recommendations = true_books.intersection(predicted_books)
        accuracy_list.append(len(correct_recommendations) / len(true_books))

    # Calculate the average accuracy (NaN, without a numpy warning, if nothing was scored)
    average_accuracy = np.mean(accuracy_list) if accuracy_list else float('nan')

    # Print and return the average accuracy
    print(f"Average Accuracy: {average_accuracy:.4f}")
    return {
        "Average Accuracy": average_accuracy,
    }

print("for n=2")
# Score the recommender on the loaded test queries, counting the 10 best matches per query
results = evaluate_recommendations(
    test_df,
    book_embeddings=book_embeddings,
    df=df,
    top_n=10,
)




for n=2
Average Accuracy: 0.9535
