In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertModel, DistilBertTokenizer

# Load your dataset
BOOKS_CSV = "books.csv"
df = pd.read_csv(BOOKS_CSV)

# Fail fast: later cells index df['title'] and df['description'], so catch a
# wrong or renamed CSV here rather than after the model has been loaded.
missing_columns = {"title", "description"} - set(df.columns)
if missing_columns:
    raise KeyError(f"{BOOKS_CSV} is missing required column(s): {sorted(missing_columns)}")

In [None]:

# Load pre-trained DistilBERT model and tokenizer.
# A single constant keeps the tokenizer vocabulary and the model weights tied
# to the same checkpoint.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
# .eval() returns the model itself; chaining it makes inference mode (dropout
# disabled) explicit without emitting the model repr as cell output.
model = DistilBertModel.from_pretrained(MODEL_NAME).eval()


In [None]:
def get_embeddings(text):
    """Embed text with the module-level DistilBERT ``model`` and ``tokenizer``.

    The token vectors of ``last_hidden_state`` are averaged per input, weighted
    by the tokenizer's attention mask so that padding positions (present when a
    list of texts of different lengths is passed) do not dilute the mean.

    Parameters
    ----------
    text : str or list of str
        Passed straight to ``tokenizer`` with truncation and padding enabled.

    Returns
    -------
    numpy.ndarray
        The pooled vectors with singleton dimensions squeezed out: 1-D for a
        single text, one row per text when several are given.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: disabling autograd avoids building a graph on every call.
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
        # (batch, seq) -> (batch, seq, 1) so the mask broadcasts over features.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_states.dtype)
        # clamp guards against division by zero for an all-masked row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (token_states * mask).sum(dim=1) / token_counts

    return pooled.squeeze().numpy()

In [None]:


# Process book data
# Missing values must be blanked before the cast: astype(str) on NaN yields the
# literal text "nan", which would otherwise be embedded as if it were content.
book_texts = (
    df['title'].fillna('').astype(str)
    + ' '
    + df['description'].fillna('').astype(str)
).str.strip()

# One embedding per book, stacked into a single array (one row per book).
book_embeddings = np.array([get_embeddings(book_text) for book_text in book_texts])

# The user input is defined and scored in the next cell


In [None]:
TOP_N = 10  # How many recommendations to show
user_input = "gravity"  # Replace with the actual user input

# cosine_similarity expects 2D inputs, so the query vector becomes a
# single-row matrix.
user_embedding = get_embeddings(user_input).reshape(1, -1)

# Calculate cosine similarity between user input and book embeddings;
# row 0 holds the score of the query against every book.
similarities = cosine_similarity(user_embedding, book_embeddings)

# argsort is ascending, so reverse for best-first and keep the first TOP_N.
# Slicing after the reversal also behaves correctly when TOP_N is 0.
top_indices = similarities[0].argsort()[::-1][:TOP_N]

# Print the top similarities along with corresponding books
print(f"Top {TOP_N} Similarities:")
for rank, idx in enumerate(top_indices, start=1):
    similarity_score = similarities[0, idx]
    book_title = df.iloc[idx]['title']
    print(f"{rank}. Similarity: {similarity_score:.4f} - Title: {book_title}")

# Display top N recommendations
top_recommendations = df.iloc[top_indices]
print(top_recommendations[['title']])
