In [1]:
!pip install torch torchaudio torchvision sentence-transformers
!pip install sentence-transformers faiss-gpu  # or faiss-cpu if GPU is not available

Collecting torch
  Downloading torch-2.3.0-cp38-cp38-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hCollecting torchaudio
  Downloading torchaudio-2.3.0-cp38-cp38-manylinux1_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading torchvision-0.18.0-cp38-cp38-manylinux1_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_

Installing collected packages: mpmath, typing-extensions, triton, sympy, safetensors, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, fsspec, nvidia-cusparse-cu12, nvidia-cudnn-cu12, huggingface-hub, tokenizers, nvidia-cusolver-cu12, transformers, torch, torchvision, torchaudio, sentence-transformers
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.3.0
    Uninstalling typing_extensions-4.3.0:
      Successfully uninstalled typing_extensions-4.3.0
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2021.7.0
    Uninstalling fsspec-2021.7.0:
      Successfully uninstalled fsspec-2021.7.0
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.0.12
    Uninstalling huggingface-hub-0.0.12:
      Successfully uninstalled huggingface-hub-0.0.12
  Atte

Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
[0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1673, in print
    extend(render(renderable, render_options))
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1305, in render
    for render_output in iter_render:
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/segment.py", line 249, in split_lines
    for segment in segments:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1283, in render
    renderable = rich_cast(renderable)
 

In [None]:
# Necessary imports
import torch
import pandas as pd
import numpy as np
import time
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Custom Dataset class for handling triplet data
class TripletDataset(Dataset):
    """Map-style dataset exposing the rows of a dataframe as triplet dicts.

    Each sample is a dict with the keys 'anchor', 'positive' and 'negative',
    read from the columns of the same name in the wrapped dataframe.
    """

    def __init__(self, dataframe):
        # Only a reference is stored; rows are looked up on demand.
        self.dataframe = dataframe

    def __len__(self):
        """Number of triplets, i.e. the number of rows in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the triplet stored at positional row `idx`."""
        row = self.dataframe.iloc[idx]
        return {field: row[field] for field in ('anchor', 'positive', 'negative')}

# Load the pretrained Sentence Transformer checkpoint, then move it onto the selected device.
# This `model` object is shared by the training wrapper and by the evaluation cells.
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

# Triplet Loss Model definition
class TripletLossModel(nn.Module):
    """Embeds (anchor, positive, negative) text batches with the shared sentence encoder.

    The embeddings are produced by the encoder's regular forward pass so they stay
    attached to the autograd graph. `SentenceTransformer.encode()` is an inference
    helper that runs under `torch.no_grad()`; a loss computed on its output cannot
    back-propagate into the encoder.
    """

    def __init__(self):
        super().__init__()
        # Registering the module-level encoder as a submodule exposes its weights
        # through `self.parameters()` for the optimizer.
        self.embedding_model = model

    def _embed(self, texts):
        """Tokenize `texts` (a string or a sequence of strings) and return their embeddings."""
        is_single = isinstance(texts, str)
        batch = [texts] if is_single else list(texts)

        features = self.embedding_model.tokenize(batch)
        # Token tensors are created on the CPU; move them next to the model weights.
        features = {
            key: value.to(device) if isinstance(value, torch.Tensor) else value
            for key, value in features.items()
        }
        # The whole batch goes through in one pass (encode() would chunk it internally).
        embeddings = self.embedding_model(features)['sentence_embedding']

        # Mirror encode(): a bare string yields a 1-D vector, a list yields (batch, dim).
        return embeddings[0] if is_single else embeddings

    def forward(self, anchor, positive, negative):
        """Return the embeddings of the three inputs as (anchor_emb, positive_emb, negative_emb)."""
        anchor_emb = self._embed(anchor)
        positive_emb = self._embed(positive)
        negative_emb = self._embed(negative)
        return anchor_emb, positive_emb, negative_emb

# Read the triplet CSV and wrap it for batched, shuffled iteration
df_triplets = pd.read_csv("triplet_data.csv")
triplet_dataset = TripletDataset(dataframe=df_triplets)
triplet_loader = DataLoader(
    dataset=triplet_dataset,
    batch_size=10,
    shuffle=True,
    num_workers=5,
)

# Training objects: the wrapped encoder, the triplet margin loss and the optimizer
triplet_model = TripletLossModel()
triplet_model = triplet_model.to(device)
loss_function = nn.TripletMarginLoss(margin=1.0)
# NOTE(review): lr=0.01 is far above the rates usually used to fine-tune transformer encoders — confirm it is intended.
optimizer = optim.Adam(params=triplet_model.parameters(), lr=0.01)




In [None]:
# Function to preprocess and embed texts using Sentence Transformers
def preprocess_and_embed(texts):
    """Encode `texts` with the shared sentence-transformer `model`.

    The texts are passed to `model.encode` unchanged. Tensor conversion is disabled
    and a progress bar is shown while encoding on the module-level `device`.
    """
    encode_options = {
        "convert_to_tensor": False,
        "show_progress_bar": True,
        "device": device,
    }
    return model.encode(texts, **encode_options)

# Load the book catalogue and drop rows that lack a title or a description
df = pd.read_csv("cleaned_dataset_with_renamed_description.csv")
df = df.dropna(subset=['title', 'description'])

# One text per book ("<title> <description>"), embedded in the same row order as `df`
combined_texts = df['title'] + " " + df['description']
book_embeddings = preprocess_and_embed(combined_texts.tolist())

# Load test data
test_df = pd.read_csv("enhanced_test_data_n=4.csv")
# Helper: ground-truth titles of one test row as a set
def _parse_recommended_titles(raw_titles):
    """Return the titles of one 'Recommended_Books' cell as a set.

    Accepts the raw ', '-separated string from the CSV, or a set/frozenset
    when the column has already been parsed.
    """
    if isinstance(raw_titles, (set, frozenset)):
        return set(raw_titles)
    return set(raw_titles.split(', '))


# Function to evaluate the recommendation system
def evaluate_recommendations(test_df, book_embeddings, df, top_n=50):
    """Score embedding-based recommendations against the expected titles in `test_df`.

    For each test row, the books are ranked by cosine similarity between the row's
    'Combined_Description' embedding and `book_embeddings`. The row score is the
    fraction of its expected titles found among the `top_n` best-ranked titles.
    The reported value is the mean of the row scores.

    Parameters
    ----------
    test_df : DataFrame with the columns 'Combined_Description' and
        'Recommended_Books' (a ', '-separated string of titles, or a set of titles).
        It is not modified.
    book_embeddings : 2-D array of book embeddings; row i must belong to `df.iloc[i]`.
    df : DataFrame with a 'title' column, positionally aligned with `book_embeddings`.
    top_n : number of highest-ranked books treated as recommendations (at least 1).

    Returns
    -------
    dict with the single key "Average Accuracy" (NaN when `test_df` has no rows).

    Raises
    ------
    ValueError if `top_n` is smaller than 1.
    """
    # A slice of [-0:] would select the whole catalogue, so reject it explicitly.
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Parse into locals instead of overwriting the caller's column, so the same
    # frame can be evaluated repeatedly (e.g. for several values of top_n).
    user_inputs = test_df['Combined_Description'].tolist()
    true_book_sets = [_parse_recommended_titles(cell) for cell in test_df['Recommended_Books']]

    # Encode every query in one call: one progress bar and batched inference
    # instead of one encode() call per test row.
    user_embeddings = preprocess_and_embed(user_inputs) if user_inputs else []

    accuracy_list = []
    for user_embedding, true_books in zip(user_embeddings, true_book_sets):
        # Similarity of this query to every book, as a flat vector
        similarities = cosine_similarity([user_embedding], book_embeddings).flatten()

        # Positions of the top_n most similar books, best first
        top_indices = np.argsort(similarities)[-top_n:][::-1]
        predicted_books = set(df.iloc[top_indices]['title'])

        # Share of the expected titles that were actually recommended
        accuracy_list.append(len(true_books & predicted_books) / len(true_books))

    # Avoid numpy's empty-mean warning when there is nothing to evaluate
    average_accuracy = np.mean(accuracy_list) if accuracy_list else float('nan')

    # Print and return the average accuracy
    print(f"Average Accuracy: {average_accuracy:.4f}")
    return {
        "Average Accuracy": average_accuracy,
    }

# Score the recommender on the test set, using the 50 most similar books per query
print("for n=4")
results = evaluate_recommendations(
    test_df=test_df,
    book_embeddings=book_embeddings,
    df=df,
    top_n=50,
)

In [None]:
# Evaluation with top_n=40.
# preprocess_and_embed, evaluate_recommendations, df and book_embeddings come from the
# first evaluation cell. This cell used to redefine both functions and re-encode the whole
# catalogue from the same CSV with the same model, so they are reused here instead.

# Reload the test data so 'Recommended_Books' holds the raw strings from the CSV
test_df = pd.read_csv("enhanced_test_data_n=4.csv")

print("for n=4")
# Evaluate the recommendation system using the test data
results = evaluate_recommendations(test_df, book_embeddings, df, top_n=40)

In [None]:
# Evaluation with top_n=60.
# preprocess_and_embed, evaluate_recommendations, df and book_embeddings come from the
# first evaluation cell. This cell used to redefine both functions and re-encode the whole
# catalogue from the same CSV with the same model, so they are reused here instead.

# Reload the test data so 'Recommended_Books' holds the raw strings from the CSV
test_df = pd.read_csv("enhanced_test_data_n=4.csv")

print("for n=4")
# Evaluate the recommendation system using the test data
results = evaluate_recommendations(test_df, book_embeddings, df, top_n=60)

In [None]:
# Evaluation with top_n=10.
# preprocess_and_embed, evaluate_recommendations, df and book_embeddings come from the
# first evaluation cell. This cell used to redefine both functions and re-encode the whole
# catalogue from the same CSV with the same model, so they are reused here instead.

# Reload the test data so 'Recommended_Books' holds the raw strings from the CSV
test_df = pd.read_csv("enhanced_test_data_n=4.csv")

print("for n=4")
# Evaluate the recommendation system using the test data
results = evaluate_recommendations(test_df, book_embeddings, df, top_n=10)

In [None]:
# Evaluation with top_n=20.
# preprocess_and_embed, evaluate_recommendations, df and book_embeddings come from the
# first evaluation cell. This cell used to redefine both functions and re-encode the whole
# catalogue from the same CSV with the same model, so they are reused here instead.

# Reload the test data so 'Recommended_Books' holds the raw strings from the CSV
test_df = pd.read_csv("enhanced_test_data_n=4.csv")

print("for n=4")
# Evaluate the recommendation system using the test data
results = evaluate_recommendations(test_df, book_embeddings, df, top_n=20)

In [None]:
# Evaluation with top_n=30.
# preprocess_and_embed, evaluate_recommendations, df and book_embeddings come from the
# first evaluation cell. This cell used to redefine both functions and re-encode the whole
# catalogue from the same CSV with the same model, so they are reused here instead.

# Reload the test data so 'Recommended_Books' holds the raw strings from the CSV
test_df = pd.read_csv("enhanced_test_data_n=4.csv")

print("for n=4")
# Evaluate the recommendation system using the test data
results = evaluate_recommendations(test_df, book_embeddings, df, top_n=30)