In [1]:
!pip install torch torchaudio torchvision

Collecting torch
  Downloading torch-2.3.0-cp38-cp38-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m871.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hCollecting torchaudio
  Downloading torchaudio-2.3.0-cp38-cp38-manylinux1_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading torchvision-0.18.0-cp38-cp38-manylinux1_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.2/176.2 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-nvtx-cu12==12.1.10

In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Build the DistilBERT tokenizer/encoder pair a single time
def load_model_tokenizer():
    """Load the pretrained DistilBERT tokenizer and encoder.

    The encoder is moved to CUDA when ``torch.cuda.is_available()`` is true
    and to the CPU otherwise.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.
    """
    checkpoint = 'distilbert-base-uncased'
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    model = DistilBertModel.from_pretrained(checkpoint)
    model.to(device)
    return tokenizer, model, device

tokenizer, model, device = load_model_tokenizer()

# Read the book table, trim whitespace from both text columns, swap commas in
# titles for spaces, and renumber the rows from zero
df = (
    pd.read_csv("cleaned_dataset_with_renamed_description.csv")
    .assign(
        title=lambda frame: frame['title'].str.strip().replace(',', ' ', regex=True),
        description=lambda frame: frame['description'].str.strip(),
    )
    .reset_index(drop=True)
)

# Function to get embeddings
def get_embeddings(texts, batch_size=32):
    """Embed texts with DistilBERT and return L2-normalized row vectors.

    Each text becomes the mean of its token vectors from the model's last
    hidden state. Padding positions are excluded from that mean through the
    tokenizer's attention mask, so a text gets the same vector regardless of
    which other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        numpy array with one normalized row per input text (zero rows when
        ``texts`` is empty).
    """
    model.eval()  # inference mode (no dropout)
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            hidden = model(**inputs).last_hidden_state
            # 1.0 for real tokens, 0.0 for padding; the extra axis broadcasts over features
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against a division by zero should a row have no tokens
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).cpu())
    if not pooled_batches:
        # torch.cat rejects an empty list; hand back an empty matrix instead
        return torch.empty(0, model.config.hidden_size).numpy()
    return normalize(torch.cat(pooled_batches).numpy())

# Generate embeddings
texts = (df['title'] + ' ' + df['description']).tolist()
book_embeddings = get_embeddings(texts)

# Embed every description once up front: the query vectors do not depend on n,
# so computing them inside the loop below would repeat identical model runs.
# Each description is still encoded on its own, exactly as before.
query_embeddings = [get_embeddings([description])[0] for description in df['description']]

# Build one recommendation file per neighbour count; range(6, 8) yields n = 6 and n = 7
for n in range(6, 8):
    ann_model = NearestNeighbors(n_neighbors=n, metric='cosine')
    ann_model.fit(book_embeddings)

    # Single batched lookup: row i holds the indices of the n books nearest to description i
    _, neighbor_indices = ann_model.kneighbors(query_embeddings)

    recommendations = []
    for row in neighbor_indices:
        neighbors = df.iloc[row]
        recommendations.append({
            'Combined_Description': ' '.join(neighbors['description']),
            'Recommended_Books': ', '.join(neighbors['title']),
        })

    # Create a DataFrame from recommendations
    test_data = pd.DataFrame(recommendations)

    # Save the test data to a CSV file with the value of n in the filename
    filename = f'enhanced_test_data_n={n}.csv'
    test_data.to_csv(filename, index=False)

    print(f'Saved test data for n={n} to {filename}')


  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 462kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 16.4kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 710kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 483/483 [00:00<00:00, 229kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 268M/268M [00:25<00:00, 10.7MB/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transfo

Saved test data for n=6 to enhanced_test_data_n=6.csv
Saved test data for n=7 to enhanced_test_data_n=7.csv


In [None]:
!pip install torch torchvision torchaudio
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import normalize
import pandas as pd

# Build the DistilBERT tokenizer/encoder pair a single time
def load_model_tokenizer():
    """Load the pretrained DistilBERT tokenizer and encoder.

    The encoder is moved to CUDA when ``torch.cuda.is_available()`` is true
    and to the CPU otherwise.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.
    """
    checkpoint = 'distilbert-base-uncased'
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    model = DistilBertModel.from_pretrained(checkpoint)
    model.to(device)
    return tokenizer, model, device

tokenizer, model, device = load_model_tokenizer()

# Read the book table, trim whitespace from both text columns, swap commas in
# titles for spaces, and renumber the rows from zero
df = (
    pd.read_csv("cleaned_dataset_with_renamed_description.csv")
    .assign(
        title=lambda frame: frame['title'].str.strip().replace(',', ' ', regex=True),
        description=lambda frame: frame['description'].str.strip(),
    )
    .reset_index(drop=True)
)

# Get embeddings
def get_embeddings(texts, batch_size=32):
    """Embed texts with DistilBERT and return L2-normalized row vectors.

    Each text becomes the mean of its token vectors from the model's last
    hidden state. Padding positions are excluded from that mean through the
    tokenizer's attention mask, so a text gets the same vector regardless of
    which other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        numpy array with one normalized row per input text (zero rows when
        ``texts`` is empty).
    """
    model.eval()  # inference mode (no dropout)
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            hidden = model(**inputs).last_hidden_state
            # 1.0 for real tokens, 0.0 for padding; the extra axis broadcasts over features
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against a division by zero should a row have no tokens
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).cpu())
    if not pooled_batches:
        # torch.cat rejects an empty list; hand back an empty matrix instead
        return torch.empty(0, model.config.hidden_size).numpy()
    return normalize(torch.cat(pooled_batches).numpy())

# Generate embeddings
texts = (df['title'] + ' ' + df['description']).tolist()
book_embeddings = get_embeddings(texts)

# Nearest neighbors for recommendations
from sklearn.neighbors import NearestNeighbors
ann_model = NearestNeighbors(n_neighbors=3, metric='cosine')
ann_model.fit(book_embeddings)

# Recommendation logic
# Each description is encoded on its own (as before); the neighbour search is
# then done in one batched call instead of one call per row.
query_embeddings = [get_embeddings([description])[0] for description in df['description']]
_, neighbor_indices = ann_model.kneighbors(query_embeddings)

recommendations = []
for row in neighbor_indices:
    # row holds the positions of the 3 books nearest to one description
    neighbors = df.iloc[row]
    recommendations.append({
        'Combined_Description': ' '.join(neighbors['description']),
        'Recommended_Books': ', '.join(neighbors['title']),
    })

# Save results
test_data = pd.DataFrame(recommendations)
test_data.to_csv('enhanced_test_data_n=3.csv', index=False)


In [None]:
!pip install torch torchvision torchaudio
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import normalize
import pandas as pd

# Build the DistilBERT tokenizer/encoder pair a single time
def load_model_tokenizer():
    """Load the pretrained DistilBERT tokenizer and encoder.

    The encoder is moved to CUDA when ``torch.cuda.is_available()`` is true
    and to the CPU otherwise.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.
    """
    checkpoint = 'distilbert-base-uncased'
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    model = DistilBertModel.from_pretrained(checkpoint)
    model.to(device)
    return tokenizer, model, device

tokenizer, model, device = load_model_tokenizer()

# Read the book table, trim whitespace from both text columns, swap commas in
# titles for spaces, and renumber the rows from zero
df = (
    pd.read_csv("cleaned_dataset_with_renamed_description.csv")
    .assign(
        title=lambda frame: frame['title'].str.strip().replace(',', ' ', regex=True),
        description=lambda frame: frame['description'].str.strip(),
    )
    .reset_index(drop=True)
)

# Get embeddings
def get_embeddings(texts, batch_size=32):
    """Embed texts with DistilBERT and return L2-normalized row vectors.

    Each text becomes the mean of its token vectors from the model's last
    hidden state. Padding positions are excluded from that mean through the
    tokenizer's attention mask, so a text gets the same vector regardless of
    which other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        numpy array with one normalized row per input text (zero rows when
        ``texts`` is empty).
    """
    model.eval()  # inference mode (no dropout)
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            hidden = model(**inputs).last_hidden_state
            # 1.0 for real tokens, 0.0 for padding; the extra axis broadcasts over features
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against a division by zero should a row have no tokens
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).cpu())
    if not pooled_batches:
        # torch.cat rejects an empty list; hand back an empty matrix instead
        return torch.empty(0, model.config.hidden_size).numpy()
    return normalize(torch.cat(pooled_batches).numpy())

# Generate embeddings
texts = (df['title'] + ' ' + df['description']).tolist()
book_embeddings = get_embeddings(texts)

# Nearest neighbors for recommendations
from sklearn.neighbors import NearestNeighbors
ann_model = NearestNeighbors(n_neighbors=4, metric='cosine')
ann_model.fit(book_embeddings)

# Recommendation logic
# Each description is encoded on its own (as before); the neighbour search is
# then done in one batched call instead of one call per row.
query_embeddings = [get_embeddings([description])[0] for description in df['description']]
_, neighbor_indices = ann_model.kneighbors(query_embeddings)

recommendations = []
for row in neighbor_indices:
    # row holds the positions of the 4 books nearest to one description
    neighbors = df.iloc[row]
    recommendations.append({
        'Combined_Description': ' '.join(neighbors['description']),
        'Recommended_Books': ', '.join(neighbors['title']),
    })

# Save results
test_data = pd.DataFrame(recommendations)
test_data.to_csv('enhanced_test_data_n=4.csv', index=False)


In [None]:
!pip install torch torchvision torchaudio
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import normalize
import pandas as pd

# Build the DistilBERT tokenizer/encoder pair a single time
def load_model_tokenizer():
    """Load the pretrained DistilBERT tokenizer and encoder.

    The encoder is moved to CUDA when ``torch.cuda.is_available()`` is true
    and to the CPU otherwise.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.
    """
    checkpoint = 'distilbert-base-uncased'
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    model = DistilBertModel.from_pretrained(checkpoint)
    model.to(device)
    return tokenizer, model, device

tokenizer, model, device = load_model_tokenizer()

# Read the book table, trim whitespace from both text columns, swap commas in
# titles for spaces, and renumber the rows from zero
df = (
    pd.read_csv("cleaned_dataset_with_renamed_description.csv")
    .assign(
        title=lambda frame: frame['title'].str.strip().replace(',', ' ', regex=True),
        description=lambda frame: frame['description'].str.strip(),
    )
    .reset_index(drop=True)
)

# Get embeddings
def get_embeddings(texts, batch_size=32):
    """Embed texts with DistilBERT and return L2-normalized row vectors.

    Each text becomes the mean of its token vectors from the model's last
    hidden state. Padding positions are excluded from that mean through the
    tokenizer's attention mask, so a text gets the same vector regardless of
    which other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        numpy array with one normalized row per input text (zero rows when
        ``texts`` is empty).
    """
    model.eval()  # inference mode (no dropout)
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            hidden = model(**inputs).last_hidden_state
            # 1.0 for real tokens, 0.0 for padding; the extra axis broadcasts over features
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against a division by zero should a row have no tokens
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).cpu())
    if not pooled_batches:
        # torch.cat rejects an empty list; hand back an empty matrix instead
        return torch.empty(0, model.config.hidden_size).numpy()
    return normalize(torch.cat(pooled_batches).numpy())

# Generate embeddings
texts = (df['title'] + ' ' + df['description']).tolist()
book_embeddings = get_embeddings(texts)

# Nearest neighbors for recommendations
from sklearn.neighbors import NearestNeighbors
ann_model = NearestNeighbors(n_neighbors=5, metric='cosine')
ann_model.fit(book_embeddings)

# Recommendation logic
# Each description is encoded on its own (as before); the neighbour search is
# then done in one batched call instead of one call per row.
query_embeddings = [get_embeddings([description])[0] for description in df['description']]
_, neighbor_indices = ann_model.kneighbors(query_embeddings)

recommendations = []
for row in neighbor_indices:
    # row holds the positions of the 5 books nearest to one description
    neighbors = df.iloc[row]
    recommendations.append({
        'Combined_Description': ' '.join(neighbors['description']),
        'Recommended_Books': ', '.join(neighbors['title']),
    })

# Save results
# File name follows the 'enhanced_test_data_n=<n>.csv' pattern used for every other n
test_data = pd.DataFrame(recommendations)
test_data.to_csv('enhanced_test_data_n=5.csv', index=False)


In [None]:
!pip install torch torchvision torchaudio
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import normalize
import pandas as pd

# Build the DistilBERT tokenizer/encoder pair a single time
def load_model_tokenizer():
    """Load the pretrained DistilBERT tokenizer and encoder.

    The encoder is moved to CUDA when ``torch.cuda.is_available()`` is true
    and to the CPU otherwise.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.
    """
    checkpoint = 'distilbert-base-uncased'
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    model = DistilBertModel.from_pretrained(checkpoint)
    model.to(device)
    return tokenizer, model, device

tokenizer, model, device = load_model_tokenizer()

# Read the book table, trim whitespace from both text columns, swap commas in
# titles for spaces, and renumber the rows from zero
df = (
    pd.read_csv("cleaned_dataset_with_renamed_description.csv")
    .assign(
        title=lambda frame: frame['title'].str.strip().replace(',', ' ', regex=True),
        description=lambda frame: frame['description'].str.strip(),
    )
    .reset_index(drop=True)
)

# Get embeddings
def get_embeddings(texts, batch_size=32):
    """Embed texts with DistilBERT and return L2-normalized row vectors.

    Each text becomes the mean of its token vectors from the model's last
    hidden state. Padding positions are excluded from that mean through the
    tokenizer's attention mask, so a text gets the same vector regardless of
    which other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        numpy array with one normalized row per input text (zero rows when
        ``texts`` is empty).
    """
    model.eval()  # inference mode (no dropout)
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            hidden = model(**inputs).last_hidden_state
            # 1.0 for real tokens, 0.0 for padding; the extra axis broadcasts over features
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against a division by zero should a row have no tokens
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).cpu())
    if not pooled_batches:
        # torch.cat rejects an empty list; hand back an empty matrix instead
        return torch.empty(0, model.config.hidden_size).numpy()
    return normalize(torch.cat(pooled_batches).numpy())

# Generate embeddings
texts = (df['title'] + ' ' + df['description']).tolist()
book_embeddings = get_embeddings(texts)

# Nearest neighbors for recommendations
from sklearn.neighbors import NearestNeighbors
ann_model = NearestNeighbors(n_neighbors=6, metric='cosine')
ann_model.fit(book_embeddings)

# Recommendation logic
# Each description is encoded on its own (as before); the neighbour search is
# then done in one batched call instead of one call per row.
query_embeddings = [get_embeddings([description])[0] for description in df['description']]
_, neighbor_indices = ann_model.kneighbors(query_embeddings)

recommendations = []
for row in neighbor_indices:
    # row holds the positions of the 6 books nearest to one description
    neighbors = df.iloc[row]
    recommendations.append({
        'Combined_Description': ' '.join(neighbors['description']),
        'Recommended_Books': ', '.join(neighbors['title']),
    })

# Save results
test_data = pd.DataFrame(recommendations)
test_data.to_csv('enhanced_test_data_n=6.csv', index=False)


In [None]:
!pip install torch torchvision torchaudio
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import normalize
import pandas as pd

# Build the DistilBERT tokenizer/encoder pair a single time
def load_model_tokenizer():
    """Load the pretrained DistilBERT tokenizer and encoder.

    The encoder is moved to CUDA when ``torch.cuda.is_available()`` is true
    and to the CPU otherwise.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.
    """
    checkpoint = 'distilbert-base-uncased'
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    model = DistilBertModel.from_pretrained(checkpoint)
    model.to(device)
    return tokenizer, model, device

tokenizer, model, device = load_model_tokenizer()

# Read the book table, trim whitespace from both text columns, swap commas in
# titles for spaces, and renumber the rows from zero
df = (
    pd.read_csv("cleaned_dataset_with_renamed_description.csv")
    .assign(
        title=lambda frame: frame['title'].str.strip().replace(',', ' ', regex=True),
        description=lambda frame: frame['description'].str.strip(),
    )
    .reset_index(drop=True)
)

# Get embeddings
def get_embeddings(texts, batch_size=32):
    """Embed texts with DistilBERT and return L2-normalized row vectors.

    Each text becomes the mean of its token vectors from the model's last
    hidden state. Padding positions are excluded from that mean through the
    tokenizer's attention mask, so a text gets the same vector regardless of
    which other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        numpy array with one normalized row per input text (zero rows when
        ``texts`` is empty).
    """
    model.eval()  # inference mode (no dropout)
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            hidden = model(**inputs).last_hidden_state
            # 1.0 for real tokens, 0.0 for padding; the extra axis broadcasts over features
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against a division by zero should a row have no tokens
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).cpu())
    if not pooled_batches:
        # torch.cat rejects an empty list; hand back an empty matrix instead
        return torch.empty(0, model.config.hidden_size).numpy()
    return normalize(torch.cat(pooled_batches).numpy())

# Generate embeddings
texts = (df['title'] + ' ' + df['description']).tolist()
book_embeddings = get_embeddings(texts)

# Nearest neighbors for recommendations
from sklearn.neighbors import NearestNeighbors
ann_model = NearestNeighbors(n_neighbors=7, metric='cosine')
ann_model.fit(book_embeddings)

# Recommendation logic
# Each description is encoded on its own (as before); the neighbour search is
# then done in one batched call instead of one call per row.
query_embeddings = [get_embeddings([description])[0] for description in df['description']]
_, neighbor_indices = ann_model.kneighbors(query_embeddings)

recommendations = []
for row in neighbor_indices:
    # row holds the positions of the 7 books nearest to one description
    neighbors = df.iloc[row]
    recommendations.append({
        'Combined_Description': ' '.join(neighbors['description']),
        'Recommended_Books': ', '.join(neighbors['title']),
    })

# Save results
test_data = pd.DataFrame(recommendations)
test_data.to_csv('enhanced_test_data_n=7.csv', index=False)


In [2]:
import pandas as pd
# Load the n=4 recommendation file and preview its first five rows
df = pd.read_csv('enhanced_test_data_n=4.csv')
df.head(n=5)

Unnamed: 0,Combined_Description,Recommended_Books
0,"The book, Gilead, has been eagerly awaited by ...","Gilead, Song of Solomon, C.S. Lewis, Lady on t..."
1,Agatha Christie's final play novelisation is a...,"Spider's Web, The Admirable Crichton ; Peter P..."
2,The second volume of Stephen Donaldson's accla...,"The One Tree, The Fellowship of the Ring, In t..."
3,The Dark Arena of organized crime and its flas...,"Rage of angels, Exit Strategy, The Laws of Our..."
4,Lewis' study on the nature of love categorized...,"The Four Loves, The Problem of Pain, Selected ..."


In [4]:
# Load the triplet file and preview its first five rows.
# NOTE(review): no visible cell writes triplet_data.csv — confirm where it is produced.
df = pd.read_csv('triplet_data.csv')
df.head(n=5)

Unnamed: 0,anchor,positive,negative
0,"The book, Gilead, has been eagerly awaited by ...","The world-renowned writer of Patrick Bateman, ...",Outlines the process of building Web-based app...
1,Agatha Christie's final play novelisation is a...,The New York Times and USA TODAY both praise t...,"Provides current, comprehensive coverage that ..."
2,The second volume of Stephen Donaldson's accla...,"Tolkien's timeless fantasy, which centers on t...","Contains general instructions for fitting, tai..."
3,The Dark Arena of organized crime and its flas...,"In this stunning sequel to ""Dead I Well May Be...",The Maple Syrup Cookbook 8-Copy Display is ava...
4,Lewis' study on the nature of love categorized...,The Puritan tradition's character and life wer...,The 1967 edition of the book was reprinted by ...
