In [8]:
import os
import sqlite3
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer

# Expose only physical GPU 4 to CUDA for this process, then fall back to
# the CPU when no CUDA device is usable.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the pre-trained sentence encoder and place it on the chosen device.
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1').to(device)

# Function to load embeddings from the database
def load_data_from_db(db_path='reference_db.sqlite'):
    """Load the stored texts and their embeddings from the SQLite database.

    Reads every row of the ``reference_data`` table and decodes each
    ``embedding`` blob as a flat ``float32`` vector.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A tuple ``(input_texts, references, input_vectors)`` where the first
        two items are lists of the ``input_text`` / ``reference_text`` column
        values and ``input_vectors`` is a 2-D ``float32`` array with one row
        per database row, in the same order.

    Raises:
        ValueError: If the table holds no rows, or the stored embeddings do
            not all have the same length (raised by ``np.vstack``).
        sqlite3.Error: If the database or table cannot be read.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT input_text, reference_text, embedding FROM reference_data"
        ).fetchall()
    finally:
        # Close even when the query fails so the connection is never leaked.
        conn.close()

    if not rows:
        # np.vstack([]) would fail with an unrelated-looking message.
        raise ValueError(
            f"no rows found in table 'reference_data' of {db_path!r}"
        )

    input_texts = [row[0] for row in rows]
    references = [row[1] for row in rows]
    # np.frombuffer returns read-only views; np.vstack copies them into one
    # writable (n_rows, dim) matrix.
    input_vectors = np.vstack(
        [np.frombuffer(row[2], dtype=np.float32) for row in rows]
    )
    return input_texts, references, input_vectors

# Function to load the FAISS index with vectors
def load_faiss_index(dimension, input_vectors, nlist=3):
    """Build, train and populate an IVF-Flat L2 FAISS index.

    Args:
        dimension: Length of each embedding vector.
        input_vectors: 2-D array of shape ``(n_vectors, dimension)``; it is
            converted to C-contiguous ``float32`` before being handed to
            FAISS.
        nlist: Requested number of IVF clusters. It is reduced to the number
            of vectors when fewer vectors than clusters are available,
            because training needs at least one point per cluster.

    Returns:
        The trained ``faiss.IndexIVFFlat`` containing all ``input_vectors``.
        The index is built on the CPU; ``nprobe`` is left at the FAISS
        default, so a search may visit only a subset of the clusters.

    Raises:
        ValueError: If ``input_vectors`` is empty, is not 2-D, or its row
            length differs from ``dimension``.
    """
    vectors = np.ascontiguousarray(input_vectors, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "input_vectors must be a non-empty 2-D array, "
            f"got shape {vectors.shape}"
        )
    if vectors.shape[1] != dimension:
        raise ValueError(
            f"input_vectors have dimension {vectors.shape[1]}, "
            f"expected {dimension}"
        )

    # Never ask for more clusters than there are training points.
    n_clusters = max(1, min(int(nlist), vectors.shape[0]))

    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, n_clusters, faiss.METRIC_L2)

    # An IVF index must be trained (clustered) before vectors can be added.
    index.train(vectors)
    index.add(vectors)
    return index

# Function to perform the search and retrieve the most similar reference
def search_most_similar_reference(input_sentence, top_k=1, db_path='reference_db.sqlite'):
    """Return the reference text(s) whose stored input is closest to a query.

    The stored data is loaded from the database and a fresh FAISS index is
    built on every call; the query is then embedded with the module-level
    ``model`` and searched against that index.

    Args:
        input_sentence: Query text to embed and look up.
        top_k: Maximum number of nearest neighbours to return.
        db_path: Path to the SQLite database holding the reference data.

    Returns:
        A list of at most ``top_k`` reference texts, ordered as FAISS
        returned them. The list is shorter than ``top_k`` when the index
        could not supply that many neighbours.
    """
    # Load data from the database
    _, references, input_vectors = load_data_from_db(db_path)

    # Build the FAISS index over the stored embeddings
    index = load_faiss_index(dimension=input_vectors.shape[1], input_vectors=input_vectors)

    # Encode the query as a (1, dim) array on the host
    query_vector = model.encode([input_sentence], convert_to_tensor=True, device=device).cpu().numpy()

    # Perform search on FAISS index
    _, indices = index.search(query_vector, top_k)

    # FAISS pads missing neighbours with label -1; indexing the list with -1
    # would silently return the last reference, so drop those slots.
    return [references[i] for i in indices[0] if i >= 0]

# Example usage of the search function
if __name__ == "__main__":
    # Sample query passage used to demonstrate the lookup.
    input_sentence = "of his spine. There were times when it went on and on until the cruel, wicked, unforgivable thing seemed to him not that the guards continued to beat him but that he could not force himself into losing consciousness..."

    # Look up the closest stored reference (default top_k and database path).
    most_similar_reference = search_most_similar_reference(
        input_sentence=input_sentence,
    )
    print("Most similar reference:", most_similar_reference)


Most similar reference: ['cell, left to recuperate for a few hours, and then taken out and beaten again. There were also longer periods of recovery. He remembered them dimly, because they were spent chiefly in sleep or stupor. He remembered a cell with a plank bed, a sort of shelf sticking out from']
