In [None]:
import os
import sqlite3
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch
from tqdm import tqdm
import json

# Restrict this process to the CUDA device with index 4, then pick the
# compute device based on whether torch can see a GPU.
# NOTE(review): the GPU index is hard-coded and overrides any value already
# present in the environment -- confirm that is intended on shared machines.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the Sentence Transformer model and place it on the chosen device
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1').to(device)

def _read_pairs(json_file):
    """Return the ``(input, reference)`` tuples stored in one JSON file.

    The file is read as UTF-8 and must contain a JSON array of objects, each
    providing an ``'input'`` and a ``'reference'`` key.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        entries = json.load(file)
    return [(entry['input'], entry['reference']) for entry in entries]


# Load JSON data and prepare pairs
def load_data(non_infringement_file, infringement_file):
    """Load both datasets and return all ``(input, reference)`` pairs.

    Args:
        non_infringement_file: Path to the JSON file with non-infringement
            entries.
        infringement_file: Path to the JSON file with infringement entries.

    Returns:
        A single list of ``(input, reference)`` tuples: every
        non-infringement pair in file order, followed by every infringement
        pair in file order.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
        KeyError: If an entry lacks the ``'input'`` or ``'reference'`` key.
    """
    # Both files share one schema, so a single helper handles each of them.
    return _read_pairs(non_infringement_file) + _read_pairs(infringement_file)

# Example usage: read both evaluation splits into one list of pairs
all_pairs = load_data(
    non_infringement_file='/home/guangwei/LLM-COPYRIGHT/copyright_newVersion/test_division/extra_30.non_infringement.json',
    infringement_file='/home/guangwei/LLM-COPYRIGHT/copyright_newVersion/test_division/extra_30.infringement.json',
)

# Split the pairs into parallel lists: `input` texts get embedded, and the
# `reference` at the same position is what a search hit maps back to.
input_texts = [inp for inp, _ in all_pairs]
references = [ref for _, ref in all_pairs]

# Encode `input` texts in batches
def batch_encode_texts(model, texts, batch_size=8, device=None):
    """Encode *texts* with *model* batch by batch and stack the embeddings.

    Args:
        model: Encoder exposing
            ``encode(batch, convert_to_tensor=True, device=...)`` whose
            result supports ``.cpu().numpy()``.
        texts: Sliceable sequence of texts to embed.
        batch_size: Number of texts passed to ``model.encode`` per call.
        device: Forwarded unchanged to ``model.encode``. ``None`` leaves the
            choice to the model (for sentence-transformers, the device the
            model currently lives on), so the function no longer depends on
            a module-level ``device`` variable.

    Returns:
        A 2-D NumPy array with one row per text, in the order of *texts*.

    Raises:
        ValueError: If *batch_size* is smaller than 1 or *texts* is empty.
    """
    # Fail early with a clear message instead of the opaque errors that
    # `range()` / `np.vstack([])` would raise further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(texts) == 0:
        raise ValueError("texts must contain at least one item to encode")

    all_vectors = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Encoding inputs"):
        batch = texts[start:start + batch_size]
        batch_vectors = model.encode(batch, convert_to_tensor=True, device=device)
        # Move each batch to host memory right away so that only one batch
        # of tensors is held on the compute device at a time.
        all_vectors.append(batch_vectors.cpu().numpy())
    return np.vstack(all_vectors)

# Encode the `input` texts (one embedding row per entry of `input_texts`)
input_vectors = batch_encode_texts(model, input_texts)

# Initialize FAISS index for `input` vectors
# NOTE(review): despite the name `gpu_index`, nothing here moves the index to
# a GPU; it is created with the plain `faiss.IndexIVFFlat` constructor.
dimension = input_vectors.shape[1]  # embedding width = number of columns
nlist = 3  # Example number of clusters
quantizer = faiss.IndexFlatL2(dimension)  # flat L2 index handed to the IVF index as its quantizer
gpu_index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
# NOTE(review): `nprobe` is never set, so searches use the FAISS default
# (presumably 1, i.e. only one of the `nlist` clusters is scanned) -- confirm.

# Train and add vectors to the FAISS index
# train() runs before add(); keep this order (IVF indexes are expected to be
# trained before vectors are added -- verify against the FAISS docs).
print("Training the index on input vectors...")
gpu_index.train(input_vectors)
print("Index training completed.")
print("Adding input vectors to the index...")
# Rows are added in `input_texts` order, so a search result position `i`
# lines up with `references[i]`.
gpu_index.add(input_vectors)
print("Input vectors added to the index.")

# Setting up SQLite database to store text and embeddings
def setup_database(db_path='reference_db.sqlite'):
    """Open the SQLite database and make sure ``reference_data`` exists.

    Args:
        db_path: Database location passed to ``sqlite3.connect``. Defaults to
            ``'reference_db.sqlite'`` in the current working directory;
            ``':memory:'`` gives a throw-away in-memory database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created. The connection is closed before re-raising.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # IF NOT EXISTS keeps the call idempotent: an existing table and its
        # rows are left untouched.
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS reference_data (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            input_text TEXT NOT NULL,
            reference_text TEXT NOT NULL,
            embedding BLOB NOT NULL
        )
    ''')
        conn.commit()
    except Exception:
        # Do not leak the connection when schema creation fails.
        conn.close()
        raise
    return conn

# Store `input` and `reference` pairs in the database
def store_pairs(conn, inputs, references, embeddings):
    """Insert one ``reference_data`` row per (input, reference, embedding).

    Args:
        conn: Open ``sqlite3.Connection`` whose database contains the
            ``reference_data`` table.
        inputs: Iterable of input texts.
        references: Iterable of reference texts, parallel to *inputs*.
        embeddings: Iterable of objects with a ``tobytes()`` method (e.g. the
            rows of a NumPy array), parallel to *inputs*. Only the raw bytes
            are stored; dtype and shape are not recorded.

    The three iterables are combined with ``zip``, so extra items in a longer
    iterable are ignored.

    Raises:
        sqlite3.Error: If an insert fails. The pending transaction is rolled
            back, so no rows from this call are kept.
    """
    rows = (
        (inp, ref, emb.tobytes())
        for inp, ref, emb in zip(inputs, references, embeddings)
    )
    # `with conn` commits on success and rolls back on any exception, so a
    # failure part-way through cannot leave a partial batch in the pending
    # transaction for a later commit to persist.
    with conn:
        conn.executemany(
            'INSERT INTO reference_data (input_text, reference_text, embedding) VALUES (?, ?, ?)',
            rows,
        )

# Store pairs in the database
# NOTE: setup_database() only creates the table when it is missing and
# store_pairs() always INSERTs, so each run of this script appends another
# full copy of the pairs to the existing database file.
conn = setup_database()
store_pairs(conn, input_texts, references, input_vectors)

# Function to find the most relevant reference based on input text
def search_most_similar_reference(input_sentence, top_k=1):
    """Return the references paired with the nearest stored inputs.

    The sentence is embedded with the module-level ``model`` on ``device``
    and looked up in the module-level FAISS index ``gpu_index``; each hit
    position is mapped to the entry at the same position in ``references``.

    Args:
        input_sentence: Query text.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        A list with at most *top_k* reference texts, in the order the index
        returned them. It is shorter than *top_k* (possibly empty) when the
        index finds fewer neighbours.
    """
    print(f"Searching for most similar reference for input: '{input_sentence}'...")
    input_vector = model.encode([input_sentence], convert_to_tensor=True, device=device).cpu().numpy()
    _, indices = gpu_index.search(input_vector, top_k)
    # FAISS fills missing results with the label -1. Without this filter,
    # `references[-1]` would silently return the LAST reference as a hit.
    # Retrieve the most similar `input`'s corresponding `reference`
    return [references[i] for i in indices[0] if i >= 0]



# Example input sentence
input_sentence = "of his spine. There were times when it went on and on until the cruel, wicked, unforgivable thing seemed to him not that the guards continued to beat him but that he could not force hirnself into losing consciousness. There were times when his nerve so forsook him that he began shouting for mercy even before the beating began, when the mere sight of a fist drawn back for a blow was enough to make him pour forth a confession of real and imaginary crimes. There were other times when he started out with the resolve of confessing nothing, when every word had to be forced out of him between gasps of pain, and there were times when he feebly tried to compromise, when he said to himself: 'I will confess, but not yet. I must hold out till the pain becomes unbearable. Three more kicks, two more kicks, and then I will tell them what they want.' Sometimes he was beaten till he could hardly stand, then flung like a sack of potatoes on to the stone floor of a"
# Run the example query; the `finally` clause guarantees the SQLite
# connection is released even if encoding or the index search raises.
try:
    most_similar_reference = search_most_similar_reference(input_sentence)
    print("Most similar reference:", most_similar_reference)
finally:
    # Close the database connection
    conn.close()


Encoding inputs: 100%|██████████| 238/238 [00:09<00:00, 25.62it/s]


Training the index on input vectors...
Index training completed.
Adding input vectors to the index...
Input vectors added to the index.
Searching for most similar reference for input: 'of his spine. There were times when it went on and on until the cruel, wicked, unforgivable thing seemed to him not that the guards continued to beat him but that he could not force hirnself into losing consciousness. There were times when his nerve so forsook him that he began shouting for mercy even before the beating began, when the mere sight of a fist drawn back for a blow was enough to make him pour forth a confession of real and imaginary crimes. There were other times when he started out with the resolve of confessing nothing, when every word had to be forced out of him between gasps of pain, and there were times when he feebly tried to compromise, when he said to himself: 'I will confess, but not yet. I must hold out till the pain becomes unbearable. Three more kicks, two more kicks, and then I 