In [None]:
import os
import sqlite3
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch
from tqdm import tqdm  
import json

# Expose only GPU 4 to this process, then pick CUDA when it is usable
# and fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the Sentence Transformer model and place it on the chosen device
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1').to(device)

def load_data(non_infringement_file, infringement_file):
    """Collect the ``reference`` field of every entry in two JSON files.

    Each file is parsed with ``json.load`` and iterated; every entry is
    indexed with the key ``'reference'``. No other field is read.

    Args:
        non_infringement_file: Path to the non-infringement JSON file.
        infringement_file: Path to the infringement JSON file.

    Returns:
        A 2-tuple ``(non_infringement_references, infringement_references)``
        of lists, each in the order the entries appear in its file.

    Raises:
        KeyError: If an entry has no ``'reference'`` key.
    """
    per_file_references = []
    # The non-infringement file is read first, matching the return order.
    for path in (non_infringement_file, infringement_file):
        with open(path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
        per_file_references.append([entry['reference'] for entry in entries])

    return tuple(per_file_references)


# Example usage: read both reference sets from disk.
non_infringement_references, infringement_references = load_data(
    '/home/guangwei/LLM-COPYRIGHT/copyright_newVersion/test_division/extra_30.non_infringement.json',
    '/home/guangwei/LLM-COPYRIGHT/copyright_newVersion/test_division/extra_30.infringement.json',
)

# Combined list: non-infringement entries first, infringement entries after.
references = non_infringement_references + infringement_references
# Sample references to be directly stored without splitting into chunks
# references = [
#     "of worms and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a hobbit-hole,",
#     "hair like the stuff on their heads (which is curly); have long clever brown fingers, good-natured faces,",
#     "of this hobbit—of Bilbo Baggins, that is—was the famous Belladonna Took, one of the three remarkable daughters of the Old Took, head of the hobbits who lived across The Water,",
#     "of pressure that you cannot withstand, even if you wished to. You will do what is required of you.' 'But what is it, what is it? How can I do it if I don't know what it is?' O'Brien picked up the cage and brought it across to",
#     "? What are you doing here? What time did you leave work? Is this your usual way home?'--and so on and so forth. Not that there was any rule against walking home by an unusual route: but it was enough to draw attention to you if the Thought Police heard",
#     "turned against Goldstein at all, but, on the contrary, against Big Brother, the Party, and the Thought Police; and at such moments his heart went out to the lonely, derided heretic on the screen, sole guardian of truth and sanity in a world of lies. And yet the very next",
#     ". Tyrion danced back in while the brigand's leg was still pinned beneath his fallen mount, and buried the axe in the man's neck, just above the shoulder blades. As he struggled to yank the blade loose, he heard Marillion moaning under the bodies. \"Someone help me,\" the singer"
# ]

# Encode references in batches
def batch_encode_references(model, references, batch_size=8):
    """Encode ``references`` in slices of ``batch_size`` and stack the results.

    Args:
        model: Object whose ``encode(batch, convert_to_tensor=True, device=...)``
            returns a tensor supporting ``.cpu().numpy()``.
        references: Sequence of texts; it is sliced, so it must support
            ``len`` and slicing.
        batch_size: Number of texts passed to ``model.encode`` per call.

    Returns:
        The per-batch arrays stacked vertically with ``np.vstack``.

    Note:
        The target device is read from the module-level ``device`` variable,
        not from a parameter.
    """
    batch_starts = range(0, len(references), batch_size)
    encoded_batches = [
        model.encode(
            references[start:start + batch_size],
            convert_to_tensor=True,
            device=device,
        ).cpu().numpy()
        for start in tqdm(batch_starts, desc="Encoding references")
    ]
    return np.vstack(encoded_batches)

# Encode the complete references
# (the combined `references` list is encoded in order, so row i of the
# result corresponds to references[i])
reference_vectors = batch_encode_references(model, references)

# Initialize FAISS index
# NOTE(review): despite its name, `gpu_index` is created with the plain
# faiss.IndexIVFFlat constructor and no GPU transfer is visible here —
# confirm whether a GPU index was intended.
dimension = reference_vectors.shape[1]  # embedding width = number of columns
nlist = 3  # number of inverted lists (cells) passed to the IVF index
quantizer = faiss.IndexFlatL2(dimension)  # flat L2 index used as the coarse quantizer
gpu_index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

# Train and add vectors to the FAISS index
# The index is trained on the same vectors that are added afterwards;
# training must come before add.
# NOTE(review): nprobe is never set, so searches presumably use the FAISS
# default and inspect only part of the nlist cells — confirm recall is acceptable.
print("Training the index...")
gpu_index.train(reference_vectors) 
print("Index training completed.")
print("Adding vectors to the index...")
gpu_index.add(reference_vectors)
print("Vectors added to the index.")

# Setting up SQLite database to store text and embeddings
def setup_database(db_path='reference_db.sqlite'):
    """Open the SQLite database and make sure the ``reference_data`` table exists.

    The table has three columns: ``id`` (autoincrementing integer primary
    key), ``text`` (TEXT, not null) and ``embedding`` (BLOB, not null).
    It is created only if missing, so rows from earlier runs on the same
    file are kept.

    Args:
        db_path: Database location passed to ``sqlite3.connect``. Defaults to
            ``'reference_db.sqlite'``, the path that used to be hard-coded;
            ``':memory:'`` gives a throwaway in-memory database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The connection is
            closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS reference_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                text TEXT NOT NULL,
                embedding BLOB NOT NULL
            )
        ''')
        conn.commit()
    except sqlite3.Error:
        # Do not leak the handle when schema creation fails.
        conn.close()
        raise
    return conn

# Function to store references and their embeddings directly in the database
def store_references(conn, references, embeddings):
    """Insert each reference text with its embedding into ``reference_data``.

    Args:
        conn: Open ``sqlite3`` connection whose database has the
            ``reference_data`` table.
        references: Iterable of texts.
        embeddings: Iterable of objects exposing ``tobytes()``, paired
            position by position with ``references``.

    Note:
        The two iterables are paired with ``zip``, so pairing stops at the
        shorter one and surplus items are ignored without any error. Only
        the raw bytes of each embedding are stored, not its dtype or shape.
    """
    # Each embedding is serialized to raw bytes for the BLOB column.
    rows = ((text, vector.tobytes()) for text, vector in zip(references, embeddings))
    conn.cursor().executemany(
        'INSERT INTO reference_data (text, embedding) VALUES (?, ?)',
        rows,
    )
    conn.commit()

# Store references in the database
conn = setup_database()
# reference_vectors was computed from the combined `references` list, so the
# same list must be passed here. Passing only non_infringement_references
# made zip() stop early and silently left every infringement reference out
# of the database.
store_references(conn, references, reference_vectors)


print("Length of references:", len(references))

# Function to find the most relevant reference based on input text
def search_next_sentence(input_text, top_k=1):
    """Return the indexed references nearest to ``input_text``.

    The input is encoded with the module-level ``model`` and searched in the
    module-level ``gpu_index``. The returned ids are used as positions in the
    module-level ``references`` list.

    Args:
        input_text: Query text to embed.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of reference texts in the order the index returned them. It
        may hold fewer than ``top_k`` items when the index finds fewer
        neighbours.
    """
    print(f"Searching for next sentence for input: '{input_text}'...")
    input_vector = model.encode([input_text], convert_to_tensor=True, device=device).cpu().numpy()
    _, indices = gpu_index.search(input_vector, top_k)
    # FAISS fills missing result slots with -1. Used as a Python index, that
    # would silently return the last reference, so negative ids are dropped.
    return [references[i] for i in indices[0] if i >= 0]

# Example input sentence
# (used as the query text for the nearest-reference lookup below)
input_sentence = "out 'Swine! Swine! Swine!' and suddenly she picked up a heavy Newspeak dictionary and flung it at the screen. It struck Goldstein's nose and bounced off; the voice continued inexorably. In a lucid moment Winston found that he was shouting with the others and kicking his heel violently against the rung of his chair. The horrible thing about the Two Minutes Hate was not that one was obliged to act a part, but, on the contrary, that it was impossible to avoid joining in. Within thirty seconds any pretence was always unnecessary. A hideous ecstasy of fear and vindictiveness, a desire to kill, to torture, to smash faces in with a sledge-hammer, seemed to flow through the whole group of people like an electric current, turning one even against one's will into a grimacing, screaming lunatic. And yet the rage that one felt was an abstract, undirected emotion which could be switched from one object to another like the flame of a blowlamp. Thus, at one moment Winston's hatred was not"
# top_k is left at its default of 1; search_next_sentence returns a list.
next_sentence = search_next_sentence(input_sentence)
print("Recommended next sentence:", next_sentence)

# Close the database connection
# (opened earlier by setup_database(); nothing uses it after this point)
conn.close()


Encoding references: 100%|██████████| 238/238 [00:11<00:00, 20.28it/s]


Training the index...
Index training completed.
Adding vectors to the index...
Vectors added to the index.
Length of references: 1898
Searching for next sentence for input: 'out 'Swine! Swine! Swine!' and suddenly she picked up a heavy Newspeak dictionary and flung it at the screen. It struck Goldstein's nose and bounced off; the voice continued inexorably. In a lucid moment Winston found that he was shouting with the others and kicking his heel violently against the rung of his chair. The horrible thing about the Two Minutes Hate was not that one was obliged to act a part, but, on the contrary, that it was impossible to avoid joining in. Within thirty seconds any pretence was always unnecessary. A hideous ecstasy of fear and vindictiveness, a desire to kill, to torture, to smash faces in with a sledge-hammer, seemed to flow through the whole group of people like an electric current, turning one even against one's will into a grimacing, screaming lunatic. And yet the rage that one felt