In [None]:
import sqlite3
from sentence_transformers import SentenceTransformer, util
from IPython.display import display
import ipywidgets as widgets

# --- SETTINGS ---
# Cosine-similarity score at or above which an upload is reported as plagiarism.
PLAGIARISM_THRESHOLD = 0.7
# SQLite file that stores the accepted documents.
DB_NAME = "documents_semantic.db"

# Schema for the single table used by this notebook: one row per accepted
# document, holding its original filename and its full text.
_CREATE_DOCUMENTS_TABLE_SQL = """
    CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        filename TEXT,
        content TEXT
    )
"""

# Sentence-embedding model shared by every similarity check below.
model = SentenceTransformer('all-MiniLM-L6-v2')  # lightweight, semantic

# One connection and one cursor are opened here and reused by the helpers.
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

# Make sure the table exists before any read or write is attempted.
cursor.execute(_CREATE_DOCUMENTS_TABLE_SQL)
conn.commit()



In [None]:
# --- FUNCTIONS ---

def get_all_documents():
    """Return every stored document as a list of ``(filename, content)`` rows.

    Runs the query on the module-level ``cursor``.
    """
    select_sql = "SELECT filename, content FROM documents"
    rows = cursor.execute(select_sql).fetchall()
    return rows

def check_plagiarism_semantic(new_text, existing_docs, threshold=PLAGIARISM_THRESHOLD):
    """Compare ``new_text`` against stored documents by embedding similarity.

    Args:
        new_text: Text of the candidate document.
        existing_docs: Iterable of ``(filename, content)`` pairs to compare
            against, checked in the order given.
        threshold: Minimum cosine similarity that counts as a match.

    Returns:
        ``(True, filename, similarity)`` for the first stored document whose
        similarity reaches ``threshold``; ``(False, None, 0.0)`` if none does.
    """
    candidate_vec = model.encode(new_text, convert_to_tensor=True)

    def _scored_docs():
        # Lazily embed and score one stored document at a time so that the
        # search stops encoding as soon as a match is found.
        for doc_name, doc_text in existing_docs:
            doc_vec = model.encode(doc_text, convert_to_tensor=True)
            yield doc_name, util.pytorch_cos_sim(candidate_vec, doc_vec).item()

    first_hit = next(
        ((doc_name, score) for doc_name, score in _scored_docs() if score >= threshold),
        None,
    )
    if first_hit is None:
        return False, None, 0.0

    matched_name, matched_score = first_hit
    return True, matched_name, matched_score

def save_to_db(filename, content):
    """Insert one document into the ``documents`` table and commit.

    Args:
        filename: Name to store for the document.
        content: Full text of the document.
    """
    insert_sql = "INSERT INTO documents (filename, content) VALUES (?, ?)"
    row = (filename, content)
    cursor.execute(insert_sql, row)
    conn.commit()

# --- UPLOAD INTERFACE ---

# Single-file upload control restricted to .txt files, rendered in the cell output.
upload = widgets.FileUpload(
    accept='.txt',
    multiple=False,
)
display(upload)

def handle_uploaded_file(change):
    """Check the uploaded file for plagiarism and store it if it is clean.

    Intended to be used as the observer callback for the upload widget's
    ``value`` trait. The outcome is reported with ``print``.

    Args:
        change: Change notification supplied by the observer mechanism. It is
            not read; the current upload is taken from ``upload.value``.
    """
    files = upload.value
    if not files:
        return

    # ipywidgets 7 exposes `value` as a dict keyed by filename, with the name
    # stored under 'metadata'. ipywidgets 8 exposes a tuple of dicts with a
    # top-level 'name'. Support both layouts.
    if isinstance(files, dict):
        uploaded = next(iter(files.values()))
        filename = uploaded['metadata']['name']
    else:
        uploaded = files[0]
        filename = uploaded['name']

    # `content` is bytes in ipywidgets 7 and a memoryview in ipywidgets 8;
    # bytes() normalizes both before decoding.
    try:
        content = bytes(uploaded['content']).decode("utf-8").strip()
    except UnicodeDecodeError:
        # Report undecodable files instead of raising inside the callback.
        print(f"'{filename}' is not valid UTF-8 text.")
        return

    if not content:
        print("File is empty.")
        return

    existing_docs = get_all_documents()
    is_plag, matched_file, score = check_plagiarism_semantic(content, existing_docs)

    if is_plag:
        print(f"Plagiarism detected! Similar to '{matched_file}' with {score * 100:.2f}% similarity.")
    else:
        save_to_db(filename, content)
        print(f"'{filename}' added to database. No plagiarism detected.")

# Run the plagiarism check whenever the widget's `value` trait changes.
upload.observe(
    handle_uploaded_file,
    names='value',
)
