In [5]:
import numpy as np
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Reference corpus: every entry is one French sentence, already split into
# lowercase word tokens on whitespace.
documents = [
    sentence.split()
    for sentence in (
        "le chat noir dort sur le canapé",
        "un chien aboie dans le jardin",
        "le soleil brille dans le ciel",
        "un chat noir est allongé sur le divan",
    )
]

In [8]:
# Train the FastText model on the tokenised reference corpus.
# Training is pinned to a single worker thread with an explicit seed so that
# re-running the notebook yields the same vectors (and therefore the same
# similarity scores); multi-threaded training introduces ordering jitter.
# NOTE(review): gensim documents that reproducibility across interpreter
# launches may additionally require setting PYTHONHASHSEED — confirm if needed.
model_fasttext = FastText(
    sentences=documents,
    vector_size=100,
    window=3,
    min_count=1,  # keep every token: the corpus is tiny
    seed=1,
    workers=1,
)


In [9]:
# Encode a document as a single vector
def get_embedding(document, model=None):
    """Return the mean word vector of a tokenised document.

    Parameters
    ----------
    document : iterable of str
        Word tokens of the document.
    model : object, optional
        Any object exposing a ``wv`` attribute that supports ``word in wv``,
        ``wv[word]`` and ``wv.vector_size``. Defaults to the notebook-level
        ``model_fasttext``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens accepted by the
        ``in`` check, or an all-zero vector of length ``wv.vector_size``
        when no token is accepted (e.g. an empty document).
    """
    if model is None:
        model = model_fasttext
    keyed_vectors = model.wv

    # NOTE(review): with FastText subword n-grams the membership test may
    # accept every token — confirm against the gensim documentation.
    vectors = [keyed_vectors[word] for word in document if word in keyed_vectors]

    if not vectors:
        # Size the fallback from the model instead of a hard-coded 100 so it
        # stays consistent if the training dimension is changed.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

# Pre-compute one embedding per reference document and stack them into a
# single array (one row per document).
doc_embeddings = np.array(list(map(get_embedding, documents)))

# Plagiarism detection
def detect_plagiarism(submitted_text, threshold=0.8):
    """Print whether a submission is too similar to any reference document.

    The submission is embedded with ``get_embedding`` and compared, by cosine
    similarity, with every row of the notebook-level ``doc_embeddings``. The
    highest score is printed, followed by a verdict.

    Parameters
    ----------
    submitted_text : iterable of str
        Word tokens of the submitted document.
    threshold : float, optional
        A maximum similarity strictly greater than this value is reported as
        plagiarism. Defaults to 0.8.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    submitted_embedding = get_embedding(submitted_text)
    # cosine_similarity takes 2-D inputs: wrap the single vector in a list and
    # take row 0 of the result to get one score per reference document.
    similarities = cosine_similarity([submitted_embedding], doc_embeddings)[0]
    max_similarity = similarities.max()

    print(f"\nDétection de plagiat : Score de similarité max = {max_similarity:.2f}")
    if max_similarity > threshold:
        print("Plagiat détecté !")
    else:
        print("Aucun plagiat détecté.")

In [10]:
# Example usage: score a tokenised submission against the reference corpus.
submitted_text = "un chat noir se repose sur le canapé".split()
detect_plagiarism(submitted_text)


Détection de plagiat : Score de similarité max = 0.65
Aucun plagiat détecté.
