In [28]:
import numpy as np
import nltk
import os
import difflib
from gensim.models import Word2Vec
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
nltk.download('punkt')

lemmatizer = WordNetLemmatizer() #lemmatizer algorithm
lancStemmer = LancasterStemmer()  # stemming algorithm Lancaster

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Preprocesamiento de texto

In [29]:
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = [palabra.lower() for palabra in re.findall(r'\w+', text.lower())]
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto


In [30]:
def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [31]:
def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [32]:
def get_grams(text, ngram, method):
    result = []

    if method == 'lemmatize':
        text = get_lemmatizer(text)
        if ngram == 0:  # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text

        else:
            text = text.split()
            grams = ngrams(text, ngram)
            for ng in grams:
                result.append(' '.join(ng))
    elif method == 'stemmer':
        text = get_stemmer(text)
        if ngram == 0:  # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text

        else:
            text = text.split()
            grams = ngrams(text, ngram)
            for ng in grams:
                result.append(' '.join(ng))
    else:
        raise ValueError('Method not found')

    return result

In [33]:
def token_sentence(text):
    sentences = nltk.sent_tokenize(text)
    filtered_sentences = []
    for sentence in sentences:
        filtered_words = get_lemmatizer(sentence)
        filtered_sentences.append(filtered_words)

    return filtered_sentences

# Deteccion de plagio

In [34]:
def preprocess_docs(folder_path, ngram, method):
    tagged_documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                grams = get_grams(text, ngram, method)
                # Ensure words are split into a list of strings and then converted to tuple
                words = tuple(word.split() for word in grams)
                # Flatten the list of lists into a single list of strings
                words = [word for sublist in words for word in sublist]
                tagged_documents.append(TaggedDocument(words=words, tags=[fileid]))

    return tagged_documents

In [35]:
def preprocess_docs_with_sentence(folder_path, ngram, method):
    tagged_documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                sentences = nltk.sent_tokenize(text)  # Tokenizar el texto en oraciones
                document_sentences = []  # Lista para almacenar las oraciones del documento

                for sentence in sentences:
                    grams = get_grams(sentence, ngram, method)
                    # Separar las palabras y agregarlas a la lista de oraciones del documento
                    words = [word for gram in grams for word in gram.split()]
                    document_sentences.append(words)
                
                tagged_documents.append(TaggedDocument(words=document_sentences, tags=[fileid]))

    return tagged_documents

# Deteccion de Tipo de plagio

In [36]:
def detect_sentence_disorder(original_sentences, plagio_sentences):
    #cantidad de oraciones es diferente, hay desorden
    if len(original_sentences) != len(plagio_sentences):
        return True
    
    #verifica si el orden de las oraciones es diferente
    for original, plagio in zip(original_sentences, plagio_sentences):
        if original != plagio:
            return True
        
    return False

In [37]:
from nltk import pos_tag
def detect_time_change(og_text, plagio_text):
    original_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(og_text)) if pos.startswith('VB')]
    suspicious_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(plagio_text)) if pos.startswith('VB')]

    # Si la lista de verbos es diferente, hay un cambio de tiempo
    if set(original_verbs) != set(suspicious_verbs):
        return True
            
    return False

In [38]:
def detect_inserted_sentences(og_text, plagio_text):
    og_sentences = token_sentence(og_text)
    plagio_sentences = token_sentence(plagio_text)
    
    #si el plagio tiene mas oraciones que el original, hay oraciones insertadas
    if len(plagio_sentences) > len(og_sentences):
        return True
    
    #si el plagio tiene menos oraciones que el original, hay oraciones eliminadas
    if len(plagio_sentences) < len(og_sentences):
        return True
    
    #verifica si el orden de las oraciones es diferente
    if detect_sentence_disorder(og_sentences, plagio_sentences):
        return True
    
    return False

In [39]:
def detect_voice_change(og_text, plagio_text):
    original_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(og_text)) if pos.startswith('VB')]
    suspicious_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(plagio_text)) if pos.startswith('VB')]

    # Si la lista de verbos es diferente, hay un cambio de voz
    if set(original_verbs) != set(suspicious_verbs):
        return True
            
    return False

In [40]:
def detect_paraphrasing(og_text, plagio_text, model):
    similarity_threshold = 0.95  # Umbral de similitud para considerar el parafraseo

    similarity = calculate_similarity_doc2vec(og_text, plagio_text, model)
    if similarity >= similarity_threshold:
        return True
    else:
        return False

# Generacion de modelo, entrenamiento y calculo de similitud

In [41]:
def train_doc2vec(tagged_documents):
    model = Doc2Vec(vector_size=100, window=5, min_count=1, epochs=200,
                    dm=0)  # dm=0 for distributed bag of words (DBOW) mode
    model.build_vocab(tagged_documents)
    model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [42]:
def calculate_similarity_doc2vec(doc1, doc2, model):
    vec1 = model.infer_vector(doc1.words)
    vec2 = model.infer_vector(doc2.words)
    similarity = model.dv.similarity(doc1.tags[0], doc2.tags[0])
    return similarity

# APLICACION DE MODELO

Preprocesamiento de documentos originales y plagiados

In [43]:
# Obtener n-gramas preprocesados
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados)
folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales


# Preprocessing original and plagiarized documents
tagged_originals = preprocess_docs(folder_path_og, 1, 'lemmatize')
tagged_plagiarized = preprocess_docs(folder_path, 1, 'lemmatize')

# Preprocessing original and plagiarized documents with sentences
tagged_originals_with_sentence = preprocess_docs_with_sentence(folder_path_og, 1, 'lemmatize')
tagged_plagiarized_with_sentence = preprocess_docs_with_sentence(folder_path, 1, 'lemmatize')

###  Entrenamiento del modelo Doc2Vec 

In [44]:
# sin sentence
model = train_doc2vec(tagged_originals + tagged_plagiarized)

In [45]:
# List to store similarity results
similarity_results = []
#plagiarism_type = '' 

# Iterating over each plagiarized text
for plagio_doc in tagged_plagiarized:
    max_similarity = 0
    most_similar = ''
    most_similar_doc = ''

    # Comparing with each original document
    for original_doc in tagged_originals:
        similarity = calculate_similarity_doc2vec(plagio_doc, original_doc, model)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = original_doc.tags[0]
            most_similar_doc = original_doc.words

    similarity_results.append([plagio_doc.tags[0], most_similar, max_similarity, most_similar_doc])

        

# Sorting results by similarity in descending order
similarity_results.sort(key=lambda x: x[2], reverse=True)

# Printing results
for result in similarity_results:
    plagio_title, original_title, similarity_score, original_doc = result
    print(f"Similarity between '{plagio_title}' and '{original_title}': {similarity_score * 100:.2f}%")


Similarity between 'FID-04.txt' and 'org-045.txt': 99.62%
Similarity between 'FID-03.txt' and 'org-016.txt': 99.52%
Similarity between 'FID-05.txt' and 'org-085.txt': 96.22%
Similarity between 'FID-09.txt' and 'org-109.txt': 93.65%
Similarity between 'FID-08.txt' and 'org-079.txt': 92.48%
Similarity between 'FID-06.txt' and 'org-043.txt': 91.86%
Similarity between 'FID-07.txt' and 'org-041.txt': 88.51%
Similarity between 'FID-02.txt' and 'org-104.txt': 88.25%
Similarity between 'FID-10.txt' and 'org-007.txt': 86.68%
Similarity between 'FID-01.txt' and 'org-076.txt': 78.19%


### Entrenamiento del modelo Doc2Vec con oraciones

In [46]:
# con sentence
# model_with_sentence = train_doc2vec(tagged_originals_with_sentence + tagged_plagiarized_with_sentence)

In [47]:
def get_plagiarism_type(doc1, doc2, model):
    vec1 = model.infer_vector(doc1.words)
    vec2 = model.infer_vector(doc2.words)

    plagiarism_type = []
    for plagio_doc in tagged_plagiarized:
        max_similarity = 0
        most_similar = ''
        most_similar_doc = ''

        # Comparing with each original document
        for original_doc in tagged_originals:
            similarity = calculate_similarity_doc2vec(plagio_doc, original_doc, model)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar = original_doc.tags[0]
                most_similar_doc = original_doc.words

    plagiarism_type.append([plagio_doc.tags[0], most_similar, max_similarity, most_similar_doc])

### TESTING OTROS

In [48]:
original_folder = "../../docs_originales"
plagiarized_folder = "../../textos_plagiados"

# Clasificar tipo de plagio GALA

In [49]:
# # Classify plagiarism type 
# def classify_plagiarism_type(original_doc, plagiarized_doc):
#     # Preprocesa los documentos
#     original_processed = preprocess(original_doc)
#     plagiarized_processed = preprocess(plagiarized_doc)
# 
#     # Tokeniza los documentos en frases
#     original_sentences = nltk.sent_tokenize(original_processed)
#     plagiarized_sentences = nltk.sent_tokenize(plagiarized_processed)
# 
#     # Comprueba si el plagio involucra insertar o reemplazar frases
#     if len(original_sentences) < len(plagiarized_sentences):
#         return "Insertar o reemplazar frases"
# 
#     # Comprueba si el plagio involucra desordenar las frases
#     if set(original_sentences) != set(plagiarized_sentences):
#         return "Desordenar las frases"
# 
#     # Comprueba si el plagio involucra cambio de tiempo
#     # Implementa tu lógica aquí...
# 
#     # Comprueba si el plagio involucra cambio de voz
#     # Implementa tu lógica aquí...
# 
#     # Comprueba si el plagio involucra parafraseo
#     if original_processed != plagiarized_processed:
#         return "Parafraseo"
# 
#     # Si no se identifica ningún tipo específico de plagio, se devuelve un mensaje genérico
#     return "Tipo de plagio no identificado"
# 
# # Ejemplo de uso:
# original_doc = "El cambio climático es un problema global."
# plagiarized_doc = "El cambio climático se convierte en una preocupación a nivel mundial."
# 
# plagiarism_type = classify_plagiarism_type(original_doc, plagiarized_doc)
# print("Tipo de plagio:", plagiarism_type)


# Clasificar tipo de plagio SERGIO

In [50]:
import nltk
from difflib import SequenceMatcher

nltk.download('punkt')

def detectar_tipo_plagio(original, plagio):
    tokens_original = nltk.word_tokenize(original)
    tokens_plagio = nltk.word_tokenize(plagio)
    similitud = SequenceMatcher(None, tokens_original, tokens_plagio).ratio()
    if similitud == 1.0:
        return "Parafraseo"
    elif similitud >= 0.8:
        return "Insertar o reemplazar frases"
    elif similitud >= 0.6:
        return "Cambio de tiempo"
    elif similitud >= 0.4:
        return "Desordenar las frases"
    else:
        return "Cambio de voz"

# Recorrer cada par de documentos en similarity_results
for result in similarity_results:
    plagio_title, original_title, similarity_score, original_doc = result
    original_doc_text = ' '.join([word for word in original_doc])
    plagiarized_doc_text = [doc.words for doc in tagged_plagiarized if doc.tags[0] == plagio_title][0]
    plagiarized_doc_text = ' '.join([word for word in plagiarized_doc_text])

    # print(f"Titulo: {plagio_title}")
    print(f"Similitud entre '{plagio_title}' y '{original_title}': {similarity_score * 100:.2f}%")
    tipo_plagio = detectar_tipo_plagio(original_doc_text, plagiarized_doc_text)
    print("Tipo de plagio:", tipo_plagio)
    # print("Coincidencias para el plagio:")
    print("----------------------------")
    # print(f"Cadena original: (Longitud: {len(original_doc_text)})")
    # print(f"Cadena plagiada:  (Longitud: {len(plagiarized_doc_text)})")
    # print()


Similitud entre 'FID-04.txt' y 'org-045.txt': 99.62%
Tipo de plagio: Desordenar las frases
----------------------------
Similitud entre 'FID-03.txt' y 'org-016.txt': 99.52%
Tipo de plagio: Cambio de tiempo
----------------------------
Similitud entre 'FID-05.txt' y 'org-085.txt': 96.22%
Tipo de plagio: Insertar o reemplazar frases
----------------------------
Similitud entre 'FID-09.txt' y 'org-109.txt': 93.65%
Tipo de plagio: Cambio de tiempo
----------------------------
Similitud entre 'FID-08.txt' y 'org-079.txt': 92.48%
Tipo de plagio: Cambio de tiempo
----------------------------
Similitud entre 'FID-06.txt' y 'org-043.txt': 91.86%
Tipo de plagio: Insertar o reemplazar frases
----------------------------
Similitud entre 'FID-07.txt' y 'org-041.txt': 88.51%
Tipo de plagio: Cambio de tiempo
----------------------------
Similitud entre 'FID-02.txt' y 'org-104.txt': 88.25%
Tipo de plagio: Cambio de tiempo
----------------------------
Similitud entre 'FID-10.txt' y 'org-007.txt': 86.68

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# INTENTO DE MEJORA

In [51]:
import nltk
from difflib import SequenceMatcher

nltk.download('punkt')

def detectar_tipo_plagio(original, plagio):
    tokens_original = nltk.word_tokenize(original)
    tokens_plagio = nltk.word_tokenize(plagio)
    similitud = SequenceMatcher(None, tokens_original, tokens_plagio).ratio()
    
    # Obtener tokens de n-gramas
    ngrams_original = set(nltk.ngrams(tokens_original, 2))
    ngrams_plagio = set(nltk.ngrams(tokens_plagio, 2))
    
    print("ngrams originales: ", ngrams_original)
    print("ngrams plagio: ", ngrams_plagio)
    # Verificar la cantidad de n-gramas compartidos
    shared_ngrams = len(ngrams_original.intersection(ngrams_plagio))
    total_ngrams = len(ngrams_original.union(ngrams_plagio))
    ratio_ngrams = shared_ngrams / total_ngrams
    
    # Mejorar la detección de tipo de plagio utilizando más criterios
    if similitud == 1.0:
        return "Parafraseo"
    elif ratio_ngrams > 0.8:
        return "Parafraseo o reordenamiento de frases"
    elif ratio_ngrams > 0.5:
        return "Reordenamiento de frases"
    elif similitud > 0.8:
        return "Parafraseo o inserción de frases"
    elif similitud > 0.6:
        return "Inserción de frases"
    elif similitud > 0.4:
        return "Cambio de tiempo o desorden de frases"
    else:
        return "Cambio de voz"

# Recorrer cada par de documentos en similarity_results
for result in similarity_results:
    plagio_title, original_title, similarity_score, original_doc = result
    original_doc_text = ' '.join([word for word in original_doc])
    plagiarized_doc_text = [doc.words for doc in tagged_plagiarized if doc.tags[0] == plagio_title][0]
    plagiarized_doc_text = ' '.join([word for word in plagiarized_doc_text])

    print(f"Titulo: {plagio_title}")
    print(f"Similitud entre '{plagio_title}' y '{original_title}': {similarity_score * 100:.2f}%")
    tipo_plagio = detectar_tipo_plagio(original_doc_text, plagiarized_doc_text)
    print("Tipo de plagio:", tipo_plagio)
    print("Coincidencias para el plagio:")
    print("----------------------------")
    print(f"Cadena original: (Longitud: {len(original_doc_text)})")
    print(f"Cadena plagiada:  (Longitud: {len(plagiarized_doc_text)})")
    print()


Titulo: FID-04.txt
Similitud entre 'FID-04.txt' y 'org-045.txt': 99.62%
ngrams originales:  {('order', 'understand'), ('empathic', 'performance'), ('owl', 'may'), ('performance', 'intelligent'), ('intervention', 'increased'), ('language', 'owl'), ('engage', 'user'), ('user', 'interpersonal'), ('formal', 'definition'), ('wysa', 'finding'), ('variety', 'definition'), ('definition', 'capture'), ('behavior', 'change'), ('capture', 'necessary'), ('implemented', 'web'), ('interactive', 'software'), ('agent', 'chatbots'), ('interpersonal', 'conversation'), ('need', 'understanding'), ('user', 'study'), ('necessary', 'condition'), ('well', 'chatbots'), ('changing', 'perception'), ('empathy', 'two'), ('ontologyâ', 'empathy'), ('well', 'formal'), ('notion', 'empathy'), ('uncover', 'explain'), ('progressively', 'used'), ('software', 'agent'), ('precise', 'notion'), ('may', 'serve'), ('enabling', 'system'), ('definition', 'controlled'), ('agent', 'engage'), ('evaluating', 'empathic'), ('interactive

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
