In [76]:
import numpy as np
import nltk
import os
import difflib
from gensim.models import Word2Vec
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
nltk.download('punkt')

lemmatizer = WordNetLemmatizer() #lemmatizer algorithm
lancStemmer = LancasterStemmer()  # stemming algorithm Lancaster

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/galafloresgarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/galafloresgarcia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [77]:
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = [palabra.lower() for palabra in re.findall(r'\w+', text.lower())]
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto


In [78]:
def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [79]:
def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [80]:
def get_grams(text, ngram, method):
    result = []

    if method == 'lemmatize':
        text = get_lemmatizer(text)
        if ngram == 0:  # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text

        else:
            text = text.split()
            grams = ngrams(text, ngram)
            for ng in grams:
                result.append(' '.join(ng))
    elif method == 'stemmer':
        text = get_stemmer(text)
        if ngram == 0:  # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text

        else:
            text = text.split()
            grams = ngrams(text, ngram)
            for ng in grams:
                result.append(' '.join(ng))
    else:
        raise ValueError('Method not found')

    return result

In [43]:
text1 = "Now hes thinkin bout me every night, oh Is it that sweet? I guess so. Say you cant sleep, baby, I know Thats that me, espresso. Move it up, down, left, right, oh Switch it up like Nintendo. Say you can't sleep, baby, I know That's that me, espresso"

In [81]:
def token_sentence(text):
    sentences = nltk.sent_tokenize(text)
    filtered_sentences = []
    for sentence in sentences:
        filtered_words = get_lemmatizer(sentence)
        filtered_sentences.append(filtered_words)

    return filtered_sentences

In [50]:
token_sentence(text1)

['he thinkin bout every night oh sweet',
 'guess',
 'say cant sleep baby know thats espresso',
 'move left right oh switch like nintendo',
 'say sleep baby know espresso']

In [84]:
def preprocess_docs(folder_path, ngram, method):
    tagged_documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                grams = get_grams(text, ngram, method)
                # Ensure words are split into a list of strings and then converted to tuple
                words = tuple(word.split() for word in grams)
                # Flatten the list of lists into a single list of strings
                words = [word for sublist in words for word in sublist]
                tagged_documents.append(TaggedDocument(words=words, tags=[fileid]))

    return tagged_documents

In [99]:
#Obtener n-gramas preprocesados
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados)
folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales

tagged_originals = preprocess_docs(folder_path_og, 1, 'lemmatize')
tagged_plagiarized = preprocess_docs(folder_path, 1, 'lemmatize')  


In [100]:
def train_doc2vec(tagged_documents):
    model = Doc2Vec(vector_size=100, window=5, min_count=1, epochs=200,
                    dm=0)  # dm=0 for distributed bag of words (DBOW) mode
    model.build_vocab(tagged_documents)
    model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [87]:
def calculate_similarity_doc2vec(doc1, doc2, model):
    vec1 = model.infer_vector(doc1.words)
    vec2 = model.infer_vector(doc2.words)
    similarity = model.dv.similarity(doc1.tags[0], doc2.tags[0])
    return similarity

In [None]:
def detect_sentence_disorder(original_sentences, plagio_sentences):
    #cantidad de oraciones es diferente, hay desorden
    if len(original_sentences) != len(plagio_sentences):
        return True
    
    #verifica si el orden de las oraciones es diferente
    for original, plagio in zip(original_sentences, plagio_sentences):
        if original != plagio:
            return True
        
    return False

In [None]:
def detect_inserted_sentences(og_text, plagio_text):
    og_sentences = token_sentence(og_text)
    plagio_sentences = token_sentence(plagio_text)
    
    #si el plagio tiene mas oraciones que el original, hay oraciones insertadas
    if len(plagio_sentences) > len(og_sentences):
        return True
    
    #si el plagio tiene menos oraciones que el original, hay oraciones eliminadas
    if len(plagio_sentences) < len(og_sentences):
        return True
    
    #verifica si el orden de las oraciones es diferente
    if detect_sentence_disorder(og_sentences, plagio_sentences):
        return True
    
    return False

In [None]:
from nltk import pos_tag
def detect_time_change(og_text, plagio_text):
    original_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(og_text)) if pos.startswith('VB')]
    suspicious_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(plagio_text)) if pos.startswith('VB')]

    # Si la lista de verbos es diferente, hay un cambio de tiempo
    if set(original_verbs) != set(suspicious_verbs):
        return True
            
    return False

In [None]:
def detect_voice_change(og_text, plagio_text):
    original_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(og_text)) if pos.startswith('VB')]
    suspicious_verbs = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(plagio_text)) if pos.startswith('VB')]

    # Si la lista de verbos es diferente, hay un cambio de voz
    if set(original_verbs) != set(suspicious_verbs):
        return True
            
    return False

In [None]:
def detect_paraphrasing(og_text, plagio_text, model):
    similarity_threshold = 0.95  # Umbral de similitud para considerar el parafraseo

    similarity = calculate_similarity_doc2vec(og_text, plagio_text, model)
    if similarity >= similarity_threshold:
        return True
    else:
        return False

In [33]:
#PLAGIARISM TYPE
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados)
folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales

tagged_originals = preprocess_docs(folder_path_og, 1, 'lemmatize')
tagged_plagiarized = preprocess_docs(folder_path, 1, 'lemmatize')  

model = train_doc2vec(tagged_originals + tagged_plagiarized)


def get_plagiarism_type(doc1, doc2, model):
    vec1 = model.infer_vector(doc1.words)
    vec2 = model.infer_vector(doc2.words)

    plagiarism_type = []
    for plagio_doc in tagged_plagiarized:
        max_similarity = 0
        most_similar = ''
        most_similar_doc = ''

        # Comparing with each original document
        for original_doc in tagged_originals:
            similarity = calculate_similarity_doc2vec(plagio_doc, original_doc, model)
            if similarity > max_similarity:
                max_similarity = similarity
                most_similar = original_doc.tags[0]
                most_similar_doc = original_doc.words

    plagiarism_type.append([plagio_doc.tags[0], most_similar, max_similarity, most_similar_doc])

In [103]:
# Obtener n-gramas preprocesados
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados)
folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales


# Preprocessing original and plagiarized documents
tagged_originals = preprocess_docs(folder_path_og, 1, 'lemmatize')
tagged_plagiarized = preprocess_docs(folder_path, 1, 'lemmatize')

In [104]:
# Training the Doc2Vec model
model = train_doc2vec(tagged_originals + tagged_plagiarized)

In [105]:
#USA MODELO PARA TODOS LOS DOCUMENTOS FINAL


# List to store similarity results
similarity_results = []
#plagiarism_type = '' 

# Iterating over each plagiarized text
for plagio_doc in tagged_plagiarized:
    max_similarity = 0
    most_similar = ''
    most_similar_doc = ''

    # Comparing with each original document
    for original_doc in tagged_originals:
        similarity = calculate_similarity_doc2vec(plagio_doc, original_doc, model)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = original_doc.tags[0]
            most_similar_doc = original_doc.words

    similarity_results.append([plagio_doc.tags[0], most_similar, max_similarity, most_similar_doc])

        

# Sorting results by similarity in descending order
similarity_results.sort(key=lambda x: x[2], reverse=True)

# Printing results
for result in similarity_results:
    plagio_title, original_title, similarity_score, original_doc = result
    print(f"Similarity between '{plagio_title}' and '{original_title}': {similarity_score * 100:.2f}%")


Similarity between 'FID-04.txt' and 'org-045.txt': 99.72%
Similarity between 'FID-03.txt' and 'org-016.txt': 99.60%
Similarity between 'FID-05.txt' and 'org-085.txt': 96.26%
Similarity between 'FID-09.txt' and 'org-109.txt': 93.78%
Similarity between 'FID-08.txt' and 'org-079.txt': 92.20%
Similarity between 'FID-06.txt' and 'org-043.txt': 91.61%
Similarity between 'FID-07.txt' and 'org-041.txt': 89.16%
Similarity between 'FID-02.txt' and 'org-104.txt': 87.69%
Similarity between 'FID-10.txt' and 'org-007.txt': 86.32%
Similarity between 'FID-01.txt' and 'org-076.txt': 77.84%


In [None]:
def preprocess_docs2(folder_path, ngram, method):
    tagged_documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                sentences = nltk.sent_tokenize(text)  # Tokenizar el texto en oraciones
                document_sentences = []  # Lista para almacenar las oraciones del documento

                for sentence in sentences:
                    grams = get_grams(sentence, ngram, method)
                    # Separar las palabras y agregarlas a la lista de oraciones del documento
                    words = [word for gram in grams for word in gram.split()]
                    document_sentences.append(words)
                
                tagged_documents.append(TaggedDocument(words=document_sentences, tags=[fileid]))

    return tagged_documents

In [90]:
#------------POR ORACION SIN MODELO -------------


def buscar_y_tokenizar(directorio, nombre_archivo):
    for filename in os.listdir(directorio):
        if filename == nombre_archivo:
            filepath = os.path.join(directorio, filename)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                sentences = nltk.sent_tokenize(text)
                return sentences
    return None


def encontrar_coincidencias(sentences_originales, sentences_plagiados):
    coincidencias = []
    # Contadores para la matriz de auc
    TP = 0
    FP = 0
    TN = 0
    FN = 0



    for sentence_orig in sentences_originales:
        for sentence_plag in sentences_plagiados:
            matcher = difflib.SequenceMatcher(None, sentence_orig, sentence_plag)
            match = matcher.find_longest_match(0, len(sentence_orig), 0, len(sentence_plag))
            if match.size > 0:
                # Aplicar stemming y eliminar stopwords a las coincidencias antes de contar las palabras
                cadena_orig_clean = get_lemmatizer(sentence_orig[match.a:match.a + match.size])
                cadena_plag_clean = get_lemmatizer(sentence_plag[match.b:match.b + match.size])
                # Contar las palabras en las coincidencias después de aplicar el lemmatizer y eliminar las stopwords
                palabras_orig = cadena_orig_clean.split()
                palabras_plag = cadena_plag_clean.split()

                if len(palabras_orig) > 3 and len(palabras_plag) > 3:  # Solo considerar coincidencias con más de una palabra
                    coincidencias.append({
                        "cadena_orig": sentence_orig[match.a:match.a + match.size],
                        "cadena_plag": sentence_plag[match.b:match.b + match.size],
                        "longitud": match.size
                    })
                    # Actualizar contadores de la matriz de auc
                    if sentence_orig == sentence_plag:
                      TP += 1
                    else:
                      FP += 1
                """ #detecta_desorden_oraciones
                if detect_sentence_disorder(sentence_orig, sentence_plag) == True:
                  plagiarism_type = 'Desorden de oraciones'
                elif detect_time_change(sentence_orig, sentence_plag) == True:
                  plagiarism_type = 'Cambio de tiempo'
                elif sentence_orig == sentence_plag:
                  plagiarism_type = 'Insertar o reemplazar' """
                   
            else:
              if sentence_orig not in sentences_originales:
                TN += 1
              else:
                FN += 1
                
    matriz_auc = {
        'TP': TP,
        'FP': FP,
        'TN': TN,
        'FN': FN
    }
    # print(matriz_auc)

    return coincidencias, matriz_auc

total_coincidencias = []
total_TP = 0
total_FP = 0
total_TN = 0
total_FN = 0

for titulo in similarity_results:
    resultados = []
    sentences_originales = buscar_y_tokenizar(folder_path_og, titulo[1])
    sentences_plagiados = buscar_y_tokenizar(folder_path, titulo[0])
    print(f"Titulo: {titulo[0]}")

    if sentences_originales and sentences_plagiados:
        # Similitud = calcular_similitud_ngramas(sentences_originales, sentences_plagiados, 3)
        similitud = titulo[2]
        print(f"Similitud entre '{titulo[0]}' y '{titulo[1]}': {similitud * 100:.2f}%")
        coincidencias, matriz_auc = encontrar_coincidencias(sentences_originales, sentences_plagiados)
        # Actualizar contadores totales de la matriz de auc
        total_TP += matriz_auc['TP']
        total_FP += matriz_auc['FP']
        total_TN += matriz_auc['TN']
        total_FN += matriz_auc['FN']
        print(matriz_auc)
        print(f"Coincidencias para '{titulo[0]}' y '{titulo[1]}':")
        total_coincidencias.extend(coincidencias)

        print("----------------------------\n")
        for coincidencia in coincidencias:
            print(f"Cadena original: {coincidencia['cadena_orig']} (Longitud: {coincidencia['longitud']})")
            print(f"Cadena plagiada: {coincidencia['cadena_plag']}")
            print()
    else:
        print(f"No se encontraron oraciones en '{titulo[0]}' o '{titulo[1]}'")
        print()
    print("----------------------------\n")
    

# Calculando TPR, FPR y AUC
TPR = total_TP / (total_TP + total_FN) if (total_TP + total_FN) != 0 else 0
FPR = total_FP / (total_FP + total_TN) if (total_FP + total_TN) != 0 else 0
AUC = (1 + TPR - FPR) / 2

# Imprimiendo los valores calculados
print(f"TPR (Tasa de Verdaderos Positivos): {TPR:.2f}")
print(f"FPR (Tasa de Falsos Positivos): {FPR:.2f}")
print(f"AUC (Área bajo la curva ROC): {AUC:.2f}")

Titulo: FID-04.txt
Similitud entre 'FID-04.txt' y 'org-045.txt': 99.63%
{'TP': 9, 'FP': 0, 'TN': 0, 'FN': 0}
Coincidencias para 'FID-04.txt' y 'org-045.txt':
----------------------------

Cadena original: Interactive software agents, such as chatbots, are progressively being used in the area of health and well-being. (Longitud: 113)
Cadena plagiada: Interactive software agents, such as chatbots, are progressively being used in the area of health and well-being.

Cadena original: In such applications, where agents engage with users in interpersonal conversations for, e.g., coaching, comfort or behavior-change interventions, there is an increased need for understanding agentsâ empathic capabilities. (Longitud: 224)
Cadena plagiada: In such applications, where agents engage with users in interpersonal conversations for, e.g., coaching, comfort or behavior-change interventions, there is an increased need for understanding agentsâ empathic capabilities.

Cadena original: In the current 

{'TP': 14, 'FP': 7, 'TN': 0, 'FN': 0}
Coincidencias para 'FID-08.txt' y 'org-079.txt':
----------------------------

Cadena original: ï»¿The main idea of this paper is the substantiation of the methodological approach to the assessment of personnel risks of enterprises based on the application of the fuzzy logic apparatus in order to identify the problems of personnel risk management and provide appropriate recommendations for their solution. (Longitud: 312)
Cadena plagiada: ï»¿The main idea of this paper is the substantiation of the methodological approach to the assessment of personnel risks of enterprises based on the application of the fuzzy logic apparatus in order to identify the problems of personnel risk management and provide appropriate recommendations for their solution.

Cadena original: The methodological basis of the study is the classic provisions and fundamental works of foreign and domestic scientists, statistical data, the results of our research into the problems of 

In [None]:
def detect_time_change(original_text, suspicious_text):
    # Tokenización y etiquetado de partes del habla
    original_tokens = word_tokenize(original_text)
    suspicious_tokens = word_tokenize(suspicious_text)
    
    original_pos_tags = pos_tag(original_tokens)
    suspicious_pos_tags = pos_tag(suspicious_tokens)
    
    # Extracción de verbos y sus tiempos verbales
    original_verbs = [word for word, pos in original_pos_tags if pos.startswith('VB')]
    suspicious_verbs = [word for word, pos in suspicious_pos_tags if pos.startswith('VB')]
    
    # Comparación de tiempos verbales
    time_change_detected = False
    for original_verb, suspicious_verb in zip(original_verbs, suspicious_verbs):
        if original_verb != suspicious_verb:
            print("Cambio de tiempo verbal detectado:")
            print("Original:", original_verb)
            print("Sospechoso:", suspicious_verb)
            time_change_detected = True
    
    if not time_change_detected:
        print("No se detectaron cambios de tiempo verbal.")

In [None]:

def detect_voice_change(original_text, suspicious_text):
    # Tokenización y etiquetado de partes del habla
    original_tokens = word_tokenize(original_text)
    suspicious_tokens = word_tokenize(suspicious_text)
    
    original_pos_tags = pos_tag(original_tokens)
    suspicious_pos_tags = pos_tag(suspicious_tokens)
    
    # Extracción de verbos y sus formas base
    original_verbs = [word for word, pos in original_pos_tags if pos.startswith('VB')]
    suspicious_verbs = [word for word, pos in suspicious_pos_tags if pos.startswith('VB')]
    
    # Determinar si hay un cambio en la voz
    voice_change_detected = False
    for original_verb, suspicious_verb in zip(original_verbs, suspicious_verbs):
        original_voice = detect_verb_voice(original_verb)
        suspicious_voice = detect_verb_voice(suspicious_verb)
        
        if original_voice != suspicious_voice:
            print("Cambio de voz detectado:")
            print("Original:", original_verb, "(", original_voice, ")")
            print("Sospechoso:", suspicious_verb, "(", suspicious_voice, ")")
            voice_change_detected = True
    
    if not voice_change_detected:
        print("No se detectaron cambios de voz.")

def detect_verb_voice(verb):
    """
    Esta función utiliza una lista simple de verbos auxiliares
    para determinar si un verbo está en voz activa o pasiva.
    """
    active_verbs = ['am', 'is', 'are', 'was', 'were', 'be', 'being', 'been', 'have', 'has', 'had', 'do', 'does', 'did']
    if verb.lower() in active_verbs:
        return "Activa"
    else:
        return "Pasiva"