In [1]:
import numpy as np
import nltk
import os
import difflib
from gensim.models import Word2Vec
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
nltk.download('punkt')
from nltk import pos_tag

lemmatizer = WordNetLemmatizer() #lemmatizer algorithm
lancStemmer = LancasterStemmer()  # stemming algorithm Lancaster

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/galafloresgarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/galafloresgarcia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### PREPROCESAMIENTO

In [2]:
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = [palabra.lower() for palabra in re.findall(r'\w+', text.lower())]
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [3]:
def get_grams(text, ngram, method):
    result = []

    if method == 'lemmatize':
        text = get_lemmatizer(text)
        if ngram == 0:  # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text

        else:
            text = text.split()
            grams = ngrams(text, ngram)
            for ng in grams:
                result.append(' '.join(ng))
    elif method == 'stemmer':
        text = get_stemmer(text)
        if ngram == 0:  # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text

        else:
            text = text.split()
            grams = ngrams(text, ngram)
            for ng in grams:
                result.append(' '.join(ng))
    else:
        raise ValueError('Method not found')

    return result

### PROCESAMIENTO
Para documentos totales (sergio)

In [52]:
def preprocess_docs(folder_path, ngram, method):
    """
    Esta función recibe la ruta de una carpeta con documentos de texto 
    y retorna una lista de documentos preprocesados y taggeados para el modelo
    """
    tagged_documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                # Preprocesamiento de texto
                grams = get_grams(text, ngram, method)
                # Ensure words are split into a list of strings and then converted to tuple
                words = tuple(word.split() for word in grams)
                # Flatten the list of lists into a single list of strings
                words = [word for sublist in words for word in sublist]
                tagged_documents.append(TaggedDocument(words=words, tags=[fileid]))

    return tagged_documents

Obtiene los documentos tageados y sus palabras de los documentos preprocesadas en una listad de listas

In [53]:
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados)
folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales

tagged_originals = preprocess_docs(folder_path_og, 1, 'lemmatize')
tagged_plagiarized = preprocess_docs(folder_path, 1, 'lemmatize')  

## ENTRENAMIENTO DE MODELO 
aqui voy a poner solo los originales para entrenamiento

In [54]:
def train_doc2vec(tagged_documents):
    model = Doc2Vec(vector_size=100, window=5, min_count=1, epochs=200,
                    dm=0)  # dm=0 for distributed bag of words (DBOW) mode
    model.build_vocab(tagged_documents)
    model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [55]:
# Training the Doc2Vec model
model = train_doc2vec(tagged_originals + tagged_plagiarized)

### Función para calcular la similitud entre los conjuntos creando vectores con el modelo

In [56]:
def calculate_similarity_doc2vec(doc1, doc2, model):
    vec1 = model.infer_vector(doc1.words)
    vec2 = model.infer_vector(doc2.words)
    similarity = model.dv.similarity(doc1.tags[0], doc2.tags[0])
    return similarity

In [57]:
similarity_results = []
#plagiarism_type = '' 

# Iterating over each plagiarized text
for plagio_doc in tagged_plagiarized:
    max_similarity = 0
    most_similar = ''
    most_similar_doc = ''

    # Comparing with each original document
    for original_doc in tagged_originals:
        similarity = calculate_similarity_doc2vec(plagio_doc, original_doc, model)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = original_doc.tags[0]
            most_similar_doc = original_doc.words

    similarity_results.append([plagio_doc.tags[0], most_similar, max_similarity, most_similar_doc])

        

# Sorting results by similarity in descending order
similarity_results.sort(key=lambda x: x[2], reverse=True)

# Printing results
for result in similarity_results:
    plagio_title, original_title, similarity_score, original_doc = result
    print(f"Similarity between '{plagio_title}' and '{original_title}': {similarity_score * 100:.2f}%")

Similarity between 'FID-04.txt' and 'org-045.txt': 99.76%
Similarity between 'FID-03.txt' and 'org-016.txt': 99.52%
Similarity between 'FID-05.txt' and 'org-085.txt': 96.03%
Similarity between 'FID-09.txt' and 'org-109.txt': 93.64%
Similarity between 'FID-08.txt' and 'org-079.txt': 92.34%
Similarity between 'FID-06.txt' and 'org-043.txt': 92.03%
Similarity between 'FID-07.txt' and 'org-041.txt': 88.40%
Similarity between 'FID-02.txt' and 'org-104.txt': 87.87%
Similarity between 'FID-10.txt' and 'org-007.txt': 87.18%
Similarity between 'FID-01.txt' and 'org-076.txt': 77.71%


# PROCESO POR ORACION 
(gala)

In [40]:
# Training the Doc2Vec model
model = train_doc2vec(tagged_originals + tagged_plagiarized)

In [58]:
def similarity_doc2vec_sentences(doc1, doc2, model):
    vec1 = model.infer_vector(doc1.words)
    vec2 = model.infer_vector(doc2.words)
    similarity = model.dv.similarity(doc1.tags[0], doc2.tags[0])
    return similarity

In [91]:
def buscar_y_tokenizar(directorio, nombre_archivo):
    """
    Esta función recibe la ruta de un directorio y el nombre de un archivo
    y retorna una lista de oraciones tokenizadas del archivo
    ejemplo de salida: ['Primera oración.', 'Segunda oración.']
    """
    for filename in os.listdir(directorio):
        if filename == nombre_archivo:
            filepath = os.path.join(directorio, filename)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                print("TEXT",text)
                #text = get_lemmatizer(text)
                sentences = nltk.sent_tokenize(text)
                print("ORACIONES: ",sentences)
                return sentences
    return None

In [92]:

def encontrar_coincidencias(sentences_originales, sentences_plagiados, model):
    coincidencias = []
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for sentence_orig in sentences_originales:
        for sentence_plag in sentences_plagiados:
            similarity = calculate_similarity_doc2vec(sentence_orig, sentence_plag, model)
            if similarity > 0.8:
                coincidencias.append({
                    "cadena_orig": sentence_orig,
                    "cadena_plag": sentence_plag,
                    "similitud": similarity
                })
                if sentence_orig == sentence_plag:
                    TP += 1
                else:
                    FP += 1
            else:
                if sentence_orig not in sentences_plagiados:
                    TN += 1
                else:
                    FN += 1

    matriz_auc = {'TP': TP, 'FP': FP, 'TN': TN, 'FN': FN}
    print(matriz_auc)
    return coincidencias, matriz_auc

In [88]:
def preprocess_sentences(sentences):
    tagged_sentences = []
    for i, sentence in enumerate(sentences):
        tagged_sentence = TaggedDocument(words=nltk.word_tokenize(sentence.lower()), tags=[str(i)])
        tagged_sentences.append(tagged_sentence)
    return tagged_sentences

In [93]:

total_coincidencias = []
total_TP = 0
total_FP = 0
total_TN = 0
total_FN = 0

for titulo in similarity_results:
    resultados = []
    sentences_originales = buscar_y_tokenizar(folder_path_og, titulo[1])
    sentences_plagiados = buscar_y_tokenizar(folder_path, titulo[0])
    print(f"Titulo: {titulo[0]}")

    if sentences_originales and sentences_plagiados:
        tagged_sentences_originales = preprocess_sentences(sentences_originales)
        tagged_sentences_plagiados = preprocess_sentences(sentences_plagiados)
        model = train_doc2vec(tagged_sentences_originales + tagged_sentences_plagiados)
        similitud = titulo[2]
        print(f"Similitud entre '{titulo[0]}' y '{titulo[1]}': {similitud * 100:.2f}%")
        coincidencias, matriz_auc = encontrar_coincidencias(tagged_sentences_originales, tagged_sentences_plagiados, model)
        total_TP += matriz_auc['TP']
        total_FP += matriz_auc['FP']
        total_TN += matriz_auc['TN']
        total_FN += matriz_auc['FN']
        total_coincidencias.extend(coincidencias)

        print(f"Coincidencias para '{titulo[0]}' y '{titulo[1]}':")
        for coincidencia in coincidencias:
            print(f"Cadena original: {coincidencia['cadena_orig']} (Similitud: {coincidencia['similitud']})")
            print(f"Cadena plagiada: {coincidencia['cadena_plag']}")
            print()
    else:
        print(f"No se encontraron oraciones en '{titulo[0]}' o '{titulo[1]}'")
        print()
    print("----------------------------\n")

# Calculando TPR, FPR y AUC
TPR = total_TP / (total_TP + total_FN) if (total_TP + total_FN) != 0 else 0
FPR = total_FP / (total_FP + total_TN) if (total_FP + total_TN) != 0 else 0
AUC = (1 + TPR - FPR) / 2

# Imprimiendo los valores calculados
print(f"TPR (Tasa de Verdaderos Positivos): {TPR:.2f}")
print(f"FPR (Tasa de Falsos Positivos): {FPR:.2f}")
print(f"AUC (Área bajo la curva ROC): {AUC:.2f}")


TEXT Interactive software agents, such as chatbots, are progressively being used in the area of health and well-being. In such applications, where agents engage with users in interpersonal conversations for, e.g., coaching, comfort or behavior-change interventions, there is an increased need for understanding agentsâ empathic capabilities. In the current state-of-the-art, there are no tools to do that. In order to understand empathic capabilities in interactive software agents, we need a precise notion of empathy. The literature discusses a variety of definitions of empathy, but there is no consensus of a formal definition. Based on a systematic literature review and a qualitative analysis of recent approaches to empathy in interactive agents for health and well-being, a formal definitionâan ontologyâof empathy is developed. We present the potential of the formal definition in a controlled user-study by applying it as a tool for assessing empathy in two state-of-the-art health an