In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import os
import difflib

from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re

In [287]:
from gensim.models import FastText
import nltk
import os
from nltk.util import ngrams
import re

# Descargar recursos de NLTK si no los tienes
nltk.download('stopwords')
nltk.download('wordnet')

# Inicializar el lematizador y el stemmer de NLTK
lemmatizer = nltk.stem.WordNetLemmatizer()
lancStemmer = nltk.stem.LancasterStemmer()

# Función para eliminar stopwords
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = [palabra.lower() for palabra in re.findall(r'\w+', text.lower())]
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

# Función para lematizar texto
def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

# Función para stemming
def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

# Función para obtener n-gramas
def get_grams(text, ngram, method):
    if method == 'lemmatize':
        text = get_lemmatizer(text)
    elif method == 'stemmer':
        text = get_stemmer(text)
    else:
        raise ValueError('Method not found')
        
    text = text.split() 
    if ngram == 0:
        raise ValueError('ngram must be greater than 0 and less than 3')
    else:
        grams = ngrams(text, ngram)  
        result = []
        for ng in grams:
            result.append(' '.join(ng))
        return result

# Función para preprocesar los documentos
def preprocess_documents(folder_path, ngram, method):
    documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                grams = get_grams(text, ngram, method)
                documents.append(grams)
    return documents

# Entrenamiento del modelo FastText
def train_fasttext_model(documents, vector_size=100, window=5, min_count=1, epochs=10):
    model = FastText(vector_size=vector_size, window=window, min_count=min_count)
    model.build_vocab(corpus_iterable=documents)  # Pasar documentos como corpus_iterable
    model.train(corpus_iterable=documents, total_examples=len(documents), epochs=epochs)
    return model

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:

# Ruta de los documentos originales
folder_path_originales = "../../docs_originales"
# Preprocesar los documentos originales
documents_originales = preprocess_documents(folder_path_originales, ngram=3, method='lemmatize')

In [289]:
fasttext_model = train_fasttext_model(documents_originales)

In [322]:
# Calcular embeddings para n-gramas
def calculate_embeddings(text, model):
    embeddings = []
    for word in text:
        if word in model.wv.key_to_index:
            embeddings.append(model.wv[word])
        else:
            # Si la palabra no está en el vocabulario del modelo, generamos un vector de ceros
            embeddings.append(np.zeros(model.vector_size))
    return embeddings

# Calcular la similitud de coseno entre los documentos plagiados y originales
def calculate_similarity(embeddings_plagiado, embeddings_original):
    similitud = cosine_similarity(embeddings_plagiado, embeddings_original)
    return similitud[0][0]

In [323]:
folder_path_plagio= "../../textos_plagiados"
preprocessed_text_plagiado = preprocess_documents(folder_path_plagio, ngram=3, method='lemmatize')



In [326]:

# Lista para almacenar las similitudes ordenadas
resultados = []
folder_path_plagio= "../../textos_plagiados" 
# Preprocesar el texto plagiado con los mismos argumentos que los documentos originales
preprocessed_text_plagiado = preprocess_documents(folder_path_plagio, ngram=3, method='lemmatize')
embeddings_plagiado = calculate_embeddings(preprocessed_text_plagiado, fasttext_model)

# Preprocesar el texto original con los mismos argumentos que los documentos originales

embeddings_original = calculate_embeddings(preprocessed_text_original, fasttext_model)

# Calcular la similitud de coseno entre los documentos
similitud = calculate_similarity(embeddings_plagiado, embeddings_original)
if similitud >= 0.2:  # Ajusta el umbral de similitud según tus necesidades
    resultados.append([fileid_plagiado, fileid_original, similitud])

# Ordenar los resultados por similitud descendente
resultados.sort(key=lambda x: x[2], reverse=True)

# Imprimir resultados
print('Similitud por documento: \n')
for resultado in resultados:
    print(f"{resultado[0]} - {resultado[1]}: {resultado[2]}")

['study conducted investigate', 'conducted investigate empathy', 'investigate empathy human', 'empathy human chatbot', 'human chatbot interaction', 'chatbot interaction among', 'interaction among computer', 'among computer science', 'computer science student', 'science student uppsala', 'student uppsala university', 'uppsala university sweden', 'university sweden done', 'sweden done exploring', 'done exploring participant', 'exploring participant perceived', 'participant perceived anthropomorphic', 'perceived anthropomorphic chatbots', 'anthropomorphic chatbots machine', 'chatbots machine human', 'machine human existence', 'human existence verbal', 'existence verbal abuse', 'verbal abuse human', 'abuse human chatbot', 'human chatbot interaction', 'chatbot interaction expectation', 'interaction expectation chatbot', 'expectation chatbot helpfulness', 'chatbot helpfulness depending', 'helpfulness depending gender', 'depending gender dynamic', 'gender dynamic semi', 'dynamic semi structur

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()