In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import os
import difflib

from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re

In [287]:
from gensim.models import FastText
import nltk
import os
from nltk.util import ngrams
import re

# Descargar recursos de NLTK si no los tienes
nltk.download('stopwords')
nltk.download('wordnet')

# Inicializar el lematizador y el stemmer de NLTK
lemmatizer = nltk.stem.WordNetLemmatizer()
lancStemmer = nltk.stem.LancasterStemmer()

# Función para eliminar stopwords
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = [palabra.lower() for palabra in re.findall(r'\w+', text.lower())]
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

# Función para lematizar texto
def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

# Función para stemming
def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

# Función para obtener n-gramas
def get_grams(text, ngram, method):
    if method == 'lemmatize':
        text = get_lemmatizer(text)
    elif method == 'stemmer':
        text = get_stemmer(text)
    else:
        raise ValueError('Method not found')
        
    text = text.split() 
    if ngram == 0:
        raise ValueError('ngram must be greater than 0 and less than 3')
    else:
        grams = ngrams(text, ngram)  
        result = []
        for ng in grams:
            result.append(' '.join(ng))
        return result

# Función para preprocesar los documentos
def preprocess_documents(folder_path, ngram, method):
    documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                grams = get_grams(text, ngram, method)
                documents.append(grams)
    return documents


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [288]:
# Entrenamiento del modelo FastText
def train_fasttext_model(documents, vector_size=100, window=5, min_count=1, epochs=10):
    model = FastText(vector_size=vector_size, window=window, min_count=min_count)
    model.build_vocab(corpus_iterable=documents)  # Pasar documentos como corpus_iterable
    model.train(corpus_iterable=documents, total_examples=len(documents), epochs=epochs)
    return model

In [None]:
# Ruta de los documentos originales
folder_path_originales = "../../docs_originales"
# Preprocesar los documentos originales
documents_originales = preprocess_documents(folder_path_originales, ngram=3, method='lemmatize')

In [289]:
fasttext_model = train_fasttext_model(documents_originales)

In [322]:
# Calcular embeddings para n-gramas
def calculate_embeddings(text, model):
    embeddings = []
    for ngram in text:
        embedding_sum = np.zeros(model.vector_size)
        for word in ngram:
            if word in model.wv.key_to_index:
                embedding_sum += model.wv[word]
        embeddings.append(embedding_sum)
    return embeddings

# Calcular la similitud de coseno entre los documentos plagiados y originales
def calculate_similarity(embeddings_plagiado, embeddings_original):
    similitud_matrix = cosine_similarity(embeddings_plagiado, embeddings_original)
    return similitud_matrix

In [323]:
folder_path_plagio= "../../textos_plagiados"
preprocessed_text_plagiado = preprocess_documents(folder_path_plagio, ngram=3, method='lemmatize')

preprocessed_text_original = preprocess_documents(folder_path_originales, ngram=3, method='lemmatize')

In [318]:
preprocessed_text_plagiado

[['study conducted investigate',
  'conducted investigate empathy',
  'investigate empathy human',
  'empathy human chatbot',
  'human chatbot interaction',
  'chatbot interaction among',
  'interaction among computer',
  'among computer science',
  'computer science student',
  'science student uppsala',
  'student uppsala university',
  'uppsala university sweden',
  'university sweden done',
  'sweden done exploring',
  'done exploring participant',
  'exploring participant perceived',
  'participant perceived anthropomorphic',
  'perceived anthropomorphic chatbots',
  'anthropomorphic chatbots machine',
  'chatbots machine human',
  'machine human existence',
  'human existence verbal',
  'existence verbal abuse',
  'verbal abuse human',
  'abuse human chatbot',
  'human chatbot interaction',
  'chatbot interaction expectation',
  'interaction expectation chatbot',
  'expectation chatbot helpfulness',
  'chatbot helpfulness depending',
  'helpfulness depending gender',
  'depending

In [310]:
preprocessed_text_original

[['adaptation innovation extremely',
  'innovation extremely important',
  'extremely important manufacturing',
  'important manufacturing industry',
  'manufacturing industry development',
  'industry development lead',
  'development lead sustainable',
  'lead sustainable manufacturing',
  'sustainable manufacturing using',
  'manufacturing using new',
  'using new technology',
  'new technology promote',
  'technology promote sustainability',
  'promote sustainability smart',
  'sustainability smart production',
  'smart production requires',
  'production requires global',
  'requires global perspective',
  'global perspective smart',
  'perspective smart production',
  'smart production application',
  'production application technology',
  'application technology regard',
  'technology regard thanks',
  'regard thanks intensive',
  'thanks intensive research',
  'intensive research effort',
  'research effort field',
  'effort field artificial',
  'field artificial intelligence',

In [311]:
embeddings_plagiado = calculate_embeddings(preprocessed_text_plagiado, fasttext_model)

In [312]:
embeddings_original = calculate_embeddings(preprocessed_text_original, fasttext_model)

In [319]:
embeddings_original

[array([ -94.89772398,  213.89456579,  -58.79925359,   47.88545042,
          83.32572998,   26.0237124 ,  113.21138042,  -35.00550325,
         119.98949245,  -69.80400616,   44.20913488,   21.21712041,
          12.47476233,   21.99532885,   26.11861653,  -13.77828936,
          51.11326773,   12.68567459,  -74.65827288,  -44.1098161 ,
         -50.30192884, -100.15023065,   20.45107637,  -55.70095384,
         130.12029725,  -51.67612446,   46.5902219 ,  -66.24746177,
          94.59982046,   42.63908271,   31.60519397,   78.59391357,
           6.54674681,    7.93351418,  -37.25616769,  -67.15289953,
          42.59627968,   33.51958684,  -30.99470361,  -40.44394408,
          54.87123825,  -41.34284735,   24.67366991,   -0.70656919,
          27.43649987,  -21.07063955,  -87.1498497 ,   23.69026071,
          37.78512883,   -5.52074094,  -38.77830561,  -26.30602074,
           2.74436507,    6.19907265,   31.32149441,  -97.67263124,
        -128.63367255,  -51.71143173,  -26.69911

In [320]:
embeddings_plagiado

[array([-30.95646614,  69.75166878, -19.17500589,  15.60899881,
         27.17816669,   8.48315029,  36.88937971, -11.41182243,
         39.1542961 , -22.75152579,  14.43934335,   6.88879756,
          4.06438822,   7.16649427,   8.52087552,  -4.47831598,
         16.6348307 ,   4.14474182, -24.32745181, -14.4129338 ,
        -16.40828726, -32.63232763,   6.65358396, -18.15493737,
         42.42014177, -16.8626586 ,  15.17706662, -21.57767724,
         30.82793918,  13.8957016 ,  10.31515672,  25.6275105 ,
          2.12689342,   2.60282484, -12.13984454, -21.89688716,
         13.89193518,  10.92982851, -10.11115612, -13.16539605,
         17.86966511, -13.49081103,   8.04121738,  -0.2707657 ,
          8.96032433,  -6.87546501, -28.40529755,   7.72211267,
         12.30345032,  -1.79684916, -12.67046717,  -8.60922898,
          0.89887631,   1.99777119,  10.22909005, -31.86684039,
        -41.96019734, -16.84896952,  -8.70251876,  -7.79665904,
        -12.319367  ,  10.5914852 , -43.

In [325]:
from pprint import pprint

# Lista para almacenar las similitudes ordenadas
resultados = []

# Iterar sobre las listas y calcular la similitud de coseno
for grams_plagiado in preprocessed_text_plagiado:
    print(grams_plagiado)
    for grams_original in preprocessed_text_original:
        print(grams_original)
        similitud = calculate_similarity(embeddings_plagiado, embeddings_original)
        if similitud >= 0.2:  # Ajusta el umbral de similitud según tus necesidades
            resultados.append([grams_plagiado, grams_original, similitud])

# Ordenar los resultados por similitud descendente
resultados.sort(key=lambda x: x[2], reverse=True)

# Utilizar pprint en lugar de print para un output más ordenado
print('Similitud por documento: \n ')
pprint(resultados[2])

['study conducted investigate', 'conducted investigate empathy', 'investigate empathy human', 'empathy human chatbot', 'human chatbot interaction', 'chatbot interaction among', 'interaction among computer', 'among computer science', 'computer science student', 'science student uppsala', 'student uppsala university', 'uppsala university sweden', 'university sweden done', 'sweden done exploring', 'done exploring participant', 'exploring participant perceived', 'participant perceived anthropomorphic', 'perceived anthropomorphic chatbots', 'anthropomorphic chatbots machine', 'chatbots machine human', 'machine human existence', 'human existence verbal', 'existence verbal abuse', 'verbal abuse human', 'abuse human chatbot', 'human chatbot interaction', 'chatbot interaction expectation', 'interaction expectation chatbot', 'expectation chatbot helpfulness', 'chatbot helpfulness depending', 'helpfulness depending gender', 'depending gender dynamic', 'gender dynamic semi', 'dynamic semi structur

KeyboardInterrupt: 