### Librerias

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import os
import difflib
from gensim.models import Word2Vec
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re
nltk.download('punkt')

In [None]:
lemmatizer = WordNetLemmatizer() #lemmatizer algorithm
lancStemmer = LancasterStemmer()  # stemming algorithm Lancaster

### Funciones de preprocesamiento a nivel de palabras

In [215]:
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = [palabra.lower() for palabra in re.findall(r'\w+', text.lower())]
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [None]:
def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [None]:
def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [205]:
def get_grams(text, ngram, method):
    result = []
    
    if method == 'lemmatize':
        text = get_lemmatizer(text)
        if ngram == 0: # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text
            
        else:
            text = text.split() 
            grams = ngrams(text, ngram)  
            for ng in grams:
                result.append(' '.join(ng))
    elif method == 'stemmer':
        text = get_stemmer(text)
        if ngram == 0: # Si ngram es 0, se retorna el texto completo sin ngramas
            text = nltk.sent_tokenize(text)
            text = ' '.join(text)
            return text
            
        else:
            text = text.split() 
            grams = ngrams(text, ngram)  
            for ng in grams:
                result.append(' '.join(ng))
    else:
        raise ValueError('Method not found')
        

    return result


In [206]:
def token_sentence(text):
    sentences = nltk.sent_tokenize(text)
    filtered_sentences = []
    for sentence in sentences:
        filtered_words = remove_stopwords(sentence)
        filtered_sentences.append(filtered_words)

    return filtered_sentences

In [207]:
# FUNCION QUE INCLUYE EL USO DE NGRAMAS 
def pre_process(folder_path, ngram, method):
    texto_preprocesado = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                grams = get_grams(text, ngram, method)
                texto_preprocesado.append((fileid, grams))
    
        
    return texto_preprocesado

In [None]:
def make_matrix(text_1, text_2):
    text_set = set(text_1, text_2)
    text_list = [text_1, text_2]
    matrix = []

    for text_item in text_list:
        vector = []
        for word in text_set:
          vector.append(1 if word in text_item else 0) #compara las palabras de los grams a la palabra y agrega o un 1 o un 0 al vector del parrafo
        matrix.append(vector)
    return matrix

### Llamado a las funciones 

In [120]:
# Obtener n-gramas preprocesados
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados
preprocess_plagiados = pre_process(folder_path, 3, 'lemmatize')

folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales
preprocess_originales = pre_process(folder_path_og, 3, 'lemmatize')

### Embeddings

In [121]:
from gensim.models import FastText

In [122]:
# Entrena un solo modelo
training_data = [text[1] for text in preprocess_originales] + [text[1] for text in preprocess_plagiados]

In [123]:
model = FastText(training_data, vector_size=100, window=5, min_count=1, epochs=200, sg=1)

In [131]:
#Guarda el modelo entrenado
model.save("word2vec_model.bin")
# Cargar el modelo Word2Vec entrenado
model = Word2Vec.load("word2vec_model.bin")

In [132]:
from sklearn.metrics.pairwise import cosine_similarity
def calculate_similarity(text1, text2, model):
    text1_vec = [model.wv.get_vector(word) for word in text1 if word in model.wv.key_to_index]
    text2_vec = [model.wv.get_vector(word) for word in text2 if word in model.wv.key_to_index]
    if text1_vec and text2_vec:
        similarity = cosine_similarity([np.mean(text1_vec, axis=0)], [np.mean(text2_vec, axis=0)])[0][0]
        return similarity
    else:
        return 0.0

In [136]:
# Lista para almacenar los resultados de similitud
similarity_results = []

# Iterar sobre cada texto con plagio
for plagio_text in preprocess_plagiados:
    max_similarity = 0
    most_similar = ''
    most_similar_text = ''

    # Comparar con cada texto original
    for original_text in preprocess_originales:
        similarity = calculate_similarity(plagio_text[1], original_text[1], model)  # Utilizar el modelo Word2Vec
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = original_text[0]
            most_similar_text = original_text[1]

    # Almacenar el resultado de similitud
    similarity_results.append([plagio_text[0],most_similar, max_similarity])

# Ordenar los resultados por similitud de manera descendente
similarity_results.sort(key=lambda x: x[2], reverse=True)

In [137]:
similarity_results

[['FID-04.txt', 'org-045.txt', 0.99995315],
 ['FID-05.txt', 'org-085.txt', 0.9998287],
 ['FID-09.txt', 'org-109.txt', 0.99917597],
 ['FID-06.txt', 'org-043.txt', 0.9984729],
 ['FID-03.txt', 'org-016.txt', 0.9981569],
 ['FID-08.txt', 'org-079.txt', 0.99787873],
 ['FID-07.txt', 'org-041.txt', 0.99684525],
 ['FID-10.txt', 'org-007.txt', 0.99077153],
 ['FID-01.txt', 'org-076.txt', 0.9730175],
 ['FID-02.txt', 'org-104.txt', 0.9608251]]

### DOC 2 VEC

In [185]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [221]:
# Preprocessing function for documents
def preprocess_docs(folder_path):
    """
    Preprocesses the documents in the given folder path.
    """
    tagged_documents = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                tagged_documents.append(TaggedDocument(words=text.split(), tags=[fileid]))
    print(tagged_documents)
    return tagged_documents

In [222]:
# Training Doc2Vec model
def train_doc2vec(tagged_documents):
    """
    Trains the Doc2Vec model with the given tagged documents.
    """
    model = Doc2Vec(vector_size=100, window=2, min_count=1, epochs=100, dm=0)  # dm=0 for distributed bag of words (DBOW) mode
    model.build_vocab(tagged_documents)
    model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [223]:
# Function to calculate similarity between documents using Doc2Vec embeddings
def calculate_similarity_doc2vec(doc1, doc2, model):
    """
    Calculates the similarity between two documents using Doc2Vec embeddings.
    """
    vec1 = model.infer_vector(doc1.words)
    vec2 = model.infer_vector(doc2.words)
    similarity = model.dv.similarity(doc1.tags[0], doc2.tags[0])
    return similarity

In [224]:
# Obtener n-gramas preprocesados
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados
preprocess_plagiados = pre_process(folder_path, 0, 'lemmatize')

folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales
preprocess_originales = pre_process(folder_path_og, 0, 'lemmatize')

In [237]:
# Preprocessing original and plagiarized documents
tagged_originals_2 = preprocess_originales
tagged_plagiarized_2 = preprocess_plagiados

In [242]:
tagged_originales = preprocess_docs(folder_path_og)
tagged_plagiados = preprocess_docs(folder_path)

[TaggedDocument(words=['Adaptation', 'and', 'innovation', 'are', 'extremely', 'important', 'to', 'the', 'manufacturing', 'industry.', 'This', 'development', 'should', 'lead', 'to', 'sustainable', 'manufacturing', 'using', 'new', 'technologies.', 'To', 'promote', 'sustainability,', 'smart', 'production', 'requires', 'global', 'perspectives', 'of', 'smart', 'production', 'application', 'technology.', 'In', 'this', 'regard,', 'thanks', 'to', 'intensive', 'research', 'efforts', 'in', 'the', 'field', 'of', 'artificial', 'intelligence', '(AI),', 'a', 'number', 'of', 'AI-based', 'techniques,', 'such', 'as', 'machine', 'learning,', 'have', 'already', 'been', 'established', 'in', 'the', 'industry', 'to', 'achieve', 'sustainable', 'manufacturing.', 'Thus,', 'the', 'aim', 'of', 'the', 'present', 'research', 'was', 'to', 'analyze,', 'systematically,', 'the', 'scientific', 'literature', 'relating', 'to', 'the', 'application', 'of', 'artificial', 'intelligence', 'and', 'machine', 'learning', '(ML)',

In [238]:
print(type(tagged_originals_2 + tagged_plagiarized_2))

<class 'list'>


In [240]:
type(tagged_originales + tagged_plagiados)

list

In [243]:
# Training the Doc2Vec model
model = train_doc2vec(tagged_originals + tagged_plagiarized)

AttributeError: 'tuple' object has no attribute 'words'

In [225]:
# List to store similarity results
similarity_results = []

# Iterating over each plagiarized text
for plagio_doc in tagged_plagiados:
    max_similarity = 0
    most_similar = ''
    most_similar_doc = ''

    # Comparing with each original document
    for original_doc in tagged_originals:
        similarity = calculate_similarity_doc2vec(plagio_doc, original_doc, model)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = original_doc.tags[0]
            most_similar_doc = original_doc.words

    similarity_results.append([plagio_doc.tags[0], most_similar, max_similarity, most_similar_doc])
    
# Sorting results by similarity in descending order
similarity_results.sort(key=lambda x: x[2], reverse=True)

# Printing results
for result in similarity_results:
    plagio_title, original_title, similarity_score, original_doc = result
    print(f"Similarity between '{plagio_title}' and '{original_title}': {similarity_score * 100:.2f}%")

AttributeError: 'tuple' object has no attribute 'words'

### other

In [107]:
!pip install tensorflow-hub

Collecting tensorflow-hub
  Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl (30 kB)
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.16.1


In [111]:
import os
import numpy as np
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

# Function to read text from file
def read_text_from_file(file_path):
    with open(file_path, 'r', encoding='latin1', errors='ignore') as file:
        text = file.read()
        return text

# Function to calculate similarity using Universal Sentence Encoder
def calculate_similarity(text1, text2, model):
    embeddings = model([text1, text2])
    similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return similarity

# Function to compare texts in two folders
def compare_texts(folder_a, folder_b):
    # Load Universal Sentence Encoder model
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    similarities = []
    for file_a in os.listdir(folder_a):
        if file_a.endswith(".txt"):
            text_a = read_text_from_file(os.path.join(folder_a, file_a))
            for file_b in os.listdir(folder_b):
                if file_b.endswith(".txt"):
                    text_b = read_text_from_file(os.path.join(folder_b, file_b))
                    similarity = calculate_similarity(text_a, text_b, use_model)
                    similarities.append((file_a, file_b, similarity))
    return similarities

# Example usage
folder_a = "../../textos_plagiados"
folder_b = "../../docs_originales"
similarities = compare_texts(folder_a, folder_b)
resultados = []
for similarity in similarities:
           resultados.append([similarity[0], similarity[1], similarity[2]])
    # print(f"Documento analizado: {similarity[0]}")
    # print(f"Similitud de Coseno entre {similarity[0]} y {similarity[1]}: {similarity[2]}")

In [112]:
resultados.sort(key=lambda x: x[2], reverse=True) 

In [118]:
for resultado in resultados:
    if resultado[2] > 0.8:
        print(f"Similitud de Coseno entre {resultado[0]} y {resultado[1]}: {resultado[2]}")

Similitud de Coseno entre FID-04.txt y org-045.txt: 0.9990469813346863
Similitud de Coseno entre FID-03.txt y org-016.txt: 0.991265594959259
Similitud de Coseno entre FID-09.txt y org-109.txt: 0.9799057245254517
Similitud de Coseno entre FID-05.txt y org-085.txt: 0.9765050411224365
Similitud de Coseno entre FID-07.txt y org-041.txt: 0.9757723808288574
Similitud de Coseno entre FID-08.txt y org-079.txt: 0.9751248359680176
Similitud de Coseno entre FID-10.txt y org-007.txt: 0.9536784887313843
Similitud de Coseno entre FID-06.txt y org-043.txt: 0.9395915269851685
Similitud de Coseno entre FID-02.txt y org-104.txt: 0.8302781581878662
Similitud de Coseno entre FID-01.txt y org-076.txt: 0.8004879951477051
