### Librerias

In [583]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import os
import difflib
from gensim.models import Word2Vec
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [584]:
lemmatizer = WordNetLemmatizer() #lemmatizer algorithm
lancStemmer = LancasterStemmer()  # stemming algorithm Lancaster

### Funciones de preprocesamiento a nivel de palabras

In [585]:
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = nltk.word_tokenize(text)
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [586]:
def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [587]:
def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [627]:
def get_grams(text, ngram, method):
    result = []
    if method == 'lemmatize':
        text = get_lemmatizer(text)
    elif method == 'stemmer':
        text = get_stemmer(text)
    else:
        raise ValueError('Method not found')
        
    if ngram == 0: # Si ngram es 0, se retorna el texto completo sin ngramas
        text = nltk.sent_tokenize(text)
        text = ' '.join(text)
        return text
        
    else:
        text = text.split() 
        grams = ngrams(text, ngram)  
        for ng in grams:
            result.append(' '.join(ng))
        return result


In [589]:
def token_sentence(text):
    sentences = nltk.sent_tokenize(text)
    filtered_sentences = []
    for sentence in sentences:
        filtered_words = remove_stopwords(sentence)
        filtered_sentences.append(filtered_words)

    return filtered_sentences

In [590]:
# FUNCION QUE INCLUYE EL USO DE NGRAMAS 
def pre_process(folder_path, ngram, method):
    texto_preprocesado = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                grams = get_grams(text, ngram, method)
                sentences = token_sentence(text)
                texto_preprocesado.append((fileid, grams, sentences))
    
        
    return texto_preprocesado

In [591]:
def make_matrix(text_1, text_2):
    text_set = set(text_1, text_2)
    text_list = [text_1, text_2]
    matrix = []

    for text_item in text_list:
        vector = []
        for word in text_set:
          vector.append(1 if word in text_item else 0) #compara las palabras de los grams a la palabra y agrega o un 1 o un 0 al vector del parrafo
        matrix.append(vector)
    return matrix

### Llamado a las funciones 

In [644]:
# Obtener n-gramas preprocesados
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados
preprocess_plagiados = pre_process(folder_path, 1, 'lemmatize')

folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales
preprocess_originales = pre_process(folder_path_og, 1, 'lemmatize')

In [645]:
# Para sentences_originales y sentences_plagiados
sentences_originales = [(text[0], text[2]) for text in preprocess_originales]
sentences_plagiados = [(text[0], text[2]) for text in preprocess_plagiados]

In [646]:
sentences_plagiados

[('FID-06.txt',
  ['This study conducted investigate empathy human-chatbot interactions among computer science students Uppsala University , Sweden .',
   'This done exploring participants perceived anthropomorphic chatbots machines humans , existence verbal abuse human-chatbot interactions , expectation chatbot helpfulness depending gender dynamics .',
   'A semi-structured interview methodology five students conducted qualitative data collection .',
   'The collected data manually analyzed using thematic analysis .',
   'The results study find empathy human-chatbot interaction , regardless whether participants perceive anthropomorphic chatbots humans machines .',
   'However , level empathy generally low participants become frustrated dissatisfied response chatbots exit chatbots without expressing frustration .',
   'They usually forgot frustration came questions another time .',
   'The study also showed participants may expect help politeness chatbots likely female .']),
 ('FID-07.

In [647]:
# Para preprocess_plagiados y preprocess_originales
preprocess_plagiados = [(text[0], text[1]) for text in preprocess_plagiados]
preprocess_originales = [(text[0], text[1]) for text in preprocess_originales]

In [648]:
preprocess_plagiados

[('FID-06.txt',
  ['This',
   'study',
   'conducted',
   'investigate',
   'empathy',
   'human-chatbot',
   'interaction',
   'among',
   'computer',
   'science',
   'student',
   'Uppsala',
   'University',
   ',',
   'Sweden',
   '.',
   'This',
   'done',
   'exploring',
   'participant',
   'perceived',
   'anthropomorphic',
   'chatbots',
   'machine',
   'human',
   ',',
   'existence',
   'verbal',
   'abuse',
   'human-chatbot',
   'interaction',
   ',',
   'expectation',
   'chatbot',
   'helpfulness',
   'depending',
   'gender',
   'dynamic',
   '.',
   'A',
   'semi-structured',
   'interview',
   'methodology',
   'five',
   'student',
   'conducted',
   'qualitative',
   'data',
   'collection',
   '.',
   'The',
   'collected',
   'data',
   'manually',
   'analyzed',
   'using',
   'thematic',
   'analysis',
   '.',
   'The',
   'result',
   'study',
   'find',
   'empathy',
   'human-chatbot',
   'interaction',
   ',',
   'regardless',
   'whether',
   'participant'

### Embeddings

In [649]:
from gensim.models import FastText

In [651]:
# Entrena un solo modelo word2vec con todos los textos
training_data = [text[1] for text in sentences_originales] + [text[1] for text in sentences_plagiados] + [text[1] for text in preprocess_originales] + [text[1] for text in preprocess_plagiados]

In [652]:
training_data

[['Adaptation innovation extremely important manufacturing industry .',
  'This development lead sustainable manufacturing using new technologies .',
  'To promote sustainability , smart production requires global perspectives smart production application technology .',
  'In regard , thanks intensive research efforts field artificial intelligence ( AI ) , number AI-based techniques , machine learning , already established industry achieve sustainable manufacturing .',
  'Thus , aim present research analyze , systematically , scientific literature relating application artificial intelligence machine learning ( ML ) industry .',
  'In fact , introduction Industry 4.0 , artificial intelligence machine learning considered driving force smart factory revolution .',
  'The purpose review classify literature , including publication year , authors , scientific sector , country , institution , keywords .',
  'The analysis done using Web Science SCOPUS database .',
  'Furthermore , UCINET NVivo

In [653]:
model = FastText(training_data, vector_size=100, window=5, min_count=1, epochs=200, sg=1)

In [599]:
# Guarda el modelo entrenado
#model.save("word2vec_model.bin")
# Cargar el modelo Word2Vec entrenado
# model = Word2Vec.load("word2vec_model.bin")

In [654]:
from sklearn.metrics.pairwise import cosine_similarity

# Función para calcular la similitud entre dos textos utilizando Word2Vec
def calculate_similarity(text1, text2, model):
    text1_vec = [model.wv.get_vector(word) for word in text1 if word in model.wv.key_to_index]
    text2_vec = [model.wv.get_vector(word) for word in text2 if word in model.wv.key_to_index]
    if text1_vec and text2_vec:
        similarity = cosine_similarity([np.mean(text1_vec, axis=0)], [np.mean(text2_vec, axis=0)])[0][0]
        return similarity
    else:
        return 0.0

In [655]:
def clasificar_plagio(plagio_text, original_text, model, umbral):
    similarity = calculate_similarity(plagio_text[1], original_text[1], model)
    if similarity >= umbral:
        return "Plagio"
    else:
        return "No Plagio"



In [656]:
# Lista para almacenar los resultados de similitud
similarity_results = []

# Iterar sobre cada texto con plagio
for plagio_text in preprocess_plagiados:
    max_similarity = 0
    most_similar = ''
    most_similar_text = ''

    # Comparar con cada texto original
    for original_text in preprocess_originales:
        similarity = calculate_similarity(plagio_text[1], original_text[1], model)  # Utilizar el modelo Word2Vec
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = original_text[0]
            most_similar_text = original_text[1]

    # Almacenar el resultado de similitud
    similarity_results.append([plagio_text[0], most_similar, max_similarity, most_similar_text])

# Ordenar los resultados por similitud de manera descendente
similarity_results.sort(key=lambda x: x[2], reverse=True)

# Imprimir los resultados
for result in similarity_results:
    plagio_title, original_title, similarity_score, original_text = result

    # Porcentaje de similitud
    similarity_percent = f"{similarity_score * 100:.2f}%"

    # Imprimir título y similitud
    print(f"Titulo: {plagio_title}")
    print(f"Similitud entre '{plagio_title}' y '{original_title}': {similarity_percent}")

    # Imprimir coincidencias entre los textos
    print("Coincidencias para '{plagio_title}' y '{original_title}':")
    print("----------------------------")

    # Imprimir cada par de oraciones coincidentes
    for original_sentence, plagio_sentence in zip(original_text, plagio_text[1]):
        print(f"Cadena original: {original_sentence.strip()} (Longitud: {len(original_sentence.strip())})")
        print(f"Cadena plagiada: {plagio_sentence.strip()} (Longitud: {len(plagio_sentence.strip())})")
        print("----------------------------")

    print("\n")

umbral_plagio = 0.2

Titulo: FID-04.txt
Similitud entre 'FID-04.txt' y 'org-045.txt': 100.00%
Coincidencias para '{plagio_title}' y '{original_title}':
----------------------------
Cadena original: Interactive (Longitud: 11)
Cadena plagiada: ï (Longitud: 1)
----------------------------
Cadena original: software (Longitud: 8)
Cadena plagiada: » (Longitud: 1)
----------------------------
Cadena original: agent (Longitud: 5)
Cadena plagiada: ¿The (Longitud: 4)
----------------------------
Cadena original: , (Longitud: 1)
Cadena plagiada: main (Longitud: 4)
----------------------------
Cadena original: chatbots (Longitud: 8)
Cadena plagiada: idea (Longitud: 4)
----------------------------
Cadena original: , (Longitud: 1)
Cadena plagiada: paper (Longitud: 5)
----------------------------
Cadena original: progressively (Longitud: 13)
Cadena plagiada: substantiation (Longitud: 14)
----------------------------
Cadena original: used (Longitud: 4)
Cadena plagiada: methodological (Longitud: 14)
-----------------------

In [657]:
# Calcular métricas de evaluación
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Lista para almacenar las etiquetas predichas por el modelo
y_pred = []

# Lista para almacenar las etiquetas verdaderas
y_true = []

# Iterar sobre cada texto con plagio
for plagio_text in preprocess_plagiados:
    max_similarity = 0
    most_similar = ''
    most_similar_text = ''

    # Comparar con cada texto original
    for original_text in preprocess_originales:
        similarity = calculate_similarity(plagio_text[1], original_text[1], model)  # Utilizar el modelo Word2Vec
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = original_text[0]
            most_similar_text = original_text[1]

    # Determinar si es un caso de plagio o no
    if max_similarity >= umbral_plagio:
        y_pred.append(1)  # Clasificar como plagio
    else:
        y_pred.append(0)  # Clasificar como no plagio

    # Almacenar la etiqueta verdadera
    if plagio_text[0] in [text[0] for text in preprocess_originales]:
        y_true.append(1)  # El texto es un caso de plagio
    else:
        y_true.append(0)  # El texto no es un caso de plagio

# Calcular las métricas de evaluación
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1-score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
