### Librerias

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import os
import difflib

from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
import gensim.downloader as api
import re

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sergiogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
lemmatizer = WordNetLemmatizer() #lemmatizer algorithm
lancStemmer = LancasterStemmer()  # stemming algorithm Lancaster
word_vectors = api.load("glove-wiki-gigaword-100")

### Funciones de preprocesamiento a nivel de palabras

In [6]:
def remove_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    palabras = [palabra.lower() for palabra in re.findall(r'\w+', text.lower())]
    text_lista = []
    for palabra in palabras:
        if palabra not in stopwords:
            text_lista.append(palabra)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [7]:
def get_lemmatizer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lemmatizer.lemmatize(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [9]:
def get_stemmer(text):
    palabras = remove_stopwords(text)
    palabras = palabras.split()
    text_lista = []
    for palabra in palabras:
        nueva = lancStemmer.stem(palabra)
        text_lista.append(nueva)
    nuevo_texto = ' '.join(text_lista)
    return nuevo_texto

In [10]:
def get_grams(text, ngram, method):
    if method == 'lemmatize':
        text = get_lemmatizer(text)
    elif method == 'stemmer':
        text = get_stemmer(text)
    else:
        raise ValueError('Method not found')
        
    text = text.split() 
    if ngram == 0:
        raise ValueError('ngram must be greater than 0 and less than 3')
    else:
        grams = ngrams(text, ngram)  
        result = []
        for ng in grams:
            result.append(' '.join(ng))
        return result

In [12]:
def make_matrix(text_1, text_2):
    text_set = set(text_1 + text_2)
    text_list = [text_1, text_2]
    matrix = []
    for text in text_list:
        vector = []
        for word in text_set:
            if word in text:
                vector.append(1 if text.count(word) == 1 else text.count(word))
            else:
                vector.append(0)
        matrix.append(vector)
    return matrix

In [14]:
def pre_process(folder_path, ngram, method):
    texto_preprocesado = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                grams = get_grams(text, ngram, method)
                texto_preprocesado.append((fileid, grams))
    return texto_preprocesado

### Llamado a las funciones 

In [19]:
# Obtener n-gramas preprocesados
folder_path = "../../textos_plagiados"  # Ruta de la carpeta con los textos plagiados
preprocess_plagiados = pre_process(folder_path, 3, 'lemmatize')

folder_path_og = "../../docs_originales"  # Ruta de la carpeta con los textos originales
preprocess_originales = pre_process(folder_path_og, 3, 'lemmatize')

### Calculo de similitud de coseno por documento

In [28]:
from pprint import pprint

# Lista para almacenar las similitudes ordenadas
resultados = []

# Iterar sobre las listas y calcular la similitud de coseno
for id_plagiado, (name_plagiado, grams_plagiado) in enumerate(preprocess_plagiados, 1):
    for id_original, (name_original, grams_original) in enumerate(preprocess_originales, 1):
        
        similitud = cosine_similarity(make_matrix(grams_plagiado, grams_original))
        if similitud[0][1] != 0.0 and similitud[0][1] >= 0.2:
            resultados.append([name_plagiado, name_original, similitud[0][1]])

resultados.sort(key=lambda x: x[2], reverse=True)

# Utilizar pprint en lugar de print para un output más ordenado
print('Similitud por documento: \n ')
pprint(resultados)

Similitud por documento: 
 
[['FID-04.txt', 'org-045.txt', 0.9097744360902256],
 ['FID-03.txt', 'org-016.txt', 0.9063031052534367],
 ['FID-09.txt', 'org-109.txt', 0.8641489186670575],
 ['FID-08.txt', 'org-079.txt', 0.8619111987560517],
 ['FID-05.txt', 'org-085.txt', 0.859999999999999],
 ['FID-02.txt', 'org-104.txt', 0.6631210306331439],
 ['FID-01.txt', 'org-076.txt', 0.631683937674689],
 ['FID-06.txt', 'org-043.txt', 0.6000272127355744],
 ['FID-10.txt', 'org-007.txt', 0.5786876586795143],
 ['FID-07.txt', 'org-041.txt', 0.577178719558981]]


### Desarrollo del modelo

In [29]:
from gensim.models import FastText

In [32]:
# Entrenamiento del modelo FastText
def train_fasttext_model(folder_path, vector_size=100, window=5, min_count=1, epochs=10):
    corpus = []
    for fileid in os.listdir(folder_path):
        if fileid.endswith(".txt"):
            filepath = os.path.join(folder_path, fileid)
            with open(filepath, 'r', encoding='latin1', errors='ignore') as file:
                text = file.read()
                preprocessed_text = pre_process(folder_path_og, 3, 'lemmatize')
                corpus.append(preprocessed_text)
    model = FastText(vector_size=vector_size, window=window, min_count=min_count)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=epochs)
    return model

TypeError: Either one of corpus_file or corpus_iterable value must be provided

In [31]:
folder_path_plagiados = "../../textos_plagiados"
folder_path_originales = "../../docs_originales"

fasttext_model = train_fasttext_model(folder_path_originales)

TypeError: Either one of corpus_file or corpus_iterable value must be provided