In [19]:
# Useful libraries
import numpy as np
import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer,  AutoModelForSequenceClassification
from tqdm import tqdm
import os
import pandas as pd
import re
 
# Load bert model
model_path = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path,
                                         do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                         output_attentions=False,
                                                         output_hidden_states=True)
                                                        
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
 
def create_vector_from_text(tokenizer, model, text, MAX_LEN = 510):
  
    input_ids = tokenizer.encode(
                        text,
                        add_special_tokens = True,
                        max_length = MAX_LEN,                          
    )   
    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
                              truncating="post", padding="post")
    # Remove the outer list.
    input_ids = results[0]
    # Create attention masks   
    attention_mask = [int(i>0) for i in input_ids]
    # Convert to tensors.
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    # Add an extra dimension for the "batch" (even though there is only one
    # input in this batch.)
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers.
    with torch.no_grad():       
        logits, encoded_layers = model(
                                    input_ids = input_ids,
                                    token_type_ids = None,
                                    attention_mask = attention_mask,
                                    return_dict=False)

    layer_i = 12 # The last BERT layer before the classifier.
    batch_i = 0 # Only one input in the batch.
    token_i = 0 # The first token, corresponding to [CLS]
      
    # Extract the vector.
    vector = encoded_layers[layer_i][batch_i][token_i]
    # Move to the CPU and convert to numpy ndarray.
    vector = vector.detach().cpu().numpy()
    return vector
 
def create_vector_index(data: pd.DataFrame):
  
   # The list of all the vectors
   vectors = []
  
   # Get overall text data
   source_data = data.abstract.values
  
   # Loop over all the comment and get the embeddings
   for text in tqdm(source_data):
      
       # Get the embedding
       vector = create_vector_from_text(tokenizer, model, text)
      
       #add it to the list
       vectors.append(vector)
  
   data["vectors"] = vectors
   data["vectors"] = data["vectors"].apply(lambda emb: np.array(emb))
   data["vectors"] = data["vectors"].apply(lambda emb: emb.reshape(1, -1))
   return data

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
landing_txt_path = "../../../data_txt/files/"
data_list = []
for filename in os.listdir(landing_txt_path):
    if filename.endswith(".txt"):
        with open(landing_txt_path + filename, "r") as file:
            text = file.read().replace("\n", " ")
            text = re.sub(r'\s+', ' ', text) # remove extra whitespaces
        data_list.append({"paper_id": filename, "abstract": text})

data_df = pd.DataFrame(data_list)
data_df.head()

Unnamed: 0,paper_id,abstract
0,TP_2_Weiss_Gonzalo.txt,TRABAJO PRÁCTICO N°2 Curso: K5052 Profesor: A...
1,TP6-Gariglio.txt,TP6 - Sistemas emergente 1) Cuáles son los 4 p...
2,UTNMKT2016-MoraLeandro-TP4.txt,Universidad Tecnológica Nacional Facultad Reg...
3,Tp2 Filannino marketing en internet (2).txt,Materia: Marketing en internet TP: N°2 - La l...
4,TP4 - Comercio Electronico - Marina Pross.txt,UNIVERSIDAD TECNOLÓGICA NACIONAL Facultad Reg...


In [21]:
vector_index = create_vector_index(data_df)
vector_index.head()

  0%|          | 0/296 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 296/296 [00:58<00:00,  5.05it/s]


Unnamed: 0,paper_id,abstract,vectors
0,TP_2_Weiss_Gonzalo.txt,TRABAJO PRÁCTICO N°2 Curso: K5052 Profesor: A...,"[[-0.3493752, 0.0323844, 0.5065808, 0.3307109,..."
1,TP6-Gariglio.txt,TP6 - Sistemas emergente 1) Cuáles son los 4 p...,"[[-0.38969195, 0.21918297, 0.3831724, 0.573663..."
2,UTNMKT2016-MoraLeandro-TP4.txt,Universidad Tecnológica Nacional Facultad Reg...,"[[-0.2442474, 0.28733182, 0.4294808, 0.4896812..."
3,Tp2 Filannino marketing en internet (2).txt,Materia: Marketing en internet TP: N°2 - La l...,"[[-0.23322998, 0.1330786, 0.5930347, 0.4986455..."
4,TP4 - Comercio Electronico - Marina Pross.txt,UNIVERSIDAD TECNOLÓGICA NACIONAL Facultad Reg...,"[[-0.13734117, 0.114498906, 0.40366158, 0.1930..."


### Implement Plagiarism Analysis

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

def process_document(text: str):
  """
  Create a vector for given text and adjust it for cosine similarity search
  """
  text_vect = create_vector_from_text(tokenizer, model, text)
  text_vect = np.array(text_vect)
  text_vect = text_vect.reshape(1, -1)
  return text_vect

def is_plagiarism(similarity_score, plagiarism_threshold):
  is_plagiarism = False
  if similarity_score >= plagiarism_threshold:
      is_plagiarism = True
  return is_plagiarism
 
def run_plagiarism_analysis(query_text, data, plagiarism_threshold=0.8):
  top_N=3

  # Preprocess the document to get the required vector for similarity analysis
  query_vect = process_document(query_text)

  # Run similarity Search
  data["similarity"] = data["vectors"].apply(lambda x: cosine_similarity(query_vect, x))
  data["similarity"] = data["similarity"].apply(lambda x: x[0][0])
  similar_articles = data.sort_values(by='similarity', ascending=False)[1:top_N+1]
  formated_result = similar_articles[["abstract", "paper_id", "similarity"]].reset_index(drop = True)
  similarity_score = formated_result.iloc[0]["similarity"]
  most_similar_article = formated_result.iloc[0]["abstract"]
  is_plagiarism_bool = is_plagiarism(similarity_score, plagiarism_threshold)
  
  plagiarism_decision = {'similarity_score': similarity_score,
                          'is_plagiarism': is_plagiarism_bool,
                          'most_similar_article': most_similar_article,
                          'article_submitted': query_text
                          }
  return plagiarism_decision

### Run Plagiarism Analysis

In [23]:
new_incoming_text_ptah = ("../../../data_txt/files/Economía de experiencia.txt")
with open(new_incoming_text_ptah, "r") as file:
    new_incoming_text = file.read().replace("\n", " ")
    new_incoming_text = re.sub(r'\s+', ' ', new_incoming_text) # remove extra whitespaces

plagiarism_decision = run_plagiarism_analysis(new_incoming_text, vector_index, plagiarism_threshold=0.8)

In [24]:
plagiarism_decision

{'similarity_score': 0.9999999,
 'is_plagiarism': True,
 'most_similar_article': ' Márketing en internet y nueva economía Economía de experiencia Profesores: Dr. Alejandro Prince Ing. Hernán Borré Ing. Maximiliano Bracho Alumno: Gallazzi, Pablo Gabriel 143.370-2 Fecha de Presentación 10/04/2017 Cuestionario: 1.- ¿Qué 3 elementos hacen resurgir con fuerza la idea de una economía de experiencia? 2.- Defina y caracterice una experiencia. Diferencias con Producto y Servicio. 3.- Explique y grafique las dimensiones y campos de la experiencia. 4.- Describa “impresiones” y sus distintas dimensiones. 5.- De 3 ejemplos distintos (reales si conoce, o invente) de experiencias con estimulación de los sentidos. 6.- Qué es la personalización masiva? Explique la progresión del valor. 7.- Cuáles son las ventajas para la empresa de la personalización masiva? 8.- Describa los 4 tipos de personalización masiva. 9.- ¿Qué aporta el ciberespacio al tema “sacrificio del cliente”? Respuestas: 1.- Los tres ele

In [25]:
another_incoming_text = "The purpose of this study is to investigate the relationship between the level of education and the level of income in the United States. The study uses data from the U.S. Census Bureau to examine the relationship between education and income. The results show that there is a positive relationship between education and income. People with higher levels of education tend to earn more money than people with lower levels of education. The study also finds that the relationship between education and income is stronger for"

plagiarism_decision = run_plagiarism_analysis(another_incoming_text, vector_index, plagiarism_threshold=0.8)

In [26]:
plagiarism_decision

{'similarity_score': 0.6034411,
 'is_plagiarism': False,
 'most_similar_article': 'MMAARRCCEELLOO BBRROOSSII 111155..220033--88 Curso K50 52 Página 1 de 5 MMaarrkkeettiinngg eenn IInntteerrnneett yy NNuueevvaa EEccoonnoommííaa -- 11eerr CCUUAATTRRIIMMEESSTTRREE 22001177 -- PPRROOFFEESSOORR:: AAlleejjaannddrroo PPrriinnccee AAYYUUDDAANNTTEESS:: HHeerrnnáánn BBoorrrréé –– MMaaxxiimmiilliiaannoo BBrraacchhoo AALLUUMMNNOO:: MMaarrcceelloo BBrroossii LLEEGGAAJJOO:: 111155..220033--88 3 de abril 2017 Trabajo Práctico Nº 2 ““LLaa LLaarrggaa CCoollaa”” MMAARRCCEELLOO BBRROOSSII 111155..220033--88 Curso K50 52 Página 2 de 5 Contenidos ¿CÓMO DEFINE ANDERSO N A “LA LARGA COLA”? ¿POR QUÉ ASEGURA QUE ES EL PRESENTE Y FUTURO DE LA ECONOMÍA MINORIST A? GRAFIQUE . ................................ ...... 3 DEFINA ECONOMÍA DE E SCASEZ. ¿CÓMO INFLUY E INTERNET EN ESTE C ONCEPTO? .... 3 RELACIONE LOS TÉRMIN OS: PRINCIPIO DE PAR ETO, ECONOMÍA DE MAS AS Y LARGA COLA . ................................ ......