In [1]:
import pandas as pd
from tqdm import tqdm

In [43]:
import os


def preprocess_data(data_path, sample_size):
    # Read the data from all text files in the directory
    data = []
    for file_name in os.listdir(data_path):
        if file_name.endswith('.txt'):
            with open(os.path.join(data_path, file_name), 'r', encoding='utf-8') as f:
                text = f.read()
                data.append({'abstract': text})

    # Convert list of dictionaries to pandas DataFrame
    data = pd.DataFrame(data)

    # Drop duplicates and empty abstracts
    data = data.drop_duplicates().dropna(subset=['abstract']).reset_index(drop=True)

    # Get "sample_size" random articles
    data = data.sample(sample_size).reset_index(drop=True)

    return data


data_path = "plagiarism_database"
source_data = preprocess_data(data_path, 10)

In [3]:
import torch
# from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences
from transformers import BertTokenizer,  AutoModelForSequenceClassification

# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
model_path = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_path, 
                                          do_lower_case=True)

model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                          output_attentions=False,
                                                          output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
import torch

def create_vector_from_text(tokenizer, model, text, MAX_LEN = 510):

    input_ids = tokenizer.encode(text, add_special_tokens = True, max_length = MAX_LEN,)    
    input_ids = torch.tensor([input_ids])
    input_ids = input_ids.squeeze(0)
    
    if input_ids.shape[0] > MAX_LEN:
        input_ids = input_ids[:MAX_LEN]
    
    pad_len = MAX_LEN - input_ids.shape[0]
    
    if pad_len > 0:
        pad = torch.zeros(pad_len, dtype=torch.long)
        input_ids = torch.cat((input_ids, pad), dim=0)
    
    with torch.no_grad():
        last_hidden_states = model(input_ids.unsqueeze(0))[0]
    
    return last_hidden_states.squeeze().numpy()


In [44]:

import numpy as np

def create_vector_database(data):

        # The list of all the vectors
    vectors = []

        # Get overall text data
    source_data = data.abstract.values

        # Loop over all the comment and get the embeddings
    for text in tqdm(source_data):

            # Get the embedding 
        vector = create_vector_from_text(tokenizer, model, text)

            #add it to the list
        vectors.append(vector)

    data["vectors"] = vectors
    data["vectors"] = data["vectors"].apply(lambda emb: np.array(emb))
    data["vectors"] = data["vectors"].apply(lambda emb: emb.reshape(1, -1))

    return data

In [45]:
vector_database = create_vector_database(source_data)

100%|██████████████████████| 10/10 [00:29<00:00,  2.93s/it]


In [57]:
from transformers import MarianMTModel, MarianTokenizer
def process_document(text):
    """
    Create a vector for given text and adjust it for cosine similarity search
    """
    text_vect = create_vector_from_text(tokenizer, model, text)
    text_vect = np.array(text_vect)
    text_vect = text_vect.reshape(1, -1)

    return text_vect

    
def is_plagiarism(similarity_score, plagiarism_threshold):

    is_plagiarism = False

    if(similarity_score >= plagiarism_threshold):
        is_plagiarism = True

    return is_plagiarism

def run_plagiarism_analysis(query_text, data, plagiarism_threshold):

    top_N = 5

    # Check the language of the query/incoming text and translate if required. 
    document_translation = query_text

    # Preprocess the document to get the required vector for similarity analysis
    query_vect = process_document(document_translation)

    # Run similarity search
    data["similarity"] = data["vectors"].apply(lambda x: cosine_similarity(query_vect, x))
    data["similarity"] = data["similarity"].apply(lambda x: x[0][0])

    similar_articles = data.sort_values(by='similarity', ascending=False).head(top_N+1)
    formated_result = similar_articles[["abstract", "similarity"]].reset_index(drop=True)

    plagiarism_decisions = []
    for i in range(top_N):
        similarity_score = formated_result.iloc[i]["similarity"]
        most_similar_article = formated_result.iloc[i]["abstract"]
        is_plagiarism_bool = is_plagiarism(similarity_score, plagiarism_threshold)

        plagiarism_decision = {
            'similarity_score': similarity_score,
            'is_plagiarism': is_plagiarism_bool,
            'most_similar_article': most_similar_article,
        }

        plagiarism_decisions.append(plagiarism_decision)
        plagiarism_decisions.append('\n')


    return plagiarism_decisions



In [11]:
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [66]:
import tkinter as tk
from tkinter import filedialog
import difflib
from tkinter import scrolledtext

root = tk.Tk()

file_path = filedialog.askopenfilename(initialdir="./", title="Select File",
                                       filetypes=(("Text Files", "*.txt"), ("All Files", "*.*")))

with open(file_path, 'r', encoding='utf-8') as f:
    input_text = f.read()

plagiarism_threshold = 0.5

analysis_results = run_plagiarism_analysis(input_text, vector_database, plagiarism_threshold)

import difflib

import difflib

def phrase_similarity(phrase1, phrase2):
    """
    Compute the similarity between two phrases.
    """
    seq = difflib.SequenceMatcher(a=phrase1.lower(), b=phrase2.lower())
    return seq.ratio()

def find_similar_phrases(line1, line2):
    """
    Find similar phrases between two lines.
    """
    phrases1 = line1.split()
    phrases2 = line2.split()

    similar_phrases = []

    for phrase1 in phrases1:
        for phrase2 in phrases2:
            similarity = phrase_similarity(phrase1, phrase2)
            if similarity > 0.8:
                similar_phrases.append((phrase1, phrase2))

    return similar_phrases



def highlight_similar_lines(similar_text):
    global input_text

    # Split the incoming text and input text into lines
    incoming_lines = similar_text.splitlines(keepends=True)
    input_lines = input_text.splitlines(keepends=True)

    # Create a scrolled text widget for the incoming text
    incoming_scroll = tk.scrolledtext.ScrolledText(root, width=40, height=20)
    incoming_scroll.insert(tk.END, similar_text)
    incoming_scroll.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

    # Create a scrolled text widget for the input text
    input_scroll = tk.scrolledtext.ScrolledText(root, width=40, height=20)
    input_scroll.insert(tk.END, input_text)
    input_scroll.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True)

    # Highlight the similar lines in the incoming text
    for i, incoming_line in enumerate(incoming_lines):
        # Find the most similar line in the input text
        most_similar_line = None
        highest_similarity = -1
        for input_line in input_lines:
            similarity = line_similarity(incoming_line, input_line)
            if similarity > highest_similarity:
                highest_similarity = similarity
                most_similar_line = input_line

        # Find the similar phrases in both lines
        incoming_phrases = find_similar_phrases(incoming_line, most_similar_line)
        input_phrases = find_similar_phrases(most_similar_line, incoming_line)

        # Highlight the similar phrases in both lines
        for phrase in incoming_phrases:
            start = incoming_line.find(phrase)
            end = start + len(phrase)
            incoming_scroll.tag_add(str(i), f"{i}.{start}", f"{i}.{end}")
            incoming_scroll.tag_config(str(i), background="yellow")

        for phrase in input_phrases:
            start = most_similar_line.find(phrase)
            end = start + len(phrase)
            input_scroll.tag_add(str(i), f"{i}.{start}", f"{i}.{end}")
            input_scroll.tag_config(str(i), background="yellow")



# function to create a button for each plagiarism decision
def create_button(decision, index):
    similarity_score = decision['similarity_score']
    is_plagiarism_bool = decision['is_plagiarism']

    button = tk.Button(root, text="{} - Similarity Score: {:.2f}%".format(index, similarity_score * 100),
                       command=lambda: highlight_similar_lines(decision['most_similar_article']),
                       bg='red' if is_plagiarism_bool else 'green')
    button.pack()


# create a button for each plagiarism decision
for i, result in enumerate(analysis_results):
    if isinstance(result, dict):
        create_button(result, i + 1)

root.mainloop()


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\gvsvv\AppData\Local\Programs\Python\Python310\lib\tkinter\__init__.py", line 1921, in __call__
    return self.func(*args)
  File "C:\Users\gvsvv\AppData\Local\Temp\ipykernel_16772\3754327508.py", line 101, in <lambda>
    command=lambda: highlight_similar_lines(decision['most_similar_article']),
  File "C:\Users\gvsvv\AppData\Local\Temp\ipykernel_16772\3754327508.py", line 82, in highlight_similar_lines
    start = incoming_line.find(phrase)
TypeError: must be str, not tuple
