In [31]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import html
from difflib import SequenceMatcher
from collections import Counter

# Download required NLTK data.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# NLTK >= 3.9 looks up for sent_tokenize/word_tokenize and pos_tag; the legacy
# names are kept so the cell also works on older NLTK releases.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource, quiet=True)

True

In [32]:
def preprocess_text(text):
    """Split ``text`` into sentences and build a normalised copy of each one.

    Normalisation lower-cases the sentence, tokenizes it, keeps only
    alphanumeric tokens that are not English stopwords, and lemmatizes each
    kept token with the lemmatizer's default part of speech.

    Note: a later cell defines ``preprocess_text`` again (with POS-aware
    lemmatization); once both cells have run, that later definition is the
    one in effect.

    Returns:
        tuple: ``(processed_sentences, sentences)`` - the normalised
        sentences (tokens joined by single spaces) and the original
        sentences, in the same order.
    """
    original_sentences = sent_tokenize(text)
    default_lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))

    def normalise(sentence):
        # Lower-case first so the stopword lookup is case-insensitive.
        kept_tokens = []
        for token in word_tokenize(sentence.lower()):
            if token.isalnum() and token not in english_stopwords:
                kept_tokens.append(default_lemmatizer.lemmatize(token))
        return ' '.join(kept_tokens)

    normalised_sentences = [normalise(sentence) for sentence in original_sentences]
    return normalised_sentences, original_sentences

In [33]:
def get_synonyms(word):
    """Return the set of lower-cased WordNet lemma names found in any synset of ``word``.

    The set is empty when WordNet returns no synsets for ``word``.
    """
    return {
        lemma.name().lower()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Returns 0 when both sets are empty (the union has no elements).
    """
    combined = set1.union(set2)
    if not combined:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

def lcs_similarity(s1, s2):
    """Return the length of the longest common contiguous block of ``s1`` and
    ``s2`` divided by the length of the longer input.

    Args:
        s1: First sequence (a string in this notebook).
        s2: Second sequence.

    Returns:
        A ratio between 0 and 1; 0 when both inputs are empty.
    """
    max_length = max(len(s1), len(s2))
    if max_length == 0:
        return 0

    # autojunk=False: with the default heuristic, SequenceMatcher ignores
    # "popular" items once the second sequence has 200+ elements. For long
    # sentences nearly every character is popular, so the longest match could
    # collapse to (almost) nothing even for near-identical inputs.
    matcher = SequenceMatcher(None, s1, s2, autojunk=False)
    longest = matcher.find_longest_match(0, len(s1), 0, len(s2))
    return longest.size / max_length


def get_wordnet_pos(word):
    """Map the POS tag NLTK assigns to ``word`` (tagged on its own) to a WordNet POS constant.

    The first letter of the tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other tag falls back to ``wordnet.NOUN``.
    """
    _, tag = nltk.pos_tag([word])[0]
    tag_initial = tag[0].upper()
    wordnet_pos_by_initial = {
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV,
    }
    return wordnet_pos_by_initial.get(tag_initial, wordnet.NOUN)

def preprocess_text(text):
    """Tokenize ``text`` into sentences and return a normalised form of each.

    For every sentence the text is lower-cased and word-tokenized; tokens that
    are not alphanumeric or that are English stopwords are dropped, and the
    remaining tokens are lemmatized using the WordNet POS returned by
    ``get_wordnet_pos`` for that token.

    Returns:
        tuple: ``(processed_sentences, sentences)`` - space-joined lemma
        strings and the untouched original sentences, index-aligned.
    """
    sentences = sent_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    processed_sentences = []
    for sentence in sentences:
        lemmas = []
        for token in word_tokenize(sentence.lower()):
            if not token.isalnum() or token in stop_words:
                continue
            lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
        processed_sentences.append(' '.join(lemmas))

    return processed_sentences, sentences


In [53]:
def compare_texts(text1, text2):
    """Compare two texts sentence by sentence and report the similar pairs.

    Every sentence of ``text1`` is scored against every sentence of ``text2``.
    The score is the mean of four measures computed on the preprocessed
    sentences: TF-IDF cosine similarity, Jaccard similarity of the word sets,
    ``lcs_similarity`` of the processed strings, and WordNet synonym overlap.

    Args:
        text1: The text whose coverage is measured.
        text2: The text it is compared against.

    Returns:
        tuple: ``(matches, similarity_percentage)``.
            ``matches`` is a list of
            ``(orig_sent1, orig_sent2, color, combined_sim)`` for each pair
            scoring at least 0.36, where ``color`` is ``'dark_green'``
            (>= 0.8), ``'medium_green'`` (>= 0.7) or ``'light_green'``.
            ``similarity_percentage`` is the share (0-100) of ``text1``'s
            sentence characters that belong to a sentence with at least one
            match; each sentence is counted once.
    """
    processed_sentences1, original_sentences1 = preprocess_text(text1)
    processed_sentences2, original_sentences2 = preprocess_text(text2)

    # Nothing to pair up (and the vectorizer cannot be fitted on no documents).
    if not processed_sentences1 or not processed_sentences2:
        return [], 0

    count1 = len(processed_sentences1)
    count2 = len(processed_sentences2)

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(processed_sentences1 + processed_sentences2)
        # Only the text1-vs-text2 block is needed, not the full square matrix.
        cosine_block = cosine_similarity(tfidf_matrix[:count1], tfidf_matrix[count1:])
    except ValueError:
        # Raised when no sentence yields a TF-IDF token (empty vocabulary);
        # fall back to a zero cosine score and keep the other three measures.
        cosine_block = [[0.0] * count2 for _ in range(count1)]

    word_sets1 = [set(sentence.split()) for sentence in processed_sentences1]
    word_sets2 = [set(sentence.split()) for sentence in processed_sentences2]

    # WordNet lookups are costly and the same words recur across sentence
    # pairs, so each word is looked up at most once.
    synonym_cache = {}

    def synonyms_of(word):
        if word not in synonym_cache:
            synonym_cache[word] = get_synonyms(word)
        return synonym_cache[word]

    matches = []
    matched_indices1 = set()  # text1 sentences that have at least one match

    for i, (proc_sent1, orig_sent1) in enumerate(zip(processed_sentences1, original_sentences1)):
        words1 = word_sets1[i]
        for j, (proc_sent2, orig_sent2) in enumerate(zip(processed_sentences2, original_sentences2)):
            words2 = word_sets2[j]

            cosine_sim = cosine_block[i][j]
            jaccard_sim = jaccard_similarity(words1, words2)
            lcs_sim = lcs_similarity(proc_sent1, proc_sent2)

            # Number of (w1, w2) pairs where w2 is a WordNet synonym of w1.
            synonym_overlap = sum(len(words2 & synonyms_of(w1)) for w1 in words1)
            longest = max(len(words1), len(words2))
            # The pair count can exceed the word count, so clamp the ratio to 1
            # to keep it on the same 0-1 scale as the other measures.
            synonym_sim = min(1.0, synonym_overlap / longest) if longest > 0 else 0

            # Combine similarity scores
            combined_sim = (cosine_sim + jaccard_sim + lcs_sim + synonym_sim) / 4

            if combined_sim >= 0.36:
                if combined_sim >= 0.8:
                    color = 'dark_green'
                elif combined_sim >= 0.7:
                    color = 'medium_green'
                else:
                    color = 'light_green'
                matches.append((orig_sent1, orig_sent2, color, combined_sim))
                matched_indices1.add(i)

    total_chars = sum(len(sent) for sent in original_sentences1)
    # Count each matched text1 sentence once, however many text2 sentences it
    # matched; otherwise the percentage can exceed 100.
    matched_chars = sum(len(original_sentences1[i]) for i in matched_indices1)
    similarity_percentage = (matched_chars / total_chars) * 100 if total_chars > 0 else 0

    return matches, similarity_percentage

In [35]:
# def compare_texts(text1, text2):
#     # Preprocess both texts
#     processed_sentences1, original_sentences1 = preprocess_text(text1)
#     processed_sentences2, original_sentences2 = preprocess_text(text2)
    
#     # Combine all sentences for vectorization
#     all_sentences = processed_sentences1 + processed_sentences2
    
#     # Create TF-IDF vectors
#     vectorizer = TfidfVectorizer()
#     tfidf_matrix = vectorizer.fit_transform(all_sentences)
    
#     # Calculate cosine similarity between sentences
#     similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
#     # Find matches and similar phrases
#     matches = []
#     total_chars = sum(len(sent) for sent in original_sentences1)
#     matched_chars = 0
    
#     for i, (proc_sent1, orig_sent1) in enumerate(zip(processed_sentences1, original_sentences1)):
#         for j, (proc_sent2, orig_sent2) in enumerate(zip(processed_sentences2, original_sentences2)):
#             similarity = similarity_matrix[i][j + len(processed_sentences1)]
#             if similarity == 1.0:
#                 matches.append((orig_sent1, orig_sent2, 'dark_green'))
#                 matched_chars += len(orig_sent1)
#             elif similarity > 0.8:
#                 matches.append((orig_sent1, orig_sent2, 'medium_green'))
#                 matched_chars += len(orig_sent1)
#             elif similarity > 0.5:
#                 matches.append((orig_sent1, orig_sent2, 'light_green'))
#                 matched_chars += len(orig_sent1)
    
#     similarity_percentage = (matched_chars / total_chars) * 100 if total_chars > 0 else 0
    
#     return matches, similarity_percentage

In [45]:
def highlight_text_html(text, matches, is_text1=True):
    """Return ``text`` as HTML with every matched sentence wrapped in a coloured span.

    The output is built in a single pass over the original text, so that
    unmatched text is HTML-escaped as well, a sentence that occurs in several
    matches is wrapped only once (using the colour of its highest-scoring
    match), and inserted markup is never scanned again for further matches.

    Args:
        text: The original, unescaped text.
        matches: Iterable of ``(sent1, sent2, color, similarity)`` tuples;
            ``color`` must be ``'dark_green'``, ``'medium_green'`` or
            ``'light_green'``.
        is_text1: Highlight the ``sent1`` entries when True, otherwise the
            ``sent2`` entries.

    Returns:
        str: Escaped HTML with highlight spans; newlines become ``<br>``.

    Raises:
        KeyError: If a match carries an unknown colour name.
    """
    color_map = {
        'dark_green': '#00B050',
        'medium_green': '#92D050',
        'light_green': '#C6E0B4'  # New light green color
    }

    # sentence -> (best score, html colour)
    best_by_sentence = {}
    for sent1, sent2, color, score in matches:
        html_color = color_map[color]
        sentence = sent1 if is_text1 else sent2
        if not sentence:
            continue  # an empty needle would match at every position
        known = best_by_sentence.get(sentence)
        if known is None or score > known[0]:
            best_by_sentence[sentence] = (score, html_color)

    # Locate every occurrence of every matched sentence in the raw text.
    spans = []
    for sentence, (_, html_color) in best_by_sentence.items():
        start = text.find(sentence)
        while start != -1:
            end = start + len(sentence)
            spans.append((start, end, html_color))
            start = text.find(sentence, end)

    # Left to right; when two spans start at the same place prefer the longer.
    spans.sort(key=lambda span: (span[0], span[0] - span[1]))

    pieces = []
    cursor = 0
    for start, end, html_color in spans:
        if start < cursor:
            continue  # overlaps a span that was already emitted
        pieces.append(html.escape(text[cursor:start]))
        pieces.append(
            f'<span style="background-color: {html_color};">{html.escape(text[start:end])}</span>'
        )
        cursor = end
    pieces.append(html.escape(text[cursor:]))

    # Replace newlines with <br> tags to maintain line spacing
    return ''.join(pieces).replace('\n', '<br>')

In [55]:
def generate_html_output(text1, text2, highlighted_text1, highlighted_text2, similarity_percentage, matches):
    """Render the comparison result as a standalone HTML page.

    Args:
        text1: Unused; kept so existing callers keep working.
        text2: Unused; kept so existing callers keep working.
        highlighted_text1: HTML fragment for the first text. It is inserted
            verbatim, so it must already be escaped/marked up.
        highlighted_text2: HTML fragment for the second text (inserted verbatim).
        similarity_percentage: Number shown with two decimals in the header.
        matches: Iterable of ``(sent1, sent2, color, similarity)`` tuples
            listed under "Detailed Matches".

    Returns:
        str: The complete HTML document.
    """
    # The matched sentences are raw text, so escape them before embedding them
    # in markup; otherwise characters such as < & " break or inject into the page.
    match_items = ''.join(
        f'<li>Similarity: {sim:.2f} - Text 1: "{html.escape(s1)}" | Text 2: "{html.escape(s2)}"</li>'
        for s1, s2, _, sim in matches
    )

    # The "Low Similarity" lower bound in the legend mirrors the 0.36 cut-off
    # that compare_texts applies when it records a match.
    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Plagiarism Detection Result</title>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }}
            h1 {{ color: #333; }}
            .text-container {{ display: flex; justify-content: space-between; }}
            .text-box {{ width: 48%; border: 1px solid #ccc; padding: 10px; margin-bottom: 20px; }}
            h2 {{ color: #444; }}
            .similarity {{ font-size: 1.2em; font-weight: bold; margin-bottom: 20px; }}
            .legend {{ margin-bottom: 20px; }}
            .legend-item {{ display: inline-block; margin-right: 20px; }}
            .color-box {{ display: inline-block; width: 20px; height: 20px; margin-right: 5px; vertical-align: middle; }}
        </style>
    </head>
    <body>
        <h1>Content Similarity Checker</h1>
        <div class="similarity">Similarity Percentage: {similarity_percentage:.2f}%</div>
        <div class="legend">
            <div class="legend-item">
                <span class="color-box" style="background-color: #00B050;"></span>
                High Similarity (>=0.8)
            </div>
            <div class="legend-item">
                <span class="color-box" style="background-color: #92D050;"></span>
                Medium Similarity (0.7-0.79)
            </div>
            <div class="legend-item">
                <span class="color-box" style="background-color: #C6E0B4;"></span>
                Low Similarity (0.36-0.69)
            </div>
        </div>
        <div class="text-container">
            <div class="text-box">
                <h2>Text 1</h2>
                <p>{highlighted_text1}</p>
            </div>
           <div class="text-box">
                <h2>Text 2</h2>
                <p>{highlighted_text2}</p>
            </div>
        </div>
        <h2>Detailed Matches</h2>
        <ul>
            {match_items}
        </ul>
    </body>
    </html>
    """
    return html_content

In [54]:
# Example usage
text1 = """While Canva‚Äôs graphic design platform is blinkin‚Äô amazing, I found myself in a pool of confusion when trying to add an audio clip in my presentation. Why should such a little thing send me on a rollercoaster of emotions? Let me explain! 2/

You know how we always groan when our electronicsasters start playing up, right? ‚ÄòWhy this now? Or why can‚Äôt I find that little button?‚Äô Like that time when I desperately needed audio in my Spanish presentation for my narration. It began driving me up the wall. 3/

After panicking, I remembered seeing a small ‚Äòhelp‚Äô button in the corner of the Canva interface. "Help and documentation, they call it", AKA the lifesaver of my presentation. Isn‚Äôt it amazing, you got someone‚Äôs got your back when you‚Äôre lost in a sea of buttons and features? 4/

So, I clicked on it. It quickly redirected me to a page full of written guidelines, FAQs, and steps on how to navigate their features. Isn‚Äôt it brilliant that you don‚Äôt need to decode stuff on your own when you‚Äôre already freaking out about your racist sounding Spanish accent? 5/

I started scrolling down the help page and I loved how neatly it was arranged. A bunch of categories perfectly pointing out where to go for help based on your particular problem. But that obviously wasn‚Äôt enough to save my drowning presentation and time-sensitive predicament! 6/

I was still unable to find how to add audio, and then it hit me! The search bar, why didn‚Äôt I think of it sooner? I quickly typed in ‚Äòaudio‚Äô and pressed enter. A plethora of relevant articles popped up. Is there a better feeling than your sinking ship finally seeing land? 7/

I was greeted with a stepwise guide about how and where to add the audio. The way everything was laid out made it super easy to follow. It was the perfect antidote to my rising panic. Looking at the clear instructions, I felt like I was given a secret treasure map. 8/

I quickly went back to the interface and followed the instructions. The sense of triumph and relief when I finally saw the audio icon on the panel! It was like finding water in a desert. I excitedly uploaded my audio clip and gave it a play. A wave of euphoria hit me! 9/

I finally successfully adding my audio to the slides, and by following the instructions in the documentation, it was simpler than I initially thought! Who could‚Äôve thought a text box could hold such immense power of peace? Now, I knew the solution to my puzzle. 10/

So why did I just take you through my trial of the Canva audio feature? It‚Äôs simple! My story is a prime example of why help and documentation is such an important usability heuristic. It's a simple add-on with a powerful function: to assist users in their time of need. 11/"""

text2 = """I found myself super confused when trying to add an audio clip in my oral presentation.Why can‚Äôt I find that little button? I desperately NEEDED audio in my Spanish presentation for my narration. But it turns out most applications have solutions for when users get stuck. Let me explain! 2/

I couldn't find the option for adding in my own audio anywhere! I couldn't find the option for adding in my own audio anywhere! After panicking, I remembered seeing a small ‚Äòhelp‚Äô button in the corner of the Canva interface. 3/

So, I clicked on it. It quickly redirected me to a page full of written guidelines, FAQs, and steps on how to navigate their features. 4/

I started looking around help page and I loved how neatly it was arranged. The first thing I saw was a big search bar. Underneath it, was a couple of categories pointing out where to go for help based on your particular problem. I just quickly typed in ‚Äòaudio‚Äô and pressed enter. 5/

A bunch of relevant topics popped up. I chose the first option ‚Äìadd voiceover‚Äì and was greeted with a guide about how and where to add the audio. It turned out the feature was super hidden in the uploads tab. ü§¶üèª‚Äç‚ôÄÔ∏è 6/

The way everything was listed out in short numbered steps made it super easy to follow. I went right back to the interface and followed the instructions. I excitedly uploaded my audio clip and gave it a play. YAY now the presentation included my somewhat okay Spanish accent! 7/

I successfully added my audio to the slides, and by following the instructions in the documentation, it was simpler than I initially thought! Who could‚Äôve thought a search box could hold such immense power of peace? 8/
This whole process is the core of ‚ÄúHelp and Documentation‚Äù, an important feature that designers add in to ensure users have support. Everything is designed to be easy to use ‚Äìbut if there is an issue that comes up, help is where to go. 9/

Once a user resorts to a help feature, designers make sure it's easy to follow. It‚Äôs simple! They make directions short, broken down into steps, and clear. Just like the short list of clicks I was given to follow. 10/"""

# Score the two sample texts and collect the sentence pairs judged similar.
matches, similarity_percentage = compare_texts(text1, text2)

# Build one highlighted copy per text, then assemble the full report page.
highlighted_text1 = highlight_text_html(text1, matches, is_text1=True)
highlighted_text2 = highlight_text_html(text2, matches, is_text1=False)
html_output = generate_html_output(
    text1,
    text2,
    highlighted_text1,
    highlighted_text2,
    similarity_percentage,
    matches,
)

# Save the HTML output to a file
with open('plagiarism_detection_result.html', mode='w', encoding='utf-8') as report_file:
    report_file.write(html_output)

print(f"Similarity Percentage: {similarity_percentage:.2f}%")
print("HTML output has been saved to 'plagiarism_detection_result.html'")


Similarity Percentage: 46.60%
HTML output has been saved to 'plagiarism_detection_result.html'
