Title: Plagiarism Detection System using NLP

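Objective:
To detect similarity between documents by preprocessing the text (lowercasing,
removing punctuation and stopwords), building a Bag-of-Words representation,
and computing pairwise cosine similarity. Document pairs scoring above 0.7 are
flagged as possible plagiarism.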

Tools Used:
Python,
NLTK,
Scikit-learn,
Jupyter Notebook.


In [1]:
!pip install scikit-learn nltk




In [2]:
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
from nltk.corpus import stopwords

# Fetch the stopword corpus into the local NLTK data directory.
nltk.download("stopwords")

# English stopwords, filtered out of every document during preprocessing.
stop_words = stopwords.words("english")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Sample corpus: the first two sentences are near-duplicates, the third is on
# an unrelated topic.
documents = [
    "Artificial Intelligence helps humans solve problems easily.",
    "Artificial Intelligence helps people solve problems quickly.",
    "Football is a popular sport played worldwide.",
]

# Individual handles for each sample document.
doc1, doc2, doc3 = documents


In [5]:
# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on every preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text, ignore_words=None):
    """Normalize a document for Bag-of-Words comparison.

    Steps, in order: lowercase the text, delete ASCII punctuation
    (``string.punctuation``), split on whitespace, drop ignored words, and
    re-join the remaining words with single spaces.

    Args:
        text: The raw document string.
        ignore_words: Optional iterable of lowercase words to drop. When
            omitted, the notebook-level ``stop_words`` collection is used,
            which matches the original behavior.

    Returns:
        The cleaned document as a single space-separated string. Returns an
        empty string if nothing is left after filtering.

    Note:
        Punctuation is deleted *before* the stop-word check, so any entry in
        the ignore collection that itself contains punctuation can never
        match a word.
    """
    if ignore_words is None:
        ignore_words = stop_words

    # Hash-based lookup: each membership test is O(1) instead of a list scan.
    ignored = frozenset(ignore_words)

    normalized = text.lower().translate(_PUNCTUATION_TABLE)

    return " ".join(word for word in normalized.split() if word not in ignored)


In [6]:
# Run every raw document through preprocess(), keeping the original order.
cleaned_docs = [preprocess(raw_text) for raw_text in documents]

cleaned_docs


['artificial intelligence helps humans solve problems easily',
 'artificial intelligence helps people solve problems quickly',
 'football popular sport played worldwide']

In [7]:
# Fit the vocabulary on the cleaned documents and encode each one as a row of
# term counts (one column per vocabulary word).
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(cleaned_docs)

# Show the learned vocabulary, then the dense count matrix.
print("Vocabulary:", vectorizer.get_feature_names_out(), sep="\n")
print("\nBoW Matrix:", bow_matrix.toarray(), sep="\n")


Vocabulary:
['artificial' 'easily' 'football' 'helps' 'humans' 'intelligence' 'people'
 'played' 'popular' 'problems' 'quickly' 'solve' 'sport' 'worldwide']

BoW Matrix:
[[1 1 0 1 1 1 0 0 0 1 0 1 0 0]
 [1 0 0 1 0 1 1 0 0 1 1 1 0 0]
 [0 0 1 0 0 0 0 1 1 0 0 0 1 1]]


In [8]:
# Pairwise cosine similarity between the rows (documents) of the BoW matrix.
# Entry [i][j] is the score for document i versus document j.
similarity_matrix = cosine_similarity(bow_matrix)

print("Similarity Matrix:", similarity_matrix, sep="\n")


Similarity Matrix:
[[1.         0.71428571 0.        ]
 [0.71428571 1.         0.        ]
 [0.         0.         1.        ]]


In [9]:
# Cosine-similarity cutoff: a pair scoring strictly above this value is
# reported as possible plagiarism.
PLAGIARISM_THRESHOLD = 0.7

# Take the document count from the matrix being indexed, so the loop bounds
# always match what was actually scored.
n_docs = len(similarity_matrix)

# Visit each unordered pair once (j > i). This skips the diagonal
# (self-comparison) and the mirrored lower triangle.
for i in range(n_docs):
    for j in range(i + 1, n_docs):

        score = similarity_matrix[i][j]

        print(f"Similarity between Document {i+1} and Document {j+1}: {score:.2f}")

        if score > PLAGIARISM_THRESHOLD:
            print("Possible plagiarism detected")
        else:
            print("No plagiarism")

        print()


Similarity between Document 1 and Document 2: 0.71
Possible plagiarism detected

Similarity between Document 1 and Document 3: 0.00
No plagiarism

Similarity between Document 2 and Document 3: 0.00
No plagiarism

