# Import Modules

In [28]:
import pickle
import string

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

# Load Dataset

In [29]:
# Read the labelled text pairs from the CSV next to the notebook.
# The preview below shows two leftover index columns ("Unnamed: 0.1" and
# "Unnamed: 0") alongside source_text, plagiarized_text and label.
data = pd.read_csv("dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [30]:
# Class balance check: count how many rows carry each label value.
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,187
1,183


# Data Preprocessing

In [31]:
def preprocess_text(text_input):
    """Normalise a piece of text before TF-IDF vectorisation.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, tokenize, drop English stop words, lemmatize each
    remaining token, and join the tokens back together with single spaces.

    Parameters
    ----------
    text_input : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmatized tokens separated by single spaces (an empty
        string when no token survives).
    """
    # Convert to lowercase
    lower_case_text = text_input.lower()

    # Remove punctuation
    cleaned_text = lower_case_text.translate(str.maketrans("", "", string.punctuation))

    # Build the stop-word set once per call so every membership test below is a
    # set lookup, instead of calling stopwords.words('english') and scanning the
    # resulting list again for each token.
    stop_words = set(stopwords.words("english"))

    # Tokenize, then filter out stop words
    tokenized_words = word_tokenize(cleaned_text, language="english")
    final_words = [word for word in tokenized_words if word not in stop_words]

    # Lemmatization: a single lemmatizer instance is reused for all tokens
    lemmatizer = WordNetLemmatizer()
    lemma_words = " ".join(lemmatizer.lemmatize(word) for word in final_words)

    return lemma_words


# text = "This is my %#$$}? text to use for dummy test."
# response = preprocess_text(text)
# print(response)



In [32]:
# Clean both text columns in place with the same preprocessing function,
# then display the resulting frame.
for text_column in ("source_text", "plagiarized_text"):
    data[text_column] = data[text_column].apply(preprocess_text)

data

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,researcher discovered new specie butterfly ama...,scientist found previously unknown butterfly s...,1
1,1,moon orbit earth approximately 273 day,natural satellite take around 273 day complete...,1
2,2,water composed two hydrogen atom one oxygen atom,h2o consists 2 hydrogen atom 1 oxygen atom,1
3,3,history rome date back 753 bc,rome long history traced back 753 bc,1
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet sun planeta...,1
...,...,...,...,...
365,397,playing musical instrument enhances creativity,creativity enhanced playing musical instrument,0
366,398,studying history help understanding present,understanding present aided studying history,0
367,399,listening classical music improve focus,focus improved listening classical music,0
368,400,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0


# Vectorizer

In [44]:
# Each document handed to the vectorizer is the cleaned source text followed by
# the cleaned candidate text, separated by one space.
paired_text = data["source_text"] + " " + data["plagiarized_text"]

# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so its vocabulary and IDF weights also see the test rows.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(paired_text)


In [45]:
# Target vector: the label column of the dataset.
Y = data["label"]

# Train Test Split

In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Model Selection & Train

### Logistic Regression model

In [57]:
# Logistic regression with scikit-learn's default settings.
# LogisticRegression is already imported in the notebook's import cell, so the
# duplicate import that used to sit here has been dropped.
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = lr_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(" ")

# Generate classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report: \n")
print(classification_rep)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n",cm)

Accuracy:  0.8513513513513513
 
Classification Report: 

              precision    recall  f1-score   support

           0       0.82      0.89      0.85        35
           1       0.89      0.82      0.85        39

    accuracy                           0.85        74
   macro avg       0.85      0.85      0.85        74
weighted avg       0.85      0.85      0.85        74

Confusion Matrix: 
 [[31  4]
 [ 7 32]]


### Random Forest model

In [58]:
# Random forest with 100 trees; the seed fixes the randomness of tree building.
# RandomForestClassifier is imported in the notebook's import cell.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(x_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = rf_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(" ")

# Generate classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report: \n")
print(classification_rep)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n",cm)


Accuracy:  0.8243243243243243
 
Classification Report: 

              precision    recall  f1-score   support

           0       0.74      0.97      0.84        35
           1       0.96      0.69      0.81        39

    accuracy                           0.82        74
   macro avg       0.85      0.83      0.82        74
weighted avg       0.86      0.82      0.82        74

Confusion Matrix: 
 [[34  1]
 [12 27]]


### Naive Bayes model

In [61]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
# MultinomialNB is imported in the notebook's import cell.
nb_model = MultinomialNB()

nb_model.fit(x_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = nb_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(" ")

# Generate classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report: \n")
print(classification_rep)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n",cm)

Accuracy: 0.8918918918918919
 
Classification Report: 

              precision    recall  f1-score   support

           0       0.91      0.86      0.88        35
           1       0.88      0.92      0.90        39

    accuracy                           0.89        74
   macro avg       0.89      0.89      0.89        74
weighted avg       0.89      0.89      0.89        74

Confusion Matrix: 
 [[30  5]
 [ 3 36]]


### SVM model

In [62]:
# Support vector classifier with a linear kernel.
# SVC is imported in the notebook's import cell.
svm_model = SVC(kernel='linear', random_state=42)

svm_model.fit(x_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = svm_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(" ")

# Generate classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report: \n")
print(classification_rep)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n",cm)


Accuracy: 0.8783783783783784
 
Classification Report: 

              precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion Matrix: 
 [[31  4]
 [ 5 34]]


# Save Naive Bayes model and Vectorizer

In [64]:
# Persist the fitted Naive Bayes model and the fitted TF-IDF vectorizer.
# The context managers close each file as soon as its dump finishes, instead
# of leaving the open handle for the garbage collector to clean up.
# pickle is imported in the notebook's import cell.
with open("nb_model.pkl", "wb") as model_file:
    pickle.dump(nb_model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Load Model and Vectorizer

In [65]:
# Reload the persisted model and vectorizer, closing each file after reading.
# NOTE: pickle.load can run arbitrary code from the file it reads; only load
# pickle files that come from a trusted source.
with open("nb_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# Detection System

In [73]:
def plagiarism_detector(input_text):
    """Classify one piece of text with the loaded model and vectorizer.

    The text is first cleaned with ``preprocess_text`` so that it receives the
    same lowercasing, punctuation removal, stop-word filtering and
    lemmatization as the columns the vectorizer was fitted on. Without that
    step, unlemmatized tokens would not match the fitted vocabulary and would
    be ignored by ``transform``.

    Parameters
    ----------
    input_text : str
        Raw text to check.

    Returns
    -------
    str
        "Plagiarim Detected" when the model predicts label 1, otherwise
        "No Plagiarism".
    """
    cleaned_text = preprocess_text(input_text)
    vectorized_text = tfidf_vectorizer.transform([cleaned_text])
    result = model.predict(vectorized_text)
    # NOTE(review): the "Plagiarim" spelling is kept byte-for-byte because
    # callers may compare against the returned string.
    return "Plagiarim Detected" if result[0] == 1 else "No Plagiarism"

In [76]:
# Example (this text is plagiarized)
input_text = 'Researchers have discovered a new species of butterfly in the Amazon rainforest.'
plagiarism_detector(input_text)

'Plagiarim Detected'

In [75]:
# Example (this text has no plagiarism)
input_text = 'Practicing yoga enhances physical flexibility.'
plagiarism_detector(input_text)

'No Plagiarism'

In [77]:
# Record the scikit-learn version this notebook was run with.
# NOTE(review): the pickled model and vectorizer were written under this
# version; presumably they should be reloaded with the same one - confirm
# against scikit-learn's model persistence guidance.
import sklearn
sklearn.__version__

'1.3.2'