### 1. Import Libraries

In [2]:
import os
import csv
import math
import string
from collections import Counter
import re
import warnings
warnings.filterwarnings("ignore")

# datascience libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

# sklearn library
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

### 2. Import Dataset

In [3]:
# Importing Para level Dataset
data = load_files(r"C:\\Users\\hasee\\Desktop\\baby projects\\0 Dataset\\Para_level", encoding='utf-8')

# Convert the list of Text to a pandas DataFrame
df_urdu_text = pd.DataFrame({"Urdu Text": data.data})

# Convert the list of labels to a pandas DataFrame
df_label=pd.DataFrame({"Label": data.target})
df_label = df_label.replace([1], 'Plagiarized')
df_label = df_label.replace([0], 'Non-Plagiarized')

# Convert the list of file names to a pandas DataFrame
file_names = pd.DataFrame({"File Names": data.filenames})

df_para = pd.concat([file_names, df_urdu_text, df_label], axis=1)
df_para

Unnamed: 0,File Names,Urdu Text,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی قدر شناسی، کامیابی کا راز\r\nکائنات کے ...,Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,جنوبی ایشیا ئی کنفیڈریشن کی سازش، پردہ اُٹھتا ...,Non-Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,چھ ہزار چار سو اسی گھنٹے\r\nمیں نے تنہائی کو ا...,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی کو عادت بنانا کیسے ممکن؟\r\nوقت ...,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ورزش کے پانچ حیران کن فوائد\r\nایسی خبریں آتی...,Plagiarized
...,...,...,...
5431,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی\r\nاللہ تعالیٰ نے اپنی قدرتِ کام...,Plagiarized
5432,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اسلام میں تعلیم نسوان پر تاکید\r\nتعلیم نسواں‘...,Plagiarized
5433,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,پھر یہ ہنگامہ اے خدا کیا ہے؟\r\nگستاخانہ فلم ک...,Non-Plagiarized
5434,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ماؤنٹ بلینک \r\nہم رات کے دس بجے شامونی پہنچے‘...,Non-Plagiarized


In [4]:
# Rearrange Para level Data in Balanced Way

plagiarized_df_para = df_para[df_para['Label'] == 'Plagiarized'].reset_index(drop=True)
non_plagiarized_df_para = df_para[df_para['Label'] == 'Non-Plagiarized'].reset_index(drop=True)

dataset_para = pd.concat([plagiarized_df_para, non_plagiarized_df_para], keys=['Plagiarized', 'Non-plagiarized']).sort_index(level=1).reset_index(drop=True)
dataset_para

Unnamed: 0,File Names,Urdu Text,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,جنوبی ایشیا ئی کنفیڈریشن کی سازش، پردہ اُٹھتا ...,Non-Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی قدر شناسی، کامیابی کا راز\r\nکائنات کے ...,Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,چھ ہزار چار سو اسی گھنٹے\r\nمیں نے تنہائی کو ا...,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی کو عادت بنانا کیسے ممکن؟\r\nوقت ...,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,﻿شاہ محمود قریشی ، بابا ٹلّ اور مَیں!\r\n\t\t\...,Non-Plagiarized
...,...,...,...
5431,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی\r\nاللہ تعالیٰ نے اپنی قدرتِ کام...,Plagiarized
5432,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,پھر یہ ہنگامہ اے خدا کیا ہے؟\r\nگستاخانہ فلم ک...,Non-Plagiarized
5433,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اسلام میں تعلیم نسوان پر تاکید\r\nتعلیم نسواں‘...,Plagiarized
5434,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ماؤنٹ بلینک \r\nہم رات کے دس بجے شامونی پہنچے‘...,Non-Plagiarized


In [5]:
# Importing Sentence Level Dataset
data = load_files(r"C:\\Users\\hasee\\Desktop\\baby projects\\0 Dataset\\Sentence_level", encoding='utf-8')

# Convert the list of Text to a pandas DataFrame
df_urdu_text = pd.DataFrame({"Urdu Text": data.data})

# Convert the list of labels to a pandas DataFrame
df_label=pd.DataFrame({"Label": data.target})
df_label = df_label.replace([1], 'Plagiarized')
df_label = df_label.replace([0], 'Non-Plagiarized')

# Convert the list of file names to a pandas DataFrame
file_names = pd.DataFrame({"File Names": data.filenames})

df_sen = pd.concat([file_names, df_urdu_text, df_label], axis=1)
df_sen

Unnamed: 0,File Names,Urdu Text,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی قدر شناسی، کامیابی کا راز\r\nکائنات کے ...,Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اب دھرنوں سے آگے بڑھا جائے\r\nآپ سے پوچھا جا...,Non-Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,القدس میں گزارے صدیوں پر محیط لمحے\r\nمقبوضہ ی...,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی کو عادت بنانا کیسے ممکن؟\r\nوقت ...,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ورزش کے پانچ حیران کن فوائد\r\nایسی خبریں آتی...,Plagiarized
...,...,...,...
5431,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی\r\nاللہ تعالیٰ نے اپنی قدرتِ کام...,Plagiarized
5432,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اسلام میں تعلیم نسوان پر تاکید\r\nتعلیم نسواں،...,Plagiarized
5433,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,کارپوریٹ رمضان اور نیا اسلام\r\nکرسمس امریکہ ا...,Non-Plagiarized
5434,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ایک صحافی اور ایک جنرل\r\n \r\nایک مرحوم امریک...,Non-Plagiarized


In [6]:
# Rearrange sentence level Data in Balanced Way

plagiarized_df_sen = df_sen[df_sen['Label'] == 'Plagiarized'].reset_index(drop=True)
non_plagiarized_df_sen = df_sen[df_sen['Label'] == 'Non-Plagiarized'].reset_index(drop=True)

dataset_sen = pd.concat([plagiarized_df_sen, non_plagiarized_df_sen], keys=['Plagiarized', 'Non-plagiarized']).sort_index(level=1).reset_index(drop=True)
dataset_sen

Unnamed: 0,File Names,Urdu Text,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اب دھرنوں سے آگے بڑھا جائے\r\nآپ سے پوچھا جا...,Non-Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی قدر شناسی، کامیابی کا راز\r\nکائنات کے ...,Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,القدس میں گزارے صدیوں پر محیط لمحے\r\nمقبوضہ ی...,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی کو عادت بنانا کیسے ممکن؟\r\nوقت ...,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ایکیو پریشر\r\nکچھ عرصہ قبل کی بات ہے کہ مجھے ...,Non-Plagiarized
...,...,...,...
5431,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی\r\nاللہ تعالیٰ نے اپنی قدرتِ کام...,Plagiarized
5432,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,کارپوریٹ رمضان اور نیا اسلام\r\nکرسمس امریکہ ا...,Non-Plagiarized
5433,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اسلام میں تعلیم نسوان پر تاکید\r\nتعلیم نسواں،...,Plagiarized
5434,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ایک صحافی اور ایک جنرل\r\n \r\nایک مرحوم امریک...,Non-Plagiarized


In [7]:
# combine Sentence and Para Level Dataset

dataset_complete = pd.concat([dataset_sen, dataset_para], axis=0).reset_index(drop=True)
dataset_complete

Unnamed: 0,File Names,Urdu Text,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اب دھرنوں سے آگے بڑھا جائے\r\nآپ سے پوچھا جا...,Non-Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی قدر شناسی، کامیابی کا راز\r\nکائنات کے ...,Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,القدس میں گزارے صدیوں پر محیط لمحے\r\nمقبوضہ ی...,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی کو عادت بنانا کیسے ممکن؟\r\nوقت ...,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ایکیو پریشر\r\nکچھ عرصہ قبل کی بات ہے کہ مجھے ...,Non-Plagiarized
...,...,...,...
10867,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,وقت کی پابندی\r\nاللہ تعالیٰ نے اپنی قدرتِ کام...,Plagiarized
10868,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,پھر یہ ہنگامہ اے خدا کیا ہے؟\r\nگستاخانہ فلم ک...,Non-Plagiarized
10869,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,اسلام میں تعلیم نسوان پر تاکید\r\nتعلیم نسواں‘...,Plagiarized
10870,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,ماؤنٹ بلینک \r\nہم رات کے دس بجے شامونی پہنچے‘...,Non-Plagiarized


### 3. Extract All Stylometry Features

In [8]:
# Funtion to Extract All Stylometry Features

def calculate_text_features(text):
    
    # 1. Word-Level stylometry features (10)
    # 2. Character-Level stylometry features (16)
    # 3. Sentence-Level stylometry features (7)
    # 4. Para-Level stylometry features (8)
    # 5. Document-Level stylometry features (1)
    # 6. Textual Entropy stylometry features (1)
    
    #******** 1. Word-Level stylometry features (10) ********
    
    ##1 total number of words
    words = text.split()
    total_words = len(words)
 
    ##2 total number of lines
    lines = text.split('\n')
    total_lines = len(lines)

    ##3 total number of empty lines
    empty_line_count = sum(1 for line in lines if not line.strip())

    ##4 average words length
    total_word_length = sum(len(word) for word in words)
    average_word_length = total_word_length / len(words) if len(words) > 0 else 0
    
    ##5 ratio of words with length 3
    len_3=len([word for word in words if len(word) == 3])
    ratio_of_3 = round(len_3/total_words if total_words > 0 else 0,2)        

    ##6 ratio of words with length 4
    len_4=len([word for word in words if len(word) == 4])
    ratio_of_4 = round(len_4/total_words if total_words > 0 else 0,2)   
    
    ##7 ratio of Long words(>=8)
    long_words = len([word for word in words if len(word) >= 8])
    ratio_of_long_words = round(long_words/total_words if total_words > 0 else 0,2)

    ##8 total number of unique words
    unique_words = set(words)
    total_unique_words = len(unique_words)    
     
    ##9 Total Punctuations count,No. of Punctuations, Frequency of each Punctuation
    # Define the list of punctuations you want to count
    punctuations = ["؟", "!", "?", "...", "؛", ".", "،", ")", "(", "۔"]
    characters = re.findall(r'.', text)
    punctuation_freq = {}
    punctuation_count = 0
    for char in characters:
        if char in punctuations:
            punctuation_freq[char] = punctuation_freq.get(char, 0) + 1
            punctuation_count += 1
          
    ##10 Punctuation Count / Total Words
    punctuation_ratio = punctuation_count/total_words if total_words > 0 else 0
    
        
    
    
    
    
    #******** 2. Character-Level stylometry features (16) ********
    
    ##1 Total commas
    comma_count = text.count('،')    

    ##2 Total dashes
    dashes_count = text.count('۔')
    
    ##3 Total open_parentheses
    open_parentheses_count = text.count('(')
    
    ##4 Total close_parentheses
    close_parentheses_count = text.count(')')

    ##5 Total semicolons
    semicolons_count = text.count('؛')

    ##6 Total white spaces
    white_spaces_count = text.count(' ')

    ##7 Total question_marks
    question_marks_count = text.count('؟')

    ##8 Total exclamation marks
    exclamation_marks_count = text.count('!')

    ##9 Total ampersands
    ampersands_count = text.count('&')

    ##10 Total percentage signs
    percentage_signs = text.count('%')

    ##11 Total single quotes
    number_of_left_single_quotes = text.count('‘')
    number_of_right_single_quotes = text.count('’')
    number_of_single_quotes = number_of_left_single_quotes + number_of_right_single_quotes

    ##12 Total double quotes
    number_of_left_double_quotes = text.count('“')
    number_of_right_double_quotes = text.count('”')
    number_of_double_quotes = number_of_left_double_quotes + number_of_right_double_quotes

    ##13 Total colons
    colons_count = text.count(':')
    
    ##14 Total characters without spaces 
    number_of_characters_without_spaces = len(text.replace(" ",""))

    ##15 Total digits
    digit_count = 0
    for character in text:
        if character.isdigit():
            digit_count += 1

    ##16 Total brackets
    no_of_all_brackets = 0
    for b in text:
        if b == '(' or b == '{' or b == '[' or b == ')' or b == '}' or b == ']':
            no_of_all_brackets = no_of_all_brackets + 1   
    
    
    
    
    
    
    #******** 3. Sentence-Level stylometry features (7) ********
    
    ##1 total number of sentences
    sentences = re.split(r'(?<=[.۔؟!؛])\s+', text)
    total_sentences = len(sentences)
    if len(text) != 0 and total_sentences == 0:
        total_sentences += 1

    ##2 average no. of words per sentence
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    average_sentence_length_in_words = sum(sentence_lengths) / total_sentences if total_sentences > 0 else 0
    
    ##3 average no. of Characters per sentence or average sentence length
    total_chars = sum(len(sentence) for sentence in sentences)
    average_sentence_length_in_char = total_chars / total_sentences if total_sentences > 0 else 0
    
    ##4 min sentence length
    min_sentence_length = min(len(sentence.split()) for sentence in sentences)
    
    ##5 max sentence length
    max_sentence_length = max(len(sentence.split()) for sentence in sentences)
    
    ##6 average no. of White Spaces per sentence
    average_spaces_per_sentence = sum(sentence.count(' ') for sentence in sentences) / len(sentences) if sentences else 0    
    
    ##7 percentage of question sentences
    total_question_sentences = sum(1 for sentence in sentences if sentence.endswith("؟"))
    
    
    
    
    #******** 4. Para-Level stylometry features (8) ********    
    
    ##1 total number of paragraphs
    # Split the text into paragraphs based on one or more empty lines
    paragraphs = re.split(r'\n\s*\n', text)
    total_paragraphs = len(paragraphs)
    
    ##2 average paragraph length
    total_length = sum(len(paragraph) for paragraph in paragraphs)
    average_paragraph_length = total_length / total_paragraphs if total_paragraphs > 0 else 0
    
    ##3 min paragraph length
    min_paragraph_length = min(len(p) for p in paragraphs)
    
    ##4 max paragraph length
    max_paragraph_length = max(len(p) for p in paragraphs)
    
    ##5 average no. of words per paragraph
    average_words_per_paragraph = sum(len(paragraph.split()) for paragraph in paragraphs) / len(paragraphs)
    
    ##6 average no. of Sentences per paragraph
    sentences_per_paragraph = [len(re.findall(r'[\w۔]+', paragraph)) for paragraph in paragraphs]
    average_sentences_per_paragraph = sum(sentences_per_paragraph) / total_paragraphs if total_paragraphs > 0 else 0

    ##7 average no. of Punctuations per paragraph
    punctuation_ratio_per_para = punctuation_count/total_words if total_words > 0 else 0
    
    ##8 averageno. of Question Marks per paragraph
    average_question_marks_per_paragraph = question_marks_count / total_paragraphs if total_paragraphs > 0 else 0
    
    
    
      
    #******** 5. Document-Level stylometry features (1) ********
     
    ##1 Lexical Density
    # Create a list of unique words (content words)
    unique_words = list(set(words))

    # Calculate the number of content words in the text
    content_word_count = len(unique_words)

    # Calculate lexical density (content word count / total words)
    lexical_density = content_word_count / total_words if total_words > 0 else 0


    

    
    #******** 6. Textual Entropy stylometry features (1) ********
    
    ##1 Calculate textual entropy    
    # Calculate the frequency of each character
    char_freq = Counter(text)
    total_chars = len(text)
    entropy = -sum((count / total_chars) * math.log(count / total_chars, 2) for count in char_freq.values())
    
    return total_words, total_lines, empty_line_count, average_word_length, ratio_of_3, ratio_of_4, ratio_of_long_words, total_unique_words, punctuation_count, punctuation_ratio, comma_count, dashes_count, open_parentheses_count, close_parentheses_count, semicolons_count, white_spaces_count, question_marks_count, exclamation_marks_count, ampersands_count, percentage_signs, number_of_single_quotes, number_of_double_quotes, colons_count, number_of_characters_without_spaces, digit_count, no_of_all_brackets, total_sentences, average_sentence_length_in_words, average_sentence_length_in_char, min_sentence_length, max_sentence_length, average_spaces_per_sentence, total_question_sentences, total_paragraphs, average_paragraph_length, min_paragraph_length, max_paragraph_length, average_words_per_paragraph, average_sentences_per_paragraph, punctuation_ratio_per_para, average_question_marks_per_paragraph, lexical_density, entropy           

##### 3.1 Extract Stylometry Features at granularity level

In [9]:
# 1- Extract Stylometry Features of Para-Level Dataset

# list of all extracted features
stylometry_features_list_para_level = []

for row in dataset_para['Urdu Text']:
           
    ## resolving single \n and double \n problem
    # Replace '\n\n' with a space to join paragraphs
    text = row.replace('\n\n', '%%')

    # Replace '\n' with a space to join paragraphs
    text = text.replace('\n', '%%')

    # Split the text into paragraphs based on the space character
    paragraphs = text.split('%%')

    # Remove any empty spaces
    paragraphs = [p.strip() for p in paragraphs if p.strip()]

    # paragraphs which are large but dont endswith(-), - sign inserted at the end of those paragraphs
    # removing paragraphs which are small and dont endswith('-') and small bullet points

    paragraph_list = []
    for p in paragraphs:
        if len(p)<80: # small paragraphs excluded 
            continue
        elif (not p.endswith('۔') and not p.endswith('.') and not p.endswith('؟') and not p.endswith('؛')) and len(p)>150:
            paragraph_list.append(p + '۔') # large paragraphs without ending on -, handeled here
        elif (p.endswith('۔') or p.endswith('.') or p.endswith('؟') or p.endswith('؛')):
            paragraph_list.append(p) #paragraphs added to list
        else:
            continue

    text = '\n\n'.join(paragraph_list)  

    # Write the results to list
    stylometry_features_list_para_level.append(calculate_text_features(text))

In [10]:
# Store para-level dataset into .csv File and Dataframe

# Convert Stylometry Features list into Dataframe    
para_stylometry_features = pd.DataFrame(stylometry_features_list_para_level)

# adding header row accoring to the extracted stylometry feature
para_stylometry_features.columns = ['total_words', 'total_lines', 'empty_line_count', 'average_word_length', 'ratio_of_3', 'ratio_of_4', 'ratio_of_long_words', 'total_unique_words', 'punctuation_count', 'punctuation_ratio', 'comma_count', 'dashes_count', 'open_parentheses_count', 'close_parentheses_count', 'semicolons_count', 'white_spaces_count', 'question_marks_count', 'exclamation_marks_count', 'ampersands_count', 'percentage_signs', 'number_of_single_quotes', 'number_of_double_quotes', 'colons_count', 'number_of_characters_without_spaces', 'digit_count', 'no_of_all_brackets', 'total_sentences', 'average_sentence_length_in_words', 'average_sentence_length_in_char', 'min_sentence_length', 'max_sentence_length', 'average_spaces_per_sentence', 'total_question_sentences', 'total_paragraphs', 'average_paragraph_length', 'min_paragraph_length', 'max_paragraph_length', 'average_words_per_paragraph', 'average_sentences_per_paragraph', 'punctuation_ratio_per_para', 'average_question_marks_per_paragraph', 'lexical_density', 'entropy']

# Combine the Input Feature Vectors and Output Label
para_stylometry_features = pd.concat([dataset_para[['File Names']], para_stylometry_features, dataset_para[['Label']]], axis=1)

# store results into .csv file
para_stylometry_features.to_csv(r'para_stylometry_features.csv', index = False, header=True)

# Display all Stylometric Features of Dataset
print("\nStylometry Features of Para-Level Dataset :")
print("=============================================\n")
para_stylometry_features


Stylometry Features of Para-Level Dataset :



Unnamed: 0,File Names,total_words,total_lines,empty_line_count,average_word_length,ratio_of_3,ratio_of_4,ratio_of_long_words,total_unique_words,punctuation_count,...,average_paragraph_length,min_paragraph_length,max_paragraph_length,average_words_per_paragraph,average_sentences_per_paragraph,punctuation_ratio_per_para,average_question_marks_per_paragraph,lexical_density,entropy,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1610,25,12,3.690062,0.24,0.22,0.02,753,87,...,579.846154,396,796,123.846154,127.538462,0.054037,0.000000,0.467702,4.427147,Non-Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1253,17,8,3.492418,0.26,0.23,0.01,544,84,...,624.444444,148,1777,139.222222,139.888889,0.067039,0.000000,0.434158,4.375206,Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1470,27,13,3.560544,0.25,0.24,0.01,556,88,...,478.000000,196,802,105.000000,106.714286,0.059864,0.428571,0.378231,4.411654,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,862,29,14,3.474478,0.26,0.23,0.01,377,44,...,256.133333,152,718,57.466667,57.533333,0.051044,0.000000,0.437355,4.387582,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1261,1,0,3.596352,0.24,0.23,0.02,508,111,...,5833.000000,5833,5833,1261.000000,1341.000000,0.088025,15.000000,0.402855,4.557093,Non-Plagiarized
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1529,21,10,3.578810,0.25,0.21,0.01,661,64,...,635.454545,162,1022,139.000000,139.272727,0.041857,0.000000,0.432309,4.383694,Plagiarized
5432,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1556,25,12,3.475578,0.23,0.25,0.01,691,115,...,534.692308,109,772,119.692308,120.615385,0.073907,1.384615,0.444087,4.386175,Non-Plagiarized
5433,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,814,13,6,3.523342,0.23,0.21,0.02,388,73,...,525.000000,224,1728,116.285714,117.714286,0.089681,0.000000,0.476658,4.431409,Plagiarized
5434,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1827,25,12,3.550082,0.26,0.24,0.01,631,14,...,638.769231,318,880,140.538462,140.307692,0.007663,0.000000,0.345375,4.423068,Non-Plagiarized


In [11]:
# 2- Extract Stylometry Features of Sentence-Level Dataset

# list of all extracted features
stylometry_features_list_sentence_level = []

for row in dataset_sen['Urdu Text']:
           
    ## resolving single \n and double \n problem
    # Replace '\n\n' with a space to join paragraphs
    text = row.replace('\n\n', '%%')

    # Replace '\n' with a space to join paragraphs
    text = text.replace('\n', '%%')

    # Split the text into paragraphs based on the space character
    paragraphs = text.split('%%')

    # Remove any empty spaces
    paragraphs = [p.strip() for p in paragraphs if p.strip()]

    # paragraphs which are large but dont endswith(-), - sign inserted at the end of those paragraphs
    # removing paragraphs which are small and dont endswith('-') and small bullet points

    paragraph_list = []
    for p in paragraphs:
        if len(p)<80: # small paragraphs excluded 
            continue
        elif (not p.endswith('۔') and not p.endswith('.') and not p.endswith('؟') and not p.endswith('؛')) and len(p)>150:
            paragraph_list.append(p + '۔') # large paragraphs without ending on -, handeled here
        elif (p.endswith('۔') or p.endswith('.') or p.endswith('؟') or p.endswith('؛')):
            paragraph_list.append(p) #paragraphs added to list
        else:
            continue

    text = '\n\n'.join(paragraph_list)  

    # Write the results to list
    stylometry_features_list_sentence_level.append(calculate_text_features(text))

In [12]:
# Store Sen-level Dataset into .csv File and Dataframe

# Convert Stylometry Features list into Dataframe    
sen_stylometry_features = pd.DataFrame(stylometry_features_list_sentence_level)

# adding header row accoring to the extracted stylometry feature
sen_stylometry_features.columns = ['total_words', 'total_lines', 'empty_line_count', 'average_word_length', 'ratio_of_3', 'ratio_of_4', 'ratio_of_long_words', 'total_unique_words', 'punctuation_count', 'punctuation_ratio', 'comma_count', 'dashes_count', 'open_parentheses_count', 'close_parentheses_count', 'semicolons_count', 'white_spaces_count', 'question_marks_count', 'exclamation_marks_count', 'ampersands_count', 'percentage_signs', 'number_of_single_quotes', 'number_of_double_quotes', 'colons_count', 'number_of_characters_without_spaces', 'digit_count', 'no_of_all_brackets', 'total_sentences', 'average_sentence_length_in_words', 'average_sentence_length_in_char', 'min_sentence_length', 'max_sentence_length', 'average_spaces_per_sentence', 'total_question_sentences', 'total_paragraphs', 'average_paragraph_length', 'min_paragraph_length', 'max_paragraph_length', 'average_words_per_paragraph', 'average_sentences_per_paragraph', 'punctuation_ratio_per_para', 'average_question_marks_per_paragraph', 'lexical_density', 'entropy']

# Combine the Input Feature Vectors and Output Label
sen_stylometry_features = pd.concat([dataset_sen[['File Names']], sen_stylometry_features, dataset_sen[['Label']]], axis=1)

# store results into .csv file
sen_stylometry_features.to_csv(r'sen_stylometry_features.csv', index = False, header=True)

# Display all Stylometric Features of Dataset
print("\nStylometry Features of Sen-Level Dataset :")
print("============================================\n")
sen_stylometry_features


Stylometry Features of Sen-Level Dataset :



Unnamed: 0,File Names,total_words,total_lines,empty_line_count,average_word_length,ratio_of_3,ratio_of_4,ratio_of_long_words,total_unique_words,punctuation_count,...,average_paragraph_length,min_paragraph_length,max_paragraph_length,average_words_per_paragraph,average_sentences_per_paragraph,punctuation_ratio_per_para,average_question_marks_per_paragraph,lexical_density,entropy,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1147,29,14,3.682650,0.24,0.22,0.02,531,87,...,357.133333,244,480,76.466667,78.266667,0.075850,0.133333,0.462947,4.437910,Non-Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1166,13,6,3.484563,0.26,0.23,0.01,526,77,...,746.000000,316,1769,166.571429,167.285714,0.066038,0.000000,0.451115,4.375210,Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1261,11,5,3.762887,0.23,0.25,0.02,571,100,...,1000.000000,475,1710,210.166667,213.833333,0.079302,0.166667,0.452815,4.452497,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,612,25,12,3.508170,0.25,0.24,0.00,266,30,...,211.230769,162,305,47.076923,47.153846,0.049020,0.000000,0.434641,4.385417,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1143,21,10,3.381452,0.27,0.20,0.01,442,49,...,454.272727,270,694,103.909091,104.090909,0.042870,0.000000,0.386702,4.331698,Non-Plagiarized
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1522,19,9,3.578844,0.25,0.21,0.01,662,64,...,695.900000,265,1153,152.200000,152.600000,0.042050,0.000000,0.434954,4.389411,Plagiarized
5432,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1007,1,0,3.524330,0.26,0.20,0.02,430,52,...,4563.000000,4563,4563,1007.000000,1008.000000,0.051639,1.000000,0.427011,4.282201,Non-Plagiarized
5433,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,799,11,5,3.508135,0.24,0.20,0.02,381,69,...,599.333333,253,1728,133.166667,134.333333,0.086358,0.000000,0.476846,4.422597,Plagiarized
5434,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1256,15,7,3.661624,0.24,0.23,0.02,561,87,...,730.875000,257,1817,157.000000,159.250000,0.069268,0.875000,0.446656,4.497136,Non-Plagiarized


In [13]:
# 3- Extract Stylometry Features of Sentence+Para-Level Dataset

# list of all extracted features
stylometry_features_list_complete = []

for row in dataset_complete['Urdu Text']:
           
    ## resolving single \n and double \n problem
    # Replace '\n\n' with a space to join paragraphs
    text = row.replace('\n\n', '%%')

    # Replace '\n' with a space to join paragraphs
    text = text.replace('\n', '%%')

    # Split the text into paragraphs based on the space character
    paragraphs = text.split('%%')

    # Remove any empty spaces
    paragraphs = [p.strip() for p in paragraphs if p.strip()]

    # paragraphs which are large but dont endswith(-), - sign inserted at the end of those paragraphs
    # removing paragraphs which are small and dont endswith('-') and small bullet points

    paragraph_list = []
    for p in paragraphs:
        if len(p)<80: # small paragraphs excluded 
            continue
        elif (not p.endswith('۔') and not p.endswith('.') and not p.endswith('؟') and not p.endswith('؛')) and len(p)>150:
            paragraph_list.append(p + '۔') # large paragraphs without ending on -, handeled here
        elif (p.endswith('۔') or p.endswith('.') or p.endswith('؟') or p.endswith('؛')):
            paragraph_list.append(p) #paragraphs added to list
        else:
            continue

    text = '\n\n'.join(paragraph_list)  

    # Write the results to list
    stylometry_features_list_complete.append(calculate_text_features(text))

In [14]:
# Store Sen+Para-Level dataset into .csv File and Dataframe

# Convert Stylometry Features list into Dataframe    
all_stylometry_features = pd.DataFrame(stylometry_features_list_complete)

# adding header row accoring to the extracted stylometry feature
all_stylometry_features.columns = ['total_words', 'total_lines', 'empty_line_count', 'average_word_length', 'ratio_of_3', 'ratio_of_4', 'ratio_of_long_words', 'total_unique_words', 'punctuation_count', 'punctuation_ratio', 'comma_count', 'dashes_count', 'open_parentheses_count', 'close_parentheses_count', 'semicolons_count', 'white_spaces_count', 'question_marks_count', 'exclamation_marks_count', 'ampersands_count', 'percentage_signs', 'number_of_single_quotes', 'number_of_double_quotes', 'colons_count', 'number_of_characters_without_spaces', 'digit_count', 'no_of_all_brackets', 'total_sentences', 'average_sentence_length_in_words', 'average_sentence_length_in_char', 'min_sentence_length', 'max_sentence_length', 'average_spaces_per_sentence', 'total_question_sentences', 'total_paragraphs', 'average_paragraph_length', 'min_paragraph_length', 'max_paragraph_length', 'average_words_per_paragraph', 'average_sentences_per_paragraph', 'punctuation_ratio_per_para', 'average_question_marks_per_paragraph', 'lexical_density', 'entropy']

# Combine the Input Feature Vectors and Output Label
all_stylometry_features = pd.concat([dataset_complete[['File Names']], all_stylometry_features, dataset_complete[['Label']]], axis=1)

# store results into .csv file
all_stylometry_features.to_csv(r'all_stylometry_features.csv', index = False, header=True)

# Display all Stylometric Features of Dataset
print("\nAll Stylometry Features of Complete Dataset :")
print("================================================\n")
all_stylometry_features


All Stylometry Features of Complete Dataset :



Unnamed: 0,File Names,total_words,total_lines,empty_line_count,average_word_length,ratio_of_3,ratio_of_4,ratio_of_long_words,total_unique_words,punctuation_count,...,average_paragraph_length,min_paragraph_length,max_paragraph_length,average_words_per_paragraph,average_sentences_per_paragraph,punctuation_ratio_per_para,average_question_marks_per_paragraph,lexical_density,entropy,Label
0,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1147,29,14,3.682650,0.24,0.22,0.02,531,87,...,357.133333,244,480,76.466667,78.266667,0.075850,0.133333,0.462947,4.437910,Non-Plagiarized
1,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1166,13,6,3.484563,0.26,0.23,0.01,526,77,...,746.000000,316,1769,166.571429,167.285714,0.066038,0.000000,0.451115,4.375210,Plagiarized
2,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1261,11,5,3.762887,0.23,0.25,0.02,571,100,...,1000.000000,475,1710,210.166667,213.833333,0.079302,0.166667,0.452815,4.452497,Non-Plagiarized
3,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,612,25,12,3.508170,0.25,0.24,0.00,266,30,...,211.230769,162,305,47.076923,47.153846,0.049020,0.000000,0.434641,4.385417,Plagiarized
4,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1143,21,10,3.381452,0.27,0.20,0.01,442,49,...,454.272727,270,694,103.909091,104.090909,0.042870,0.000000,0.386702,4.331698,Non-Plagiarized
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10867,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1529,21,10,3.578810,0.25,0.21,0.01,661,64,...,635.454545,162,1022,139.000000,139.272727,0.041857,0.000000,0.432309,4.383694,Plagiarized
10868,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1556,25,12,3.475578,0.23,0.25,0.01,691,115,...,534.692308,109,772,119.692308,120.615385,0.073907,1.384615,0.444087,4.386175,Non-Plagiarized
10869,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,814,13,6,3.523342,0.23,0.21,0.02,388,73,...,525.000000,224,1728,116.285714,117.714286,0.089681,0.000000,0.476658,4.431409,Plagiarized
10870,C:\\Users\\hasee\\Desktop\\baby projects\\0 Da...,1827,25,12,3.550082,0.26,0.24,0.01,631,14,...,638.769231,318,880,140.538462,140.307692,0.007663,0.000000,0.345375,4.423068,Non-Plagiarized


In [15]:
# drop column of file names
sen_level_Dataset_features = sen_stylometry_features.drop(['File Names'], axis=1)
para_level_Dataset_features = para_stylometry_features.drop(['File Names'], axis=1)
all_level_Dataset_features = all_stylometry_features.drop(['File Names'], axis=1)

### 4. Apply Machine Learning Algorithms

In [16]:
# convert Sen-Level Label column values into digits
label_encoder = LabelEncoder()
sen_level_Dataset_features['Label'] = label_encoder.fit_transform(sen_level_Dataset_features['Label'])

In [17]:
# convert Para-Level Label column values into digits
para_level_Dataset_features['Label'] = label_encoder.fit_transform(para_level_Dataset_features['Label'])

In [18]:
# convert Sen+Para-Level Label column values into digits
all_level_Dataset_features['Label'] = label_encoder.fit_transform(all_level_Dataset_features['Label'])

### Lets apply 10 ML Agorithms given below
i.Decison Tree
ii. Logistic Regression
iii. Random Forest
iv. Support Vector Machines (SVM)
v. K-Nearest Neighbors (KNN)
vi. Naive Bayes
vii. Gradient Boosting Machines
viii. Linear Discriminant Analysis (LDA)
ix. Ensemble Learning (Voting Classifier)

##### 4.1 Apply Machine Learning Algorithms on 1. Sen-Level Dataset

###### 4.1.1. All stylometry features (43)

In [61]:
#split data into training and testing

X = sen_level_Dataset_features.drop(['Label'], axis=1)  # Features
y = sen_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and All stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and All stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.8318
precision: 0.8444
recall: 0.8150
f1: 0.8295
Confusion Matrix:
[[460  82]
 [101 445]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9825
precision: 0.9714
recall: 0.9945
f1: 0.9828
Confusion Matrix:
[[526  16]
 [  3 543]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9991
precision: 0.9982
recall: 1.0000
f1: 0.9991
Confusion Matrix:
[[541   1]
 [  0 546]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.8382
precision: 0.9322
recall: 0.7308
f1: 0.8193
Confusion Matrix:
[[513  29]
 [147 399]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.9439
precision: 0.9131
recall: 0.9817
f1: 0.9462
Confusion Matrix:
[[491  51]
 [ 10 536]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.7518
precision: 0.9367
recall: 0.5421
f1: 0.6868
Confusion Matrix:
[[522  20]
 [250 296]]

[4m7: Classifie

###### 4.1.2. Word-Level stylometry features (10)

In [63]:
#split data into training and testing

X = sen_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 0:10] #Word-Level stylometry features 
y = sen_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Word-Level Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Word-Level Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.7040
precision: 0.7146
recall: 0.6832
f1: 0.6985
Confusion Matrix:
[[393 149]
 [173 373]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9715
precision: 0.9574
recall: 0.9872
f1: 0.9720
Confusion Matrix:
[[518  24]
 [  7 539]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9936
precision: 0.9909
recall: 0.9963
f1: 0.9936
Confusion Matrix:
[[537   5]
 [  2 544]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7895
precision: 0.9594
recall: 0.6062
f1: 0.7430
Confusion Matrix:
[[528  14]
 [215 331]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.9301
precision: 0.8997
recall: 0.9689
f1: 0.9330
Confusion Matrix:
[[483  59]
 [ 17 529]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.7601
precision: 0.9331
recall: 0.5623
f1: 0.7017
Confusion Matrix:
[[520  22]
 [239 307]]

[4m7: Cl

###### 4.1.3. Character-Level stylometry features (16)

In [65]:
#split data into training and testing

X = sen_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 10:26] #Character-Level stylometry features 
y = sen_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Character-Level Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Character-Level Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.7776
precision: 0.7957
recall: 0.7491
f1: 0.7717
Confusion Matrix:
[[437 105]
 [137 409]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9706
precision: 0.9622
recall: 0.9799
f1: 0.9710
Confusion Matrix:
[[521  21]
 [ 11 535]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9862
precision: 0.9854
recall: 0.9872
f1: 0.9863
Confusion Matrix:
[[534   8]
 [  7 539]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7904
precision: 0.9649
recall: 0.6044
f1: 0.7432
Confusion Matrix:
[[530  12]
 [216 330]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.9072
precision: 0.8803
recall: 0.9432
f1: 0.9107
Confusion Matrix:
[[472  70]
 [ 31 515]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.7233
precision: 0.9554
recall: 0.4707
f1: 0.6307
Confusion Matrix:
[[530  12]
 [289 257]]

[4m

###### 4.1.4. Sentence-Level stylometry features (7)

In [67]:
#split data into training and testing

X = sen_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 26:33] #Sentence-Level stylometry features 
y = sen_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Sentence-Level Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Sentence-Level Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.5708
precision: 0.5776
recall: 0.5385
f1: 0.5573
Confusion Matrix:
[[327 215]
 [252 294]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9596
precision: 0.9580
recall: 0.9615
f1: 0.9598
Confusion Matrix:
[[519  23]
 [ 21 525]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9715
precision: 0.9656
recall: 0.9780
f1: 0.9718
Confusion Matrix:
[[523  19]
 [ 12 534]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7243
precision: 0.8555
recall: 0.5421
f1: 0.6637
Confusion Matrix:
[[492  50]
 [250 296]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.9228
precision: 0.8915
recall: 0.9634
f1: 0.9261
Confusion Matrix:
[[478  64]
 [ 20 526]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.5836
precision: 0.7541
recall: 0.2527
f1: 0.3786
Confusion Matrix:
[[497  45]
 [408 138]]

[4m7

###### 4.1.5. Para-Level stylometry features, Document-Level stylometry features, Textual Entropy stylometry features (10)

In [69]:
#split data into training and testing

X = sen_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 33:]
y = sen_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Para-Level , Document-Level, Textual Entropy Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 1. Sen-Level Dataset and Para-Level , Document-Level, Textual Entropy Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.6792
precision: 0.7627
recall: 0.5238
f1: 0.6211
Confusion Matrix:
[[453  89]
 [260 286]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9568
precision: 0.9416
recall: 0.9744
f1: 0.9577
Confusion Matrix:
[[509  33]
 [ 14 532]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9853
precision: 0.9836
recall: 0.9872
f1: 0.9854
Confusion Matrix:
[[533   9]
 [  7 539]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7619
precision: 0.8345
recall: 0.6557
f1: 0.7344
Confusion Matrix:
[[471  71]
 [188 358]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8796
precision: 0.8571
recall: 0.9121
f1: 0.8838
Confusion Matrix:
[[459  83]
 [ 48 498]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.6461
precision: 0.8286
recall: 0.3718
f1: 0.5133
Confusion Matrix:

##### 4.2 Apply Machine Learning Algorithms on 2. Para-Level Dataset

###### 4.2.1. All stylometry features (43)

In [71]:
#split data into training and testing

X = para_level_Dataset_features.drop(['Label'], axis=1)  # Features
y = para_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and All stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and All stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.8382
precision: 0.8599
recall: 0.8095
f1: 0.8340
Confusion Matrix:
[[470  72]
 [104 442]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9669
precision: 0.9620
recall: 0.9725
f1: 0.9672
Confusion Matrix:
[[521  21]
 [ 15 531]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9881
precision: 0.9890
recall: 0.9872
f1: 0.9881
Confusion Matrix:
[[536   6]
 [  7 539]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.8419
precision: 0.8864
recall: 0.7857
f1: 0.8330
Confusion Matrix:
[[487  55]
 [117 429]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.9154
precision: 0.8861
recall: 0.9542
f1: 0.9189
Confusion Matrix:
[[475  67]
 [ 25 521]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.7638
precision: 0.9392
recall: 0.5659
f1: 0.7063
Confusion Matrix:
[[522  20]
 [237 309]]

[4m7: Classifi

###### 4.2.2. Word-Level stylometry features (10)

In [73]:
#split data into training and testing

X = para_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 0:10] #Word-Level stylometry features 
y = para_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [74]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Word-Level Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Word-Level Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.7096
precision: 0.7054
recall: 0.7234
f1: 0.7143
Confusion Matrix:
[[377 165]
 [151 395]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9035
precision: 0.9002
recall: 0.9084
f1: 0.9043
Confusion Matrix:
[[487  55]
 [ 50 496]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9550
precision: 0.9680
recall: 0.9414
f1: 0.9545
Confusion Matrix:
[[525  17]
 [ 32 514]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7454
precision: 0.8768
recall: 0.5733
f1: 0.6932
Confusion Matrix:
[[498  44]
 [233 313]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8208
precision: 0.8117
recall: 0.8370
f1: 0.8242
Confusion Matrix:
[[436 106]
 [ 89 457]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.6581
precision: 0.9065
recall: 0.3553
f1: 0.5105
Confusion Matrix:
[[522  20]
 [352 194]]

[4m7: C

###### 4.2.3. Character-Level stylometry features (16)

In [75]:
#split data into training and testing

X = para_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 10:26] #Character-Level stylometry features 
y = para_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Character-Level Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Character-Level Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.7592
precision: 0.7630
recall: 0.7546
f1: 0.7587
Confusion Matrix:
[[414 128]
 [134 412]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9421
precision: 0.9399
recall: 0.9451
f1: 0.9425
Confusion Matrix:
[[509  33]
 [ 30 516]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9651
precision: 0.9686
recall: 0.9615
f1: 0.9651
Confusion Matrix:
[[525  17]
 [ 21 525]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7454
precision: 0.8645
recall: 0.5842
f1: 0.6973
Confusion Matrix:
[[492  50]
 [227 319]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8199
precision: 0.8038
recall: 0.8480
f1: 0.8253
Confusion Matrix:
[[429 113]
 [ 83 463]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.6737
precision: 0.9401
recall: 0.3736
f1: 0.5347
Confusion Matrix:
[[529  13]
 [342 204]]

[4

###### 4.2.4. Sentence-Level stylometry features (7)

In [77]:
#split data into training and testing

X = para_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 26:33] #Sentence-Level stylometry features 
y = para_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Sentence-Level Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Sentence-Level Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.6314
precision: 0.6171
recall: 0.6996
f1: 0.6558
Confusion Matrix:
[[305 237]
 [164 382]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.8879
precision: 0.8813
recall: 0.8974
f1: 0.8893
Confusion Matrix:
[[476  66]
 [ 56 490]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9228
precision: 0.9278
recall: 0.9176
f1: 0.9227
Confusion Matrix:
[[503  39]
 [ 45 501]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7188
precision: 0.7703
recall: 0.6264
f1: 0.6909
Confusion Matrix:
[[440 102]
 [204 342]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8520
precision: 0.8302
recall: 0.8864
f1: 0.8574
Confusion Matrix:
[[443  99]
 [ 62 484]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.6048
precision: 0.5695
recall: 0.8700
f1: 0.6884
Confusion Matrix:
[[183 359]
 [ 71 475]]

[4m

###### 4.2.5. Para-Level stylometry features, Document-Level stylometry features, Textual Entropy stylometry features (10)

In [79]:
#split data into training and testing

X = para_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 33:]
y = para_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Para-Level , Document-Level, Textual Entropy Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 2. Para-Level Dataset and Para-Level , Document-Level, Textual Entropy Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.7629
precision: 0.7857
recall: 0.7253
f1: 0.7543
Confusion Matrix:
[[434 108]
 [150 396]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9338
precision: 0.9232
recall: 0.9469
f1: 0.9349
Confusion Matrix:
[[499  43]
 [ 29 517]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9688
precision: 0.9588
recall: 0.9799
f1: 0.9692
Confusion Matrix:
[[519  23]
 [ 11 535]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7647
precision: 0.8326
recall: 0.6648
f1: 0.7393
Confusion Matrix:
[[469  73]
 [183 363]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8548
precision: 0.8380
recall: 0.8810
f1: 0.8589
Confusion Matrix:
[[449  93]
 [ 65 481]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.5754
precision: 0.5435
recall: 0.9615
f1: 0.6944
Confusion Matrix

##### 4.3 Apply Machine Learning Algorithms on 3. Sen+Para-Level Dataset

###### 4.3.1. All stylometry features (43)

In [81]:
#split data into training and testing

X = all_level_Dataset_features.drop(['Label'], axis=1)  # Features
y = all_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and All stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and All stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.8152
precision: 0.8238
recall: 0.7879
f1: 0.8054
Confusion Matrix:
[[941 178]
 [224 832]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9720
precision: 0.9654
recall: 0.9773
f1: 0.9713
Confusion Matrix:
[[1082   37]
 [  24 1032]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9917
precision: 0.9896
recall: 0.9934
f1: 0.9915
Confusion Matrix:
[[1108   11]
 [   7 1049]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.8552
precision: 0.9186
recall: 0.7699
f1: 0.8377
Confusion Matrix:
[[1047   72]
 [ 243  813]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.9352
precision: 0.9045
recall: 0.9688
f1: 0.9355
Confusion Matrix:
[[1011  108]
 [  33 1023]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.7421
precision: 0.9381
recall: 0.5019
f1: 0.6539
Confusion Matrix:
[[1084   35]
 [ 526

###### 4.3.2. Word-Level stylometry features (10)

In [83]:
#split data into training and testing

X = all_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 0:10] #Word-Level stylometry features 
y = all_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [84]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Word-Level stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Word-Level stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.6901
precision: 0.6816
recall: 0.6790
f1: 0.6803
Confusion Matrix:
[[784 335]
 [339 717]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9278
precision: 0.9158
recall: 0.9375
f1: 0.9265
Confusion Matrix:
[[1028   91]
 [  66  990]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9669
precision: 0.9740
recall: 0.9574
f1: 0.9656
Confusion Matrix:
[[1092   27]
 [  45 1011]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7752
precision: 0.8987
recall: 0.6051
f1: 0.7233
Confusion Matrix:
[[1047   72]
 [ 417  639]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8878
precision: 0.8658
recall: 0.9100
f1: 0.8873
Confusion Matrix:
[[970 149]
 [ 95 961]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.7085
precision: 0.9089
recall: 0.4441
f1: 0.5967
Confusion Matrix:
[[1072   47]
 [ 

###### 4.3.3. Character-Level stylometry features (16)

In [85]:
#split data into training and testing

X = all_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 10:26] #Character-Level stylometry features 
y = all_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [86]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Character stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Character stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.7526
precision: 0.7462
recall: 0.7434
f1: 0.7448
Confusion Matrix:
[[852 267]
 [271 785]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9582
precision: 0.9514
recall: 0.9631
f1: 0.9572
Confusion Matrix:
[[1067   52]
 [  39 1017]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9811
precision: 0.9801
recall: 0.9811
f1: 0.9806
Confusion Matrix:
[[1098   21]
 [  20 1036]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7706
precision: 0.9126
recall: 0.5833
f1: 0.7117
Confusion Matrix:
[[1060   59]
 [ 440  616]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8713
precision: 0.8514
recall: 0.8902
f1: 0.8704
Confusion Matrix:
[[955 164]
 [116 940]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.6938
precision: 0.9185
recall: 0.4053
f1: 0.5624
Confusion Matrix:
[[1081   38]
 [ 6

###### 4.3.4. Sentence-Level stylometry features (7)

In [87]:
#split data into training and testing

X = all_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 26:33] #Sentence-Level stylometry features 
y = all_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Sentence stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Sentence stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.5857
precision: 0.5680
recall: 0.6127
f1: 0.5895
Confusion Matrix:
[[627 492]
 [409 647]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9232
precision: 0.9174
recall: 0.9252
f1: 0.9213
Confusion Matrix:
[[1031   88]
 [  79  977]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9490
precision: 0.9548
recall: 0.9394
f1: 0.9470
Confusion Matrix:
[[1072   47]
 [  64  992]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7232
precision: 0.8429
recall: 0.5284
f1: 0.6496
Confusion Matrix:
[[1015  104]
 [ 498  558]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8924
precision: 0.8750
recall: 0.9081
f1: 0.8913
Confusion Matrix:
[[982 137]
 [ 97 959]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.5710
precision: 0.6253
recall: 0.2907
f1: 0.3969
Confusion Matrix:
[[935 184]
 [749 3

###### 4.3.5. Para-Level stylometry features, Document-Level stylometry features, Textual Entropy stylometry features (10)

In [89]:
#split data into training and testing

X = sen_level_Dataset_features.drop(['Label'], axis=1)  # Features
X = X.iloc[:, 33:]
y = sen_level_Dataset_features['Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
# Convert X_train and X_test to NumPy arrays
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Ensure the arrays are contiguous
X_train_np = np.ascontiguousarray(X_train_np)
X_test_np = np.ascontiguousarray(X_test_np)


# Initialize classifiers
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
mlp = MLPClassifier(max_iter=1000)  # Adjust max_iter based on convergence
xgb = GradientBoostingClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('dtree', dtree),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('nb', nb),
    ('lda', lda),
    ('mlp', mlp),
    ('xgb', xgb)
], voting='hard')

# Train and evaluate each classifier
classifiers = [logreg, dtree, rf, svm, knn, nb, lda, mlp, xgb, voting_clf]
print('\033[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Para-Level , Document-Level, Textual Entropy Stylometry features\n\033[0m')

for i,clf in enumerate(classifiers):
    clf.fit(X_train_np, y_train)
    y_pred = clf.predict(X_test_np)
    print(f"\033[4m{i+1}: Classifier Name: {clf.__class__.__name__}\033[0m\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nprecision: {precision_score(y_test,y_pred):.4f}\nrecall: {recall_score(y_test,y_pred):.4f}\nf1: {f1_score(y_test,y_pred):.4f}\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}\n")

[1;4mApply Machine Learning Algorithms on 3. Sen+Para-Level Dataset and Para-Level , Document-Level, Textual Entropy Stylometry features
[0m
[4m1: Classifier Name: LogisticRegression[0m
Accuracy: 0.6792
precision: 0.7627
recall: 0.5238
f1: 0.6211
Confusion Matrix:
[[453  89]
 [260 286]]

[4m2: Classifier Name: DecisionTreeClassifier[0m
Accuracy: 0.9540
precision: 0.9397
recall: 0.9707
f1: 0.9550
Confusion Matrix:
[[508  34]
 [ 16 530]]

[4m3: Classifier Name: RandomForestClassifier[0m
Accuracy: 0.9862
precision: 0.9819
recall: 0.9908
f1: 0.9863
Confusion Matrix:
[[532  10]
 [  5 541]]

[4m4: Classifier Name: SVC[0m
Accuracy: 0.7619
precision: 0.8345
recall: 0.6557
f1: 0.7344
Confusion Matrix:
[[471  71]
 [188 358]]

[4m5: Classifier Name: KNeighborsClassifier[0m
Accuracy: 0.8796
precision: 0.8571
recall: 0.9121
f1: 0.8838
Confusion Matrix:
[[459  83]
 [ 48 498]]

[4m6: Classifier Name: GaussianNB[0m
Accuracy: 0.6461
precision: 0.8286
recall: 0.3718
f1: 0.5133
Confusion Ma