<a href="https://colab.research.google.com/github/edojatheophilus/AI-vs-Human-Text-Detection/blob/Test/sample_dataset_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Load the dataset, Explore the dataset to understand its structure and contents**

In [None]:
# Importing necessary libraries
import pandas as pd                 # For data manipulation and analysis
import numpy as np                  # For numerical computations
import re                          # For regular expressions
import nltk                        # Natural Language Toolkit
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords  # Stopwords
from nltk.tokenize import word_tokenize  # Tokenization
from nltk.stem import WordNetLemmatizer  # Lemmatization
import spacy                       # Advanced NLP library
from spacy.lang.en import English
#from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
!pip install kaggle



In [None]:
from google.colab import files
files.upload()  # Upload your Kaggle API token (kaggle.json)
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!kaggle datasets download -d shanegerami/ai-vs-human-text


Saving kaggle.json to kaggle.json
Downloading ai-vs-human-text.zip to /content
 99% 346M/350M [00:10<00:00, 37.0MB/s]
100% 350M/350M [00:10<00:00, 35.2MB/s]


In [None]:
!unzip ai-vs-human-text.zip

Archive:  ai-vs-human-text.zip
  inflating: AI_Human.csv            


In [None]:
#load the dataset
df = pd.read_csv('AI_Human.csv')

In [None]:
#Display the first 5 rows of the DataFrame
df.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [None]:
#dataset shape
df.shape

(487235, 2)

In [None]:
#take a sample data from dataset
df_zeros = df[df['generated'] == 0].head(8000)
df_ones = df[df['generated'] == 1].head(8000)

In [None]:
new_df = pd.concat([df_zeros, df_ones], ignore_index=True)

In [None]:
sample_df = new_df.copy()

In [None]:
# information about the DataFrame
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   text       16000 non-null  object 
 1   generated  16000 non-null  float64
dtypes: float64(1), object(1)
memory usage: 250.1+ KB


In [None]:
#  counts of total texts, human-written texts, and AI-generated texts in the 'generated' column of the DataFrame
total_text_count = sample_df['generated'].count()
human_written_txt_count = (sample_df['generated'] == 0.0).sum()
ai_generated_txt_count = (sample_df['generated'] == 1.0).sum()
print('Total Texts:', total_text_count)
print('Human Written Texts:', human_written_txt_count )
print('AI Generated Texts:', ai_generated_txt_count )


Total Texts: 16000
Human Written Texts: 8000
AI Generated Texts: 8000


# **02. DATA CLEANING AND PERFORM PREPROCESSING**

## ***Remove List of tags  (newline characters and single quotes)***

In [None]:
# Define a function to remove specific tags from text

def remove_tags(text):
    # List of tags to remove (newline characters and single quotes)
    tags_to_remove = ['\n', '\'']

    # Iterate through the tags and replace them with an empty string
    for tag in tags_to_remove:
        if tag == '\n':
            text = text.replace(tag, ' ')
        else:
            text = text.replace(tag, '')

    return text

# Apply the remove_tags function to the 'text' column of the DataFrame
# This line iterates over all rows in the 'text' column and removes the specified tags
sample_df['text'] = sample_df['text'].apply(remove_tags)


# ***Count of punctuation marks***

In [None]:
import string
from nltk.tokenize import word_tokenize

def count_punctuation_marks(text):
    punctuation_count = sum(1 for char in text if char in string.punctuation)
    return punctuation_count

# Create new columns for punctuation and linking words count
sample_df['punctuation_count'] = sample_df['text'].apply(count_punctuation_marks)



# ***Linking words count***

In [None]:
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

# Initialize Porter Stemmer and set of stopwords
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Define additional linking words
additional_linking_words = {'to', 'the', 'and', 'of', 'in', 'on', 'for', 'with', 'at', 'a', 'an'}

# Combine stopwords and additional linking words
linking_words = stop_words.union(additional_linking_words)

def count_linking_words(text):
    # Tokenize the text and count the number of linking words
    linking_words_count = sum(1 for word in word_tokenize(text.lower()) if word in linking_words)
    return linking_words_count

sample_df['linking_words_count'] = sample_df['text'].apply(count_linking_words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import pickle

# Save the processed DataFrame sample_df and selected columns to a pickle file
selected_columns = ['punctuation_count', 'linking_words_count']

processed_data = {
    'df_p_1': sample_df[selected_columns]
}

with open('processed_data_1.pkl', 'wb') as f:
    pickle.dump(processed_data, f)

# ***Take the word count***

In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize
# df_2 = sample_df.copy()
# # Function to tokenize text and remove stopwords
# def tokenize_text(text):
#     tokens = word_tokenize(text.lower())
#     stop_words = set(stopwords.words('english'))
#     return [token for token in tokens if token.isalpha() and token not in stop_words]

# Function to count word occurrences in each text
def count_word_occurrences(text):
    tokens = word_tokenize(text.lower())
    word_counts = Counter(tokens)
    return sum(word_counts.values())

# Create a new column 'word_count' for the total word count in each text
sample_df['word_count'] = sample_df['text'].apply(count_word_occurrences)

In [None]:
import pickle

# Save the processed DataFrame sample_df and selected columns to a pickle file
selected_columns_2 = ['word_count']

processed_data = {
    'df_p_2': sample_df[selected_columns_2]
}

with open('processed_data_2.pkl', 'wb') as f:
    pickle.dump(processed_data, f)

# ***Finding the length of each text***

In [None]:
from nltk.tokenize import word_tokenize

# Function to calculate text length
def lengthText(text):
    tokens = word_tokenize(text.lower())
    return len(tokens)

# Calculate text length for each text
sample_df['length_text'] = sample_df['text'].apply(lengthText)


# ***Spell Check***

In [None]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


**'spelling_status' where 1 indicates correct spelling and 0 indicates incorrect spelling**

In [None]:
import nltk
from nltk.corpus import words

nltk.download('words')
english_words = set(words.words())

# Function to check if a word is spelled correctly
def is_spelled_correctly(word):
    return word in english_words

# Initialize a list for spelling status
spelling_status_list = []

# Loop through each row in the 'text' column
for word_to_check in sample_df['text']:
    if is_spelled_correctly(word_to_check):
        spelling_status_list.append(1)  # 1 for correct
    else:
        spelling_status_list.append(0)  # 0 for incorrect

# Add a new column 'spelling_status' to the DataFrame
sample_df['spelling_status'] = spelling_status_list

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
import pickle

# Save the processed DataFrame df_1 and selected columns to a pickle file
selected_columns_3 = ['spelling_status']

processed_data = {
    'df_p_3': sample_df[selected_columns_3]
}

with open('processed_data_3.pkl', 'wb') as f:
    pickle.dump(processed_data, f)

**Check 1. POS tagger 2. name entity, 3. parser, 4. EDM 5. slangs 6. abbreviations for whole data set**

# **Email Detection**

In [None]:
def email_detection(text):
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return re.findall(email_regex, text, re.IGNORECASE)

sample_df['emails'] = sample_df['text'].apply(email_detection)

# **Slangs Detection**

In [None]:
def slangs_detection(text):
    slang_regex = r'\b(?:lol|brb|omg)\b'
    return re.findall(slang_regex, text, flags=re.IGNORECASE)

sample_df['slangs'] = sample_df['text'].apply(slangs_detection)

In [None]:
# # Export to CSV with index included
sample_df.to_csv('formatted_dataset.csv')

In [None]:
# Create a zip file and add the CSV file to it
import zipfile
with zipfile.ZipFile('formatted_dataset.zip', 'w') as zipf:
    zipf.write('formatted_dataset.csv', arcname='formatted_dataset.csv')

In [None]:
import pickle
import gzip

# Save the processed DataFrame df_1 and selected columns to a compressed pickle file
selected_columns_all = ['text','generated','punctuation_count', 'linking_words_count', 'word_count', 'length_text', 'spelling_status','emails','slangs']

processed_data = {
    'df_p_all': sample_df[selected_columns_all]
}

with gzip.open('processed_data_4.pkl.gz', 'wb') as f:
    pickle.dump(processed_data, f)


NameError: name 'sample_df' is not defined

# **Loading sample pickle file**

In [34]:
#start from here--Namita
import pandas as pd
import pickle
import gzip

# Load the compressed pickle file
with gzip.open('processed_sample_data_4.pkl.gz', 'rb') as f:
    loaded_data = pickle.load(f)

# Extract the DataFrame from the loaded data
df_preprocessed = loaded_data['df_p_all']

# Now you can use df_p_all as your DataFrame

In [35]:
df_preprocessed

Unnamed: 0,text,generated,punctuation_count,linking_words_count,word_count,length_text,spelling_status,emails,slangs
0,Cars. Cars have been around since they became ...,0.0,72,261,654,654,0,[],[]
1,Transportation is a large necessity in most co...,0.0,59,197,521,521,0,[],[]
2,"""Americas love affair with its vehicles seems ...",0.0,93,331,835,835,0,[],[]
3,How often do you ride in a car? Do you drive a...,0.0,112,310,793,793,0,[],[]
4,Cars are a wonderful thing. They are perhaps o...,0.0,106,389,963,963,0,[],[]
...,...,...,...,...,...,...,...,...,...
15995,There is no definitive answer to this question...,1.0,19,71,169,169,0,[],[]
15996,The age-old debate about whether younger peopl...,1.0,68,283,632,632,0,[],[]
15997,It is a common belief that young people enjoy ...,1.0,40,187,417,417,0,[],[]
15998,Life is a precious gift that everyone should c...,1.0,57,211,499,499,0,[],[]


In [36]:
def extract_features(text):
    # Character count
    char_count = len(text)

    # Word count
    word_count = len(text.split())

    # Average word length
    avg_word_length = sum(len(word) for word in text.split()) / word_count if word_count > 0 else 0

    # Count of uppercase letters
    uppercase_count = sum(1 for char in text if char.isupper())

    # Count of digits
    digit_count = sum(1 for char in text if char.isdigit())

    return char_count, word_count, avg_word_length, uppercase_count, digit_count

In [37]:
# Apply feature extraction function to each text
df_preprocessed['char_count'], df_preprocessed['word_count'], df_preprocessed['avg_word_length'], df_preprocessed['uppercase_count'], df_preprocessed['digit_count'] = zip(*df_preprocessed['text'].map(extract_features))

In [38]:
# Save the DataFrame with extracted features to a new compressed pickle file
selected_columns_with_features = ['text', 'generated', 'punctuation_count', 'linking_words_count', 'word_count', 'char_count', 'avg_word_length', 'uppercase_count', 'digit_count', 'length_text', 'spelling_status', 'emails', 'slangs']
processed_data_with_features = {'df_with_features': df_preprocessed[selected_columns_with_features]}

with gzip.open('processed_data_with_features.pkl.gz', 'wb') as f:
    pickle.dump(processed_data_with_features, f)

In [39]:
# Load the compressed pickle file with the processed data and features
with gzip.open('processed_data_with_features.pkl.gz', 'rb') as f:
    loaded_data = pickle.load(f)

In [40]:
# Extract the DataFrame with features from the loaded data
df_with_features = loaded_data['df_with_features']

In [41]:
# Display the DataFrame
df_with_features.head()

Unnamed: 0,text,generated,punctuation_count,linking_words_count,word_count,char_count,avg_word_length,uppercase_count,digit_count,length_text,spelling_status,emails,slangs
0,Cars. Cars have been around since they became ...,0.0,72,261,584,3286,4.623288,59,13,654,0,[],[]
1,Transportation is a large necessity in most co...,0.0,59,197,462,2733,4.909091,46,16,521,0,[],[]
2,"""Americas love affair with its vehicles seems ...",0.0,93,331,744,4420,4.936828,62,0,835,0,[],[]
3,How often do you ride in a car? Do you drive a...,0.0,112,310,686,4001,4.830904,77,1,793,0,[],[]
4,Cars are a wonderful thing. They are perhaps o...,0.0,106,389,871,4694,4.386912,74,11,963,0,[],[]


In [42]:
df_preprocessed.dtypes

text                    object
generated              float64
punctuation_count        int64
linking_words_count      int64
word_count               int64
length_text              int64
spelling_status          int64
emails                  object
slangs                  object
char_count               int64
avg_word_length        float64
uppercase_count          int64
digit_count              int64
dtype: object

In [43]:
# Check if all entries in the 'emails' column are empty lists
all_empty_emails = all(len(x) == 0 for x in df_with_features['emails'])

print("All entries in 'emails' column are empty lists:", all_empty_emails)

All entries in 'emails' column are empty lists: True


In [44]:
# Printing unique values in the 'emails' and 'slangs' columns
unique_emails = df_with_features['emails'].apply(lambda x: tuple(x) if isinstance(x, list) else x).unique()
unique_slangs = df_with_features['slangs'].apply(lambda x: tuple(x) if isinstance(x, list) else x).unique()

print("Unique values (as tuples) in 'emails' column:", unique_emails)
print("Unique values (as tuples) in 'slangs' column:", unique_slangs)

Unique values (as tuples) in 'emails' column: [()]
Unique values (as tuples) in 'slangs' column: [() ('OMG',) ('Lol',) ('omg',) ('LOL',) ('lol',)]


In [45]:
# Correctly transform the 'slangs' column: replace non-empty lists with 1, and empty lists or non-list values with 0
df_with_features['slangs'] = df_with_features['slangs'].apply(lambda x: 1 if isinstance(x, list) and len(x) > 0 else 0)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed

# Fit and transform the text data to obtain the TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(df_with_features['text'])

# Convert the TF-IDF features to a DataFrame
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF features with the existing features DataFrame
df_with_features = pd.concat([df_with_features, tfidf_df], axis=1)

# Now df_with_features contains both the original features and the TF-IDF features

In [47]:
df_with_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Columns: 1013 entries, text to yourself
dtypes: float64(1002), int64(9), object(2)
memory usage: 123.7+ MB


In [106]:
from sklearn.model_selection import train_test_split

# Shuffle DataFrame to ensure random selection of records
df_shuffled = df_with_features.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate 100 records for unseen testing
df_unseen = df_shuffled[:100]
df_rest = df_shuffled[100:]

# Train-test split on df_rest
X = df_rest.drop(['generated', 'text', 'emails'], axis=1)
y = df_rest['generated']

In [107]:
df_unseen_target = df_unseen['generated'] # selecting target variable
df_unseen_test = df_unseen.drop(['generated', 'text', 'emails'], axis=1) # dropping email column since it is empty, and dropping text after obtaining tf-idf matrix
df_unseen_test

Unnamed: 0,punctuation_count,linking_words_count,word_count,char_count,avg_word_length,uppercase_count,digit_count,length_text,spelling_status,slangs,...,writing,wrong,year,years,yet,you,young,your,youre,yourself
0,68,238,562,3734,5.626335,83,6,624,0,0,...,0.030277,0.0,0.0,0.000000,0.000000,0.043895,0.0,0.034172,0.0,0.0
1,77,346,677,3776,4.573117,57,16,755,0,0,...,0.000000,0.0,0.0,0.000000,0.000000,0.029619,0.0,0.000000,0.0,0.0
2,20,91,187,1011,4.401070,27,2,207,0,0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
3,48,203,405,2161,4.328395,44,0,453,0,0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.032317,0.0,0.0
4,75,205,529,3513,5.629490,47,10,594,0,0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,30,139,283,1585,4.590106,29,5,313,0,0,...,0.000000,0.0,0.0,0.000000,0.000000,0.063539,0.0,0.000000,0.0,0.0
96,30,146,308,1790,4.801948,43,10,338,0,0,...,0.000000,0.0,0.0,0.000000,0.063107,0.000000,0.0,0.000000,0.0,0.0
97,35,137,299,1859,5.200669,16,0,334,0,0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
98,49,271,523,3066,4.850860,30,0,572,0,0,...,0.000000,0.0,0.0,0.026996,0.000000,0.000000,0.0,0.000000,0.0,0.0


In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [109]:
X_train.shape

(12720, 1009)

In [110]:
X_test.shape

(3180, 1009)

In [111]:
df_unseen_test.shape

(100, 1009)

In [112]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit on training set only
scaler.fit(X_train)

# Apply transform to the training set, the test set, unseen set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
df_unseen_test_scaled = scaler.transform(df_unseen_test)

In [113]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
rf_clf.fit(X_train_scaled, y_train)

# Evaluate the model on unseen data
rf_pred = rf_clf.predict(df_unseen_test_scaled)
rf_accuracy = accuracy_score(df_unseen_target, rf_pred)
print(f"Random Forest Accuracy on df unseen test: {rf_accuracy}")

# Evaluate the model on test data
rf_pred = rf_clf.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy on y test: {rf_accuracy}")

Random Forest Accuracy on df unseen test: 1.0
Random Forest Accuracy on y test: 0.989937106918239


In [114]:
# Import SVM classifier and accuracy score calculation
from sklearn.svm import SVC

# Initialize SVM classifier with 'auto' gamma
svm_clf = SVC(gamma='auto')

# Train the SVM model on scaled training data
svm_clf.fit(X_train_scaled, y_train)

# Predict using the trained SVM model on df unseen test data
svm_pred = svm_clf.predict(df_unseen_test_scaled)
svm_accuracy = accuracy_score(df_unseen_target, svm_pred)
print(f"SVM Accuracy on df unseen test: {svm_accuracy}")

# Predict using the trained SVM model on scaled test data
svm_pred = svm_clf.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"SVM Accuracy on scaled test data: {svm_accuracy}")

SVM Accuracy on df unseen test: 1.0
SVM Accuracy on scaled test data: 0.9971698113207547


In [136]:
# To prepare the data to be used as input for RNN model we need to take steps that would include tokenization and sequence padding

df_shuffled = df_preprocessed.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffling

df_unseen = df_shuffled[:100]
df_rest = df_shuffled[100:]

texts = df_rest['text'].values  # Extract text data
labels = df_rest['generated'].values  # Extract target variable

unseen_texts = df_unseen['text'].values
unseen_labels = df_unseen['generated'].values

In [124]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

max_features = 10000  # Maximum number of words in the vocabulary

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
unseen_sequences = tokenizer.texts_to_sequences(unseen_texts)

In [125]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 100  # Maximum length of a sequence
X = pad_sequences(sequences, maxlen=maxlen)
X_unseen = pad_sequences(unseen_sequences, maxlen=maxlen)

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [127]:
X_train

array([[  26, 3159,  168, ..., 4495, 1394,  197],
       [  12,  109,    2, ...,   15,    1,  170],
       [ 311,    1, 5488, ...,  266,    6,   67],
       ...,
       [1607,   60, 1176, ...,  403,  489, 3787],
       [ 810,  699,   36, ...,  200,    2,  479],
       [   9,  902,    4, ...,    6,   51,  546]], dtype=int32)

In [128]:
X_train.shape

(12720, 100)

In [130]:
X_unseen.shape

(100, 100)

In [131]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Model parameters
max_features = 10000  # Size of the vocabulary
maxlen = 100  # Maximum length of a sequence
embedding_dim = 32  # Dimensionality of the embedding layer

# Building the model
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=maxlen))
model.add(SimpleRNN(32))  # 32 units in the RNN layer
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compiling the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 32)           320000    
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 322113 (1.23 MB)
Trainable params: 322113 (1.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [135]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the model on test data
test_predictions = model.predict(X_test)
test_predicted_classes = (test_predictions > 0.5).astype(int)

# Calculate metrics for the test set
test_accuracy = accuracy_score(y_test, test_predicted_classes)
test_precision = precision_score(y_test, test_predicted_classes)
test_recall = recall_score(y_test, test_predicted_classes)
test_f1 = f1_score(y_test, test_predicted_classes)

print("Test Set Performance:")
print(f"Accuracy: {test_accuracy}")
print(f"Precision: {test_precision}")
print(f"Recall: {test_recall}")
print(f"F1 Score: {test_f1}")
print("\n")

# Evaluate the model on test data
unseen_predictions = model.predict(X_unseen)
unseen_predicted_classes = (unseen_predictions > 0.5).astype(int)

# Calculate metrics for the unseen dataset
unseen_accuracy = accuracy_score(unseen_labels, unseen_predicted_classes)
unseen_precision = precision_score(unseen_labels, unseen_predicted_classes)
unseen_recall = recall_score(unseen_labels, unseen_predicted_classes)
unseen_f1 = f1_score(unseen_labels, unseen_predicted_classes)

print("Unseen Dataset Performance:")
print(f"Accuracy: {unseen_accuracy}")
print(f"Precision: {unseen_precision}")
print(f"Recall: {unseen_recall}")
print(f"F1 Score: {unseen_f1}")

Test Set Performance:
Accuracy: 0.9795597484276729
Precision: 0.975609756097561
Recall: 0.982546864899806
F1 Score: 0.9790660225442833


Unseen Dataset Performance:
Accuracy: 0.99
Precision: 1.0
Recall: 0.98
F1 Score: 0.98989898989899
