In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from nltk.corpus import stopwords
import spacy
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
import os
import shutil

In [2]:
df=pd.read_csv(r"C:\Users\dkdes\OneDrive\Desktop\kaggle_datasets\large AI_Human Text.csv")
df.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0


In [3]:
df1=df.sample(1000,random_state=2)

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 11006 to 314444
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       1000 non-null   object
 1   generated  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB


In [5]:
df1["generated"].value_counts()

0    633
1    367
Name: generated, dtype: int64

In [6]:
df1["generated"].value_counts()[0]/df1["generated"].value_counts().sum()

0.633

In [7]:
df1["generated"].value_counts()[1]/df1["generated"].value_counts().sum()

0.367

In [8]:
df["generated"].value_counts()

0    305797
1    181438
Name: generated, dtype: int64

## Text Preprocessing

In [9]:
df1["text"].str.lower()

11006     nasa noticed something unfimiliar on the red p...
486485    to many people it was confusing, but to me, it...
470298    advantages of limiting car usage\n\nlimiting c...
19968     \n\nworking with a group can be menacing and r...
10207     recently we have discovered a new landform on ...
                                ...                        
91063     dear state senator,\n\ni think that changing t...
278866    dear, teacher_name\n\ni am aware of the your d...
435779    there is a debate whether electoral cortege sh...
265478    the open seas await\n\nwhen i first joined the...
314444    have you even looked at someone and thought, i...
Name: text, Length: 1000, dtype: object

In [10]:
df1["text"]=df1["text"].str.lower()

In [11]:
df1.head(10)

Unnamed: 0,text,generated
11006,nasa noticed something unfimiliar on the red p...,0
486485,"to many people it was confusing, but to me, it...",0
470298,advantages of limiting car usage\n\nlimiting c...,1
19968,\n\nworking with a group can be menacing and r...,1
10207,recently we have discovered a new landform on ...,0
367556,"title:""become a seagoing cowboy of a lifetime""...",0
478101,"dear [state senator],\n\ni am writing to expr...",1
484783,x am luke merger and x have been to unique pla...,0
454480,students learn better if they are interested i...,0
463534,there are a few reason why younger people migh...,1


In [12]:
# function for removing html tags if any 
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [13]:
df1['text'] = df1['text'].apply(remove_html_tags)

In [14]:
df1.head()

Unnamed: 0,text,generated
11006,nasa noticed something unfimiliar on the red p...,0
486485,"to many people it was confusing, but to me, it...",0
470298,advantages of limiting car usage\n\nlimiting c...,1
19968,\n\nworking with a group can be menacing and r...,1
10207,recently we have discovered a new landform on ...,0


In [15]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [16]:
df1['text'] = df1['text'].apply(remove_url)

In [17]:
df1.head()

Unnamed: 0,text,generated
11006,nasa noticed something unfimiliar on the red p...,0
486485,"to many people it was confusing, but to me, it...",0
470298,advantages of limiting car usage\n\nlimiting c...,1
19968,\n\nworking with a group can be menacing and r...,1
10207,recently we have discovered a new landform on ...,0


In [18]:
string.punctuation
exclude = string.punctuation

In [19]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [20]:
df1['text'] = df1['text'].apply(remove_punc1)

In [21]:
df1.loc[df1["text"].str.contains("\n\n")]

Unnamed: 0,text,generated
11006,nasa noticed something unfimiliar on the red p...,0
486485,to many people it was confusing but to me it w...,0
470298,advantages of limiting car usage\n\nlimiting c...,1
19968,\n\nworking with a group can be menacing and r...,1
10207,recently we have discovered a new landform on ...,0
...,...,...
91063,dear state senator\n\ni think that changing to...,0
278866,dear teachername\n\ni am aware of the your dil...,0
435779,there is a debate whether electoral cortege sh...,0
265478,the open seas await\n\nwhen i first joined the...,1


In [22]:
df1['text']=df1["text"].str.replace("\n\n","")

In [23]:
df1.loc[df1["text"].str.contains("\n\n")]

Unnamed: 0,text,generated


In [24]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek You (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}


In [25]:
def chat_conversion(text):
    
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [26]:
chat_conversion('IMHO he is the best')

'In My Honest/Humble Opinion he is the best'

In [27]:
df1['text'] = df1['text'].apply(chat_conversion)

In [28]:
df1.head()

Unnamed: 0,text,generated
11006,nasa noticed something unfimiliar on the red p...,0
486485,to many people it was confusing but to me it w...,0
470298,advantages of limiting car usagelimiting car u...,1
19968,working with a group can be menacing and rewar...,1
10207,recently we have discovered a new landform on ...,0


In [29]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [30]:
df1['text'] = df1['text'].apply(remove_stopwords)

In [31]:
nlp = spacy.load('en_core_web_sm')

In [32]:
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [33]:
df1['text'] = df1['text'].apply(tokenize_text)

In [34]:



nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    if isinstance(text, str):
        doc = nlp(text)
        return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
    else:
        return text  # Return the text as-is if it's not a string

# Create a sample pandas DataFrame
data = {'text_column': ["This is a sample text.", "Here is another sentence.", 12345, None]}
df = pd.DataFrame(data)

# Convert all values in the DataFrame column to strings
df1['text'] = df1['text'].astype(str)

# Apply the lemmatization function to the DataFrame column
df1['text'] = df1['text'].apply(lemmatize_text)

# Print the DataFrame
print(df1.head())



                                                     text  generated
11006   nasa notice something unfimiliar red planet ca...          0
486485  many people confuse life every night would com...          0
470298  advantage limit car usagelimite car usage brin...          1
19968   work group menacing rewarding equal measure on...          1
10207   recently discover new landform mar vike 1 catc...          0


In [35]:
X=df1["text"]
y=df1["generated"]

In [36]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [37]:
# Initialize the TfidfVectorizer
tfv = TfidfVectorizer()

# Fit the TfidfVectorizer only on the training data
tfv.fit(X_train)

# Transform the training and test data
X_train_tfv = tfv.transform(X_train)
X_test_tfv = tfv.transform(X_test)

# Print the shape of the transformed data
print(X_train_tfv.shape)
print(X_test_tfv.shape)

(800, 11771)
(200, 11771)


In [38]:
def model_name(a):
    model=a
    model.fit(X_train_tfv, y_train)
    y_pred=model.predict(X_test_tfv)
    accuracy=accuracy_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    print("accuracy:",accuracy)
    print("f1_score:",f1)
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print("model on training data",np.mean(cross_val_score(model,X_train_tfv,y_train,cv=5,scoring="accuracy")))
    print("model on testing data",np.mean(cross_val_score(model,X_test_tfv,y_test,cv=5,scoring="accuracy")))

In [39]:
model_name(RandomForestClassifier(n_estimators=100))

accuracy: 0.935
f1_score: 0.8849557522123894
[[137   1]
 [ 12  50]]
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       138
           1       0.98      0.81      0.88        62

    accuracy                           0.94       200
   macro avg       0.95      0.90      0.92       200
weighted avg       0.94      0.94      0.93       200

model on training data 0.9125
model on testing data 0.8400000000000001


In [40]:
model_name(LogisticRegression())

accuracy: 0.93
f1_score: 0.8727272727272727
[[138   0]
 [ 14  48]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       138
           1       1.00      0.77      0.87        62

    accuracy                           0.93       200
   macro avg       0.95      0.89      0.91       200
weighted avg       0.94      0.93      0.93       200

model on training data 0.89375
model on testing data 0.7249999999999999


In [41]:
model_name(SVC())

accuracy: 0.955
f1_score: 0.9217391304347826
[[138   0]
 [  9  53]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       138
           1       1.00      0.85      0.92        62

    accuracy                           0.95       200
   macro avg       0.97      0.93      0.95       200
weighted avg       0.96      0.95      0.95       200

model on training data 0.9112500000000001
model on testing data 0.76


In [42]:
model_name(KNeighborsClassifier())

accuracy: 0.88
f1_score: 0.7966101694915254
[[129   9]
 [ 15  47]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       138
           1       0.84      0.76      0.80        62

    accuracy                           0.88       200
   macro avg       0.87      0.85      0.86       200
weighted avg       0.88      0.88      0.88       200

model on training data 0.85
model on testing data 0.8399999999999999


## Training the model with help of Deep Learning

In [43]:
model=Sequential()
model.add(Dense(32,activation='relu',input_dim=11771))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))

In [44]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                376704    
                                                                 
 batch_normalization (Batch  (None, 32)                128       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 batch_normalization_1 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dropout_1 (Dropout)         (None, 32)                0

In [45]:
model.compile(optimizer="adam",metrics=["accuracy"],loss="binary_crossentropy")

In [46]:
callbacks=EarlyStopping(monitor="val_loss",patience=5)

In [47]:

# Convert the sparse matrices to dense arrays
X_train_tfv_dense = X_train_tfv.toarray()
X_test_tfv_dense = X_test_tfv.toarray()

# Fit the model with validation_split
history = model.fit(X_train_tfv_dense, y_train, validation_split=0.2, epochs=100, batch_size=52,callbacks=callbacks)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


In [48]:
tuner_dir = 'my_dir'
if os.path.exists(tuner_dir):
    shutil.rmtree(tuner_dir)

def build_model(hp):
    model = Sequential()
    num_layers = hp.Int("num_layers", min_value=1, max_value=10)
    
    for i in range(num_layers):
        if i == 0:
            model.add(Dense(
                units=hp.Int(f"units_{i}", min_value=16, max_value=128, step=8),
                activation=hp.Choice(f"activation_{i}", values=["relu", "tanh", "sigmoid"]),
                input_dim=11771
            ))
        else:
            model.add(Dense(
                units=hp.Int(f"units_{i}", min_value=16, max_value=128, step=8),
                activation=hp.Choice(f"activation_{i}", values=["relu", "tanh", "sigmoid"])
            ))
        model.add(BatchNormalization())
        model.add(Dropout(rate=hp.Float(f"dropout_{i}", min_value=0.0, max_value=0.9, step=0.1)))
    
    model.add(Dense(1, activation='sigmoid'))  
    model.compile(
        optimizer=hp.Choice("optimizer", values=["adam", "rmsprop", "sgd", "nadam", "adadelta"]),
        loss="binary_crossentropy",  
        metrics=["accuracy"]
    )
    return model


In [49]:
tuner = kt.RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=3,
    directory=tuner_dir,
    project_name='my_project'
)

In [50]:
tuner.search(X_train_tfv_dense,y_train,epochs=5,validation_data=(X_test_tfv_dense,y_test))

Trial 3 Complete [00h 00m 06s]
val_accuracy: 0.3100000023841858

Best val_accuracy So Far: 0.6899999976158142
Total elapsed time: 00h 00m 20s


In [51]:
tuner.get_best_hyperparameters()[0].values

{'num_layers': 7,
 'units_0': 40,
 'activation_0': 'relu',
 'dropout_0': 0.30000000000000004,
 'optimizer': 'nadam',
 'units_1': 16,
 'activation_1': 'relu',
 'dropout_1': 0.0,
 'units_2': 16,
 'activation_2': 'relu',
 'dropout_2': 0.0,
 'units_3': 16,
 'activation_3': 'relu',
 'dropout_3': 0.0,
 'units_4': 16,
 'activation_4': 'relu',
 'dropout_4': 0.0,
 'units_5': 16,
 'activation_5': 'relu',
 'dropout_5': 0.0,
 'units_6': 16,
 'activation_6': 'relu',
 'dropout_6': 0.0}

In [52]:
model.fit(X_train_tfv_dense,y_train,epochs=100,validation_split=0.2,callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


<keras.src.callbacks.History at 0x2178cb6a920>

In [53]:
y_pred_prob = model.predict(X_test_tfv)
y_pred = (y_pred_prob > 0.5).astype(int)

# Classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       138
           1       1.00      0.90      0.95        62

    accuracy                           0.97       200
   macro avg       0.98      0.95      0.96       200
weighted avg       0.97      0.97      0.97       200



In [55]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [79]:
tokenizer.fit_on_texts(X_train)  # X_train should be a list of text data

# Get vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token

print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 11829


In [91]:
# Tokenization
tokenizer = Tokenizer(num_words=11830, oov_token="<OOV>")  # Keep 5000 most frequent words
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding (to ensure uniform input size)
max_length = 100  # Max words per sentence
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding="post", truncating="post")

In [102]:
# Define LSTM model
model = Sequential([
    Embedding(input_dim=11830, output_dim=128, input_length=max_length),  # Word embedding
    LSTM(64, return_sequences=True),  # First LSTM layer (can add more layers if needed)
    Dropout(0.3),
    LSTM(32,return_sequences=True),  # Second LSTM layer
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid")  # Output layer (binary classification)
])

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Model summary
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 100, 128)          1514240   
                                                                 
 lstm_32 (LSTM)              (None, 100, 64)           49408     
                                                                 
 dropout_32 (Dropout)        (None, 100, 64)           0         
                                                                 
 lstm_33 (LSTM)              (None, 100, 32)           12416     
                                                                 
 dropout_33 (Dropout)        (None, 100, 32)           0         
                                                                 
 lstm_34 (LSTM)              (None, 32)                8320      
                                                                 
 dropout_34 (Dropout)        (None, 32)               

In [106]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=100, batch_size=32, validation_data=(X_test_pad, y_test),callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [107]:
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

# Classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93       138
           1       0.84      0.87      0.86        62

    accuracy                           0.91       200
   macro avg       0.89      0.90      0.90       200
weighted avg       0.91      0.91      0.91       200

