# LLM-generated text detection

In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
import keras_nlp
import re
import spacy
import time
import string


# Competition training essays; joined to train_prompts on "prompt_id" in the next cell.
train_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
# Prompt metadata keyed by "prompt_id".
train_prompts = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")
# Additional essays from a separate Kaggle dataset (DAIGT v2, judging by the path); its "label"
# column is copied to "generated" and its rows are appended to train_essays in the next cell.
train_daigt = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")




## Datasets 

In [10]:
# Seed for the row shuffle so that a Restart & Run All reproduces the same ordering.
SHUFFLE_SEED = 42
# Row cap for quick smoke runs; set to None to keep every row.
DEBUG_SAMPLE_SIZE = 10

# The extra dataset stores its target in "label"; expose it under the competition's "generated" name.
train_daigt["generated"] = train_daigt["label"]

# Attach prompt metadata to the competition essays.
train_essays = train_essays.merge(train_prompts, on='prompt_id', how='inner')

# Stack both sources, keep only the model inputs, and shuffle the rows.
# ignore_index / reset_index give the result a unique 0..n-1 index instead of the
# duplicated labels that a plain concat of two frames would leave behind.
train_essays = (
    pd.concat([train_essays, train_daigt], ignore_index=True)
    .loc[:, ["text", "generated"]]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

if DEBUG_SAMPLE_SIZE is not None:
    # .copy() so later column assignments write to an independent frame, not a slice.
    train_essays = train_essays.iloc[:DEBUG_SAMPLE_SIZE].copy()

## Preprocessing

In [11]:
# Load spaCy's "en_core_web_sm" English pipeline. preprocess_text calls it to tokenize
# the text, look up stop words in its vocabulary and read each token's lemma.
nlp = spacy.load("en_core_web_sm")

# Character-level translation table consumed by str.translate in preprocess_text:
#   * ASCII uppercase letters map to their lowercase counterparts,
#   * every character in string.punctuation (which already includes the backslash) is deleted,
#   * a newline becomes a single space, so the words on either side of a line break
#     stay separate instead of being fused into one token.
table = str.maketrans(string.ascii_uppercase, string.ascii_lowercase, string.punctuation)
table[ord("\n")] = " "




def preprocess_text(text):
    """Normalize one essay and return its non-stop-word lemmas as a single string.

    The text is first passed through the module-level ``table`` with
    ``str.translate`` (character-level cleanup: punctuation/newline handling
    and lower-casing), then tokenized by the module-level spaCy pipeline
    ``nlp``. Tokens whose text is flagged as a stop word in ``nlp.vocab`` are
    dropped; the lemmas of the remaining tokens are joined with single spaces.

    Args:
        text: the raw essay as a ``str``.

    Returns:
        A ``str`` holding the space-separated lemmas.
    """
    # Character-level cleanup.
    cleaned = text.translate(table)

    # Tokenize with spaCy, then keep the lemma of every token that is not a stop word.
    lemmas = []
    for token in nlp(cleaned):
        if nlp.vocab[token.text].is_stop:
            continue
        lemmas.append(token.lemma_)

    return " ".join(lemmas)




# Clean every essay with preprocess_text and report the elapsed wall-clock seconds.
start_time = time.time()
cleaned_text = train_essays.loc[:, "text"].map(preprocess_text)
train_essays.loc[:, "text"] = cleaned_text
end_time = time.time()

print(end_time - start_time)

1.0530376434326172


## BERT

In [48]:

import tensorflow as tf

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def bert_kernel_initializer(stddev=0.02):
    """Build a truncated-normal Keras weight initializer.

    Args:
        stddev: standard deviation passed to ``TruncatedNormal`` (defaults to 0.02).

    Returns:
        A ``keras.initializers.TruncatedNormal`` instance.
    """
    initializer = keras.initializers.TruncatedNormal(stddev=stddev)
    return initializer

# Cleaned essay texts and their binary labels as NumPy arrays.
X = train_essays["text"].values
y = train_essays["generated"].values

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# "balanced" per-class weights for the training labels (assumes labels are 0 and 1).
# NOTE: computed for reference; the training cell does not currently pass them to model.fit.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 100
hidden_size = 1024
dropout_prob = 0.2

# Raw (text, label) pairs, batched so the preprocessor tokenizes a whole batch per call.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE)

# Preprocessor
preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_large_en_uncased")

# Tokenize once and cache the result. Shuffling must come AFTER cache(): a shuffle placed
# before cache() has its first-epoch order frozen in the cache and replayed every epoch.
# The cached batches are therefore unbatched, shuffled per example, and re-batched.
train_preprocessed = (
    train_dataset.map(preprocessor, tf.data.AUTOTUNE)
    .cache()
    .unbatch()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# The validation data keeps its order; no shuffling is needed.
test_preprocessed = (
    test_dataset.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)
)

# Backbone
backbone = keras_nlp.models.BertBackbone.from_preset("bert_large_en_uncased")

backbone.summary()

# Show the structure (shapes and dtypes) of a preprocessed batch without iterating
# over, and printing, the whole dataset.
print(train_preprocessed.element_spec)

# Freeze the pretrained encoder: only the dense head defined below is trained.
backbone.trainable = False
inputs = backbone.input
pooled = backbone(inputs)["pooled_output"]

# Classification head on top of the pooled output.
x = keras.layers.Dense(hidden_size)(pooled)
x = keras.layers.Dense(hidden_size, activation='relu')(x)
x = keras.layers.Dropout(dropout_prob)(x)
x = keras.layers.Dense(hidden_size, activation='relu')(x)
x = keras.layers.Dense(hidden_size, activation='relu')(x)
x = keras.layers.Dropout(dropout_prob)(x)
x = keras.layers.Dense(hidden_size // 8, activation='relu')(x)

# Single sigmoid unit, hence from_logits=False in the loss below.
outputs = keras.layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs, outputs)

model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=keras.optimizers.AdamW(5e-5),
    metrics=[keras.metrics.AUC()],
    jit_compile=True,
)

Attaching 'tokenizer.json' from model 'keras/bert/keras/bert_large_en_uncased/1' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/bert/keras/bert_large_en_uncased/1' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/bert/keras/bert_large_en_uncased/1' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/bert/keras/bert_large_en_uncased/1' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/bert/keras/bert_large_en_uncased/1' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/bert/keras/bert_large_en_uncased/1' to your Kaggle notebook...


Model: "bert_backbone"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 token_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 token_embedding (Reversibl  (None, None, 1024)           3125452   ['token_ids[0][0]']           
 eEmbedding)                                              8                                       
                                                                                                  
 segment_ids (InputLayer)    [(None, None)]               0         []                            
                                                                                                  
 position_embedding (Positi  (None, None, 1024)           524288    ['token_embedding[

## Training

In [49]:

# Train the classifier and score the held-out split after every epoch, so that
# over-fitting shows up in the logs instead of going unnoticed.
# NOTE: the balanced class weights computed earlier are not applied here; to use them pass
# class_weight={0: class_weights[0], 1: class_weights[1]} (requires both classes in y_train).
history = model.fit(
    train_preprocessed,
    validation_data=test_preprocessed,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d72e81bf3d0>

## Evaluation and submission

In [None]:
# Load the competition test essays and clean them exactly like the training text.
test_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
test_essays.loc[:, "text"] = test_essays.loc[:, "text"].map(preprocess_text)

# The model consumes BertPreprocessor outputs, not raw strings, so the test text goes
# through the same batch -> preprocessor pipeline that was used for training.
submission_inputs = (
    tf.data.Dataset.from_tensor_slices(test_essays["text"].values)
    .batch(BATCH_SIZE)
    .map(preprocessor, tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

# `model` is the classifier compiled above; predict returns one sigmoid score per essay
# in a (n, 1) array, so column 0 holds the probabilities.
probabilities = model.predict(submission_inputs)[:, 0]

# Scores are rounded to one decimal place, as before.
submission = pd.DataFrame(
    {
        "id": test_essays["id"],
        "generated": np.around(probabilities, 1),
    }
)
submission.to_csv("/kaggle/working/submission.csv", index=False)