# LLM generation text detection 

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
import keras_nlp
import re
import spacy
import time
import string


train_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
train_prompts = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")
train_daigt_v2 = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")
train_daigt_external = pd.read_csv("/kaggle/input/daigt-external-train-dataset/train_external_drcat_02.csv")
train_daigt_v3_01 = pd.read_csv("/kaggle/input/daigt-v3-train-dataset/train_v3_drcat_01.csv")
train_daigt_v3_02 = pd.read_csv("/kaggle/input/daigt-v3-train-dataset/train_v3_drcat_02.csv")




Using TensorFlow backend


  train_daigt_external = pd.read_csv("/kaggle/input/daigt-external-train-dataset/train_external_drcat_02.csv")


In [None]:
train_daigt_external.columns

## Datasets 

In [3]:
train_daigt_v2["generated"] = train_daigt_v2["label"].astype(int)
train_daigt_external = train_daigt_external.dropna(subset=["label"])
train_daigt_external["generated"] = train_daigt_external["label"].astype(int)

train_daigt_v3_01["generated"] = train_daigt_v3_01["label"].astype(int)
train_daigt_v3_02["generated"] = train_daigt_v3_02["label"].astype(int)

train_essays = train_essays.merge(train_prompts, on='prompt_id', how='inner')

train_essays = pd.concat([train_essays,train_daigt_v2,train_daigt_external,train_daigt_v3_01,train_daigt_v3_02])

train_essays = train_essays.drop_duplicates(subset=['text'])
train_essays.reset_index(drop=True, inplace=True)
#train_essays.loc[:,["prompt_name","generated"]].value_counts()

train_essays = train_essays.loc[:,["text","generated"]]




train_essays = train_essays[train_essays['text'].apply(lambda x: isinstance(x, str))]

train_essays = train_essays.sample(frac = 1)




train_essays = train_essays[train_essays['text'].apply(lambda x: isinstance(x, str))]

train_essays = train_essays.sample(frac = 1)


## Preprocessing

In [4]:
from multiprocessing import Pool

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

table = str.maketrans("", "", string.punctuation)
table[10] = None #\n
table[92] = None #\


#to lower
for code in range(26):
    table[code + 65] = code +97




def preprocess_text(text):
    
    #characters level preprocessing
    #remove \n and \, remove puntuactions, to lower case
    text = text.translate(table)
    
    # Tokenization using spaCy
    doc = nlp(text)
    
    # Remove stopwords and lemmization using spaCy's built-in stopword list
    tokens = [token.lemma_ for token in doc if not nlp.vocab[token.text].is_stop]
    
    return " ".join(tokens)




start_time = time.time()
# Number of parallel processes (adjust according to your CPU cores)
num_processes = 4

# Split the DataFrame into chunks
chunks = np.array_split(train_essays.loc[:,"text"], num_processes)

# Function to apply to each chunk in parallel
def parallel_map(chunk):
    return chunk.map(lambda x : preprocess_text(x))


# Initialize a Pool for parallel processing
with Pool(num_processes) as pool:
    # Use map function to apply the parallel_map function to each chunk
    results = pool.map(parallel_map, chunks)

# Concatenate the results back into a single DataFrame
train_essays["text"] = pd.concat(results, axis=0)


#train_essays.loc[:,"text"] = train_essays.loc[:,"text"].map(lambda x : preprocess_text(x))
# Convert back to pandas DataFrame (if needed)
end_time = time.time()

print(end_time - start_time)

  return bound(*args, **kwds)


281.1146728992462


## DistilBERT

In [4]:
import tensorflow as tf

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


print(train_essays.columns)

X = train_essays["text"].values
y = train_essays["generated"].values





#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Assuming your labels are 0 and 1
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)


batch_size = 32
dropout_prob = 0.2


# Preprocessor
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset("distil_bert_base_en_uncased",
                                                            sequence_length=512,
                                                                  truncate="waterfall")


# Pretrained classifier.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    "distil_bert_base_en_uncased",
    preprocessor,
    dropout = dropout_prob,
    num_classes=1
)




# Access backbone programmatically (e.g., to change `trainable`).
classifier.backbone.trainable = True

for layer in classifier.backbone.layers[:7]:
    layer.trainable = False
    
classifier.summary()

Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/1' to your Kaggle notebook...


## Training

In [None]:
early_stopping = keras.callbacks.EarlyStopping(monitor='auc',
                                         min_delta = 0.01,
                                         patience = 1,
                                         mode = "max",
                                         restore_best_weights = True)

classifier.compile(
            loss=keras.losses.BinaryCrossentropy(from_logits=False),
            optimizer=keras.optimizers.AdamW(1e-6),
            metrics = [keras.metrics.AUC()],
            jit_compile=True,
        )

classifier.fit(
    x = X,
    y = y,
    batch_size = batch_size,
    epochs = 4,
    callbacks = [early_stopping],
    class_weight = {0:class_weights[0],1:class_weights[1]}
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 24/313 [=>............................] - ETA: 4:07 - loss: 0.2368 - auc: 0.9766

## Evaluation and submission

In [None]:
test_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
test_essays.loc[:,"text"] = test_essays.loc[:,"text"].map(lambda x : preprocess_text(x))
submission_dict  = pd.DataFrame()
submission_dict["id"] = test_essays["id"]
submission_dict["generated"] = classifier.predict(test_essays["text"])[:,0]


submission = pd.DataFrame.from_dict(submission_dict)
submission.to_csv("/kaggle/working/submission.csv",index=False)