# LLM generation text detection

In [1]:
!pip install transformers
!pip install tensorflow_text





In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

train_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
train_prompts = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")
train_daigt = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")


#train_prompts.head()

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Datasets aligment

In [3]:
train_daigt["generated"] = train_daigt["label"]
train_essays = train_essays.merge(train_prompts, on='prompt_id', how='inner')

train_essays = pd.concat([train_essays,train_daigt])

#train_essays.loc[:,["prompt_name","generated"]].value_counts()
train_essays = train_essays.sample(frac = 1)

train_essays = train_essays.loc[:,["text","generated"]]


## Preprocessing

In [4]:
import re
import spacy
import time
import string


# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

table = str.maketrans("", "", string.punctuation)
table[10] = None #\n
table[92] = None #\


#to lower
for code in range(26):
    table[code + 65] = code +97




def preprocess_text(text):
    
    #characters level preprocessing
    #remove \n and \, remove puntuactions, to lower case
    text = text.translate(table)
    
    # Tokenization using spaCy
    doc = nlp(text)
    
    # Remove stopwords and lemmization using spaCy's built-in stopword list
    tokens = [token.lemma_ for token in doc if not nlp.vocab[token.text].is_stop]
    
    return " ".join(tokens)




start_time = time.time()
train_essays.loc[:,"text"] = train_essays.loc[:,"text"].map(lambda x : preprocess_text(x))
# Convert back to pandas DataFrame (if needed)
end_time = time.time()

print(end_time - start_time)



2589.328759431839


## BERT

In [5]:
import keras
import keras_nlp
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

X = train_essays["text"].values
y = train_essays["generated"].values


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Assuming your labels are 0 and 1
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)


preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_large_en_uncased",
)

# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_large_en_uncased",
    num_classes = 1,
    preprocessor = preprocessor,
    
)
# Access backbone programmatically (e.g., to change `trainable`).
classifier.backbone.trainable = False

classifier.compile(
    loss = keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer = keras.optimizers.Adam(5e-5),
    jit_compile = True,
)

history = classifier.fit(
    x = X_train,
    y = y_train,
    validation_data = (X_val, y_val),
    batch_size = 64,
    epochs = 10,  # Set the number of epochs as required
    class_weight = {0:class_weights[0],1:class_weights[1]}
)


Using TensorFlow backend
Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_large_en_uncased/v1/vocab.txt
Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_large_en_uncased/v1/model.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluation and submission

In [6]:
test_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
submission_dict  = pd.DataFrame()

submission_dict["id"] = test_essays["id"]
submission_dict["generated"] = classifier.predict(test_essays["text"])[:,0]

submission = pd.DataFrame.from_dict(submission_dict)
submission.to_csv("/kaggle/working/submission.csv")

    


