# LLM-generated text detection

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
import keras_nlp
import re
import spacy
import time
import string


# Kaggle input files: the competition's essays and prompts, plus the
# DAIGT v2 training set.
train_essays = pd.read_csv(
    "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
)
train_prompts = pd.read_csv(
    "/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv"
)
train_daigt = pd.read_csv(
    "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
)




## Datasets 

In [None]:
# Build one shuffled training frame holding only the model input ("text")
# and the target ("generated") from both data sources.

# The DAIGT set stores its target under "label"; expose it as "generated" so
# both sources share the same target column.
train_daigt["generated"] = train_daigt["label"]
train_essays = train_essays.merge(train_prompts, on="prompt_id", how="inner")

# ignore_index avoids the duplicate row labels that stacking two
# independently indexed frames would otherwise produce.
train_essays = pd.concat([train_essays, train_daigt], ignore_index=True)

# Shuffle with a fixed seed so repeated runs see the same row order, then
# renumber the rows so the index is a clean 0..n-1 range again.
train_essays = train_essays.sample(frac=1, random_state=42).reset_index(drop=True)

train_essays = train_essays.loc[:, ["text", "generated"]]



## Preprocessing

In [None]:
# Load the English NLP model.
# The preprocessing in this section only needs tokens, lemmas and stop-word
# flags, so the dependency parser and the named-entity recogniser are
# disabled; neither feeds the lemmatizer and skipping them makes each
# nlp(text) call considerably cheaper.
# NOTE(review): re-enable them if code outside this section relies on
# doc.sents, token.dep_ or doc.ents.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Character-level translation table for str.translate():
#   * ASCII uppercase letters are folded to lowercase,
#   * a newline becomes a space, so the last word of one line is not glued
#     to the first word of the next once line breaks are gone,
#   * every ASCII punctuation character (backslash included) is deleted.
table = str.maketrans(
    string.ascii_uppercase + "\n",
    string.ascii_lowercase + " ",
    string.punctuation,
)




def preprocess_text(text: str) -> str:
    """Normalise an essay and reduce it to space-separated lemmas.

    The text is first run through the module-level ``table`` with
    ``str.translate`` (character-level clean-up) and then tokenised by the
    module-level spaCy pipeline ``nlp``. Tokens flagged as stop words are
    dropped and each remaining token is replaced by its lemma.

    Args:
        text: Raw essay text.

    Returns:
        The lemmas of the kept tokens, joined by single spaces.
    """
    # Character-level preprocessing, driven entirely by the translation table.
    cleaned = text.translate(table)

    # Tokenisation, stop-word flags and lemmas all come from spaCy.
    doc = nlp(cleaned)

    # token.is_stop reads the stop-word flag straight from the token instead
    # of looking the token text up in the vocabulary a second time.
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    return " ".join(lemmas)




# Preprocess every training essay in place and report the elapsed seconds.
# perf_counter() is a monotonic clock meant for measuring intervals, unlike
# time.time(), which follows the (adjustable) wall clock.
start_time = time.perf_counter()
train_essays.loc[:, "text"] = train_essays.loc[:, "text"].map(preprocess_text)
end_time = time.perf_counter()

print(end_time - start_time)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

# Model inputs (the preprocessed essay strings) and targets as NumPy arrays.
X = train_essays["text"].values
y = train_essays["generated"].values

# "balanced" weights are inversely proportional to class frequency, so the
# rarer class contributes as much to the loss as the more common one.
classes = np.unique(y)
class_weights = compute_class_weight("balanced", classes=classes, y=y)

# Pretrained classifier with a single output unit. The sigmoid activation
# makes that unit emit a probability, which is what BinaryCrossentropy() and
# AUC() expect with their default settings; without it the head returns raw
# logits and the loss is computed on the wrong scale.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_large_en_uncased",
    num_classes=1,
    activation="sigmoid",
)

# Freeze the BERT backbone so that only the classification head is trained.
classifier.backbone.trainable = False

classifier.compile(
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.AUC()],
    optimizer=keras.optimizers.Adam(4e-4),
    jit_compile=True,
)

history = classifier.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=10,
    # Keyed by the labels actually present instead of assuming they are
    # exactly 0 and 1; plain int/float avoids passing NumPy scalars to Keras.
    class_weight={
        int(label): float(weight)
        for label, weight in zip(classes, class_weights)
    },
)




## Evaluation and submission

In [None]:
test_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")

# The classifier was fitted on text that went through preprocess_text(), so
# the test essays must get the same treatment; feeding raw text would give
# the model inputs distributed differently from what it was trained on.
test_text = test_essays["text"].map(preprocess_text).values

# predict() returns one column per output unit; the model has a single unit.
scores = classifier.predict(test_text)[:, 0]

# NOTE(review): rounding to one decimal creates ties between predictions,
# which can lower a ranking metric such as ROC AUC - confirm it is intended.
submission_dict = pd.DataFrame(
    {
        "id": test_essays["id"],
        "generated": np.around(scores, 1),
    }
)

# submission_dict is already a DataFrame, so no further conversion is needed.
submission = submission_dict
submission.to_csv("/kaggle/working/submission.csv", index=False)