# LLM generation text detection

In [2]:
!pip install transformers
!pip install tensorflow_text




In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

train_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
train_prompts = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")
train_daigt = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")

train_essays.head()
#train_prompts.head()

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


## Datasets aligment

In [4]:
train_daigt["generated"] = train_daigt["label"]
train_essays = train_essays.merge(train_prompts, on='prompt_id', how='inner')

train_essays = pd.concat([train_essays,train_daigt])

train_essays.loc[:,["prompt_name","generated"]].value_counts()

prompt_name                            generated
Seeking multiple opinions              1            3624
Distance learning                      1            3397
Does the electoral college work?       0            3382
Car-free cities                        0            3373
Facial action coding system            0            2167
Distance learning                      0            2157
Car-free cities                        1            2052
Driverless cars                        0            1886
Exploring Venus                        0            1862
Summer projects                        0            1750
Does the electoral college work?       1            1722
Mandatory extracurricular activities   0            1670
Cell phones at school                  0            1656
Grades for extracurricular activities  0            1626
The Face on Mars                       0            1583
Seeking multiple opinions              0            1552
Community service                      

## Text preprocessing

In [5]:
import re
import spacy


# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Convert to lowercase  
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Tokenization using spaCy
    doc = nlp(text)
    tokens = [token.text for token in doc]
    
    # Remove stopwords using spaCy's built-in stopword list
    tokens = [token for token in tokens if not nlp.vocab[token].is_stop]
    
    # Lemmatization
    tokens = [token.lemma_ for token in nlp(" ".join(tokens))]
    
    # Join tokens back into a string
    processed_text = ' '.join(tokens)
    
    return processed_text

#train_essays["bert_input"] = train_essays["text"].map(lambda x : preprocess_text(x))



## Preprocessing and Tokenization

In [8]:
from transformers import BertTokenizer
from keras_nlp.models import BertPreprocessor
import keras

class BertCustomPreprocessor(BertPreprocessor):
    def __init__(self, sequence_length, tokenizer, preprocessing, truncate, df, **kwargs):
        super().__init__(tokenizer, sequence_length, truncate)
        self.preprocessing = preprocessing
        self.df = df

    def to_bert_input(self, batch):
        batch_prep = []
        for b in batch:
            if self.df["bert_input"][b] == "":
                self.df["bert_input"][b] = self.preprocessing(self.df["text"][b])
            batch_prep.append(self.df["bert_input"][b])
        return
                

    def __call__(self, x, y=None, sample_weight=None):
        x["bert_input"] = x.apply(lambda x: self.to_bert_input(x), axis=1)
        x.bert_input = x.bert_input.astype(str)
        return super(BertCustomPreprocessor,self).__call__(x["bert_input"].tolist(), y, sample_weight)

class PreprocessCallback(keras.callbacks.Callback):
    
    def __init__(self, preprocessing, sequence_length, truncate):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.truncate = truncate
        
    def on_epoch_begin(self, epoch, logs=None):
        # Check and preprocess the training data
        X_train["input"] = X_train.apply(lambda row : to_model_input(row))
        
    def to_model_input(row):
        
        if row["input"] == None:
            return self.preprocess_and_tokenize(row["text"])
        else:
            return row["input"]

    def preprocess_and_tokenize(self, text):
        import pdb;pdb.set_trace()
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=self.truncation_strategy
        )





## BERT

In [6]:
### grid_search

batch_size = [32,64]
learning_rate = [1e-4,1e-5]
trainable_backbone = [True,False]

In [None]:
import keras
import keras_nlp
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

X = train_essays["text"].values
y = train_essays["generated"].values
import pdb;pdb.set_trace()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Assuming your labels are 0 and 1
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)


# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_large_en_uncased",
    num_classes=2,
    preprocessor=None,
    
)
# Access backbone programmatically (e.g., to change `trainable`).
classifier.backbone.trainable = False

classifier.compile(
    loss = keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(5e-5),
    jit_compile=True,
)

history = classifier.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=10,  # Set the number of epochs as required
    class_weight=class_weights
)


--Return--
None
> [0;32m/tmp/ipykernel_42/1569331090.py[0m(9)[0;36m<module>[0;34m()[0m
[0;32m      7 [0;31m[0mX[0m [0;34m=[0m [0mtrain_essays[0m[0;34m[[0m[0;34m"text"[0m[0;34m][0m[0;34m.[0m[0mvalues[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m[0my[0m [0;34m=[0m [0mtrain_essays[0m[0;34m[[0m[0;34m"generated"[0m[0;34m][0m[0;34m.[0m[0mvalues[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 9 [0;31m[0;32mimport[0m [0mpdb[0m[0;34m;[0m[0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     10 [0;31m[0;34m[0m[0m
[0m[0;32m     11 [0;31m[0mX_train[0m[0;34m,[0m [0mX_val[0m[0;34m,[0m [0my_train[0m[0;34m,[0m [0my_val[0m [0;34m=[0m [0mtrain_test_split[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0mtest_size[0m[0;34m=[0m[0;36m0.2[0m[0;34m,[0m [0mstratify[0m[0;34m=[0m[0my[0m[0;34m,[0m [0mrandom_state[0m[0;34m=[0m[0;36m42[0m[0;34m)[0m[0

ipdb>  type(X)


<class 'numpy.ndarray'>


ipdb>  X.shape


(46246,)


ipdb>  X[0]


'Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major role in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do.\n\nIn like matter of this, article, "In German Suburb, Life Goes On Without Cars," by Elizabeth Rosenthal states, how automobiles are the linchpin of suburbs, where middle class families from either Shanghai or Chicago tend to make their homes. Experts say how this is a huge impediment to current efforts to reduce greenhouse gas emissions from tailpipe. Passenger cars are responsible for 12 percent of greenhouse gas emissions in Europe...and up to 50 percent in some carintensive areas in the United States. Cars are the main reason for the greenhouse gas emissions because of a lot of people driving them around all the time getting where they need to go. Article

ipdb>  y.shape


(46246,)


ipdb>  y[0[


*** SyntaxError: '[' was never closed


ipdb>  y[0]


0


ipdb>  y[1]


0


ipdb>  np.where(y == 1)


(array([  648,  1039,  1057, ..., 46243, 46244, 46245]),)


ipdb>  y[648]


1
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user


In [None]:
## Evaluation and submission

In [None]:
test_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
submission_dict = {"id":[],"generated":[]}

for _,row in test_essays.iterrows():
    submission_dict["id"].append(row["id"])
    submission_dict["generated"].append(classifier.predict(row["text"])[1])

submission = pd.DataFrame.from_dict(submission_dict)
submission.to_csv("/kaggle/working/submission.csv")

    
