## Data download with the Kaggle API


In [None]:
# Upload the kaggle.json API token downloaded from the Kaggle site (needed for API access).
from google.colab import files

# files.upload() returns a {filename: file bytes} dict. Binding it to a name keeps
# it from being echoed as the cell's result, which would write the API key into
# the notebook's saved output.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

In [None]:
# giving permissions to download and unzip the dataset
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# download the dataset
!kaggle datasets download -d dillonwongso/ai-generated-vs-human-text-cleaned

In [None]:
!unzip ai-generated-vs-human-text-cleaned.zip


### Install the libraries

In [None]:
# Installing libraries needed
!pip install transformers


## model development

### import of libraries


In [None]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizerFast
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

### load dataset

In [None]:
# Read the extracted CSV into a DataFrame and preview its first rows.
csv_path = "preprocessed-50k.csv"
data = pd.read_csv(csv_path)

preview = data.head()
print(preview)

In [None]:
# Encode the string labels as integers: human -> 1, ai -> 0.
# Guarded so the cell is safe to re-run: mapping an already-encoded column a
# second time finds no matching keys and would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data['source']):
    data['source'] = data['source'].map({'human': 1, 'ai': 0})

In [None]:
# Pull the model inputs (text column) and targets (encoded source column)
# out of the DataFrame as array values.
texts, labels = data['text'].values, data['source'].values

### initialize tokenizer

In [None]:
import os

# Set TOKENIZERS_PARALLELISM in the process environment before the tokenizer
# is created and used in the cells below.
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

In [None]:
# Load the pretrained tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
)

def tokenize(texts, tokenizer, batch_size=10000, max_length=256):
    """Tokenize ``texts`` in consecutive batches and join the results.

    Args:
        texts: Sliceable sequence of strings to encode.
        tokenizer: Callable tokenizer; invoked once per batch with
            ``max_length``, ``truncation=True``, ``padding='max_length'`` and
            ``return_tensors="tf"``.
        batch_size: Number of texts handed to the tokenizer per call.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"``, each the per-batch
        tokenizer outputs concatenated along axis 0.
    """
    total = len(texts)
    print(f"Total texts: {total}")

    id_chunks, mask_chunks = [], []

    # Walk the batch start offsets; enumerate supplies the 1-based batch number.
    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        print(f"Processing batch {batch_no}")
        encoded = tokenizer(
            list(texts[start:start + batch_size]),
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors="tf"
        )
        id_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])

    # Stitch the per-batch tensors back together in their original order.
    input_ids = tf.concat(id_chunks, axis=0)
    attention_mask = tf.concat(mask_chunks, axis=0)
    return {"input_ids": input_ids, "attention_mask": attention_mask}


### splitting the data

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

### model development and compiling

In [None]:
# Tokenize each split with the shared tokenizer (default batch size and length).
train_encodings = tokenize(texts=train_texts, tokenizer=tokenizer)
test_encodings = tokenize(texts=test_texts, tokenizer=tokenizer)


In [None]:
# Convert the training encodings to NumPy arrays and show how many rows each holds.
# np.asarray accepts both TensorFlow eager tensors and existing ndarrays, so the
# cell can be re-run; calling .numpy() on the already-converted arrays would
# raise AttributeError.
train_encodings = {key: np.asarray(value) for key, value in train_encodings.items()}
print({key: len(value) for key, value in train_encodings.items()})


In [None]:
# Training pipeline: (features dict, label) pairs, shuffled with a buffer as
# large as the training set, then grouped into batches of 16.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)
train_dataset = train_dataset.shuffle(len(train_labels))
train_dataset = train_dataset.batch(16)

# Test pipeline: same pairing and batch size, original order kept (no shuffle).
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings), test_labels)
)
test_dataset = test_dataset.batch(16)

In [None]:
# Optimizer and loss. from_logits=True tells the loss that it is given raw
# logits rather than probabilities.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Pretrained BERT with a two-label sequence-classification head.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)


### model training

In [None]:
# Fine-tune for 4 epochs; the test dataset is also used as the validation data
# reported after each epoch.
history = model.fit(train_dataset, epochs=4, validation_data=test_dataset)

### evaluation and predictions

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
eval_results = model.evaluate(test_dataset)
test_loss, test_accuracy = eval_results
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Run inference on the test batches and keep the raw logits.
prediction_output = model.predict(test_dataset)
predictions = prediction_output.logits

# The predicted class is the index of the largest logit in each row.
predicted_classes = np.argmax(predictions, axis=1)

report = classification_report(test_labels, predicted_classes)
print(report)

### saving the model

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
save_dir = "ai_human_classifier"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
!zip -r ai_human_classifier.zip ai_human_classifier

In [None]:
from google.colab import files

# Send the zipped model from the Colab runtime to the local machine.
archive_name = "ai_human_classifier.zip"
files.download(archive_name)