In [1]:
!pip install --upgrade keras-nlp

Collecting keras-nlp
  Downloading keras_nlp-0.9.3-py3-none-any.whl (508 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m508.4/508.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras-nlp)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-text (from keras-nlp)
  Downloading tensorflow_text-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting namex (from keras-core->keras-nlp)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Collecting tensorflow<2.17,>=2.16.1 (from tensorflow-text->keras-nlp)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Import statements
import pandas as pd
import tensorflow as tf
import keras_nlp
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 200)

### Read Data 
This notebook was executed in Colab. Ensure that you upload the data under sample_data before running the below cells.

In [5]:

X_train = pd.read_csv("/content/sample_data/X_train_essays.csv")
Y_train = pd.read_csv("/content/sample_data/Y_train_essays.csv")
data = pd.concat([X_train, Y_train], axis=1)

In [6]:
#Drop Duplicates and get rid of NA values
data = data.drop_duplicates()
data = data.dropna()
data = data.sample(frac = 0.4) # we only use a 40% of the data for training to save time
data.groupby('generated').count()

Unnamed: 0_level_0,Unnamed: 0,text,Unnamed: 0
generated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11457,11457,11457
1,41588,41588,41588


### Training Data Split to check validation

In [7]:

x_train, x_test, y_train, y_test = train_test_split(data['text'], data['generated'], test_size=0.1)

### Model Training

In [9]:

# Embedding Layer
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_small_en_uncased",
    sequence_length=512)
# Classifier Layer
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_small_en_uncased",
    num_classes=2,
    preprocessor=preprocessor)
#  Compiling the model 
classifier.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(5e-5),
    jit_compile=True)
# Training using 1 epoch to save compute resources 
classifier.fit(x=x_train, y=y_train, batch_size=16, epochs=1, validation_data=(x_test, y_test))

Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_small_en_uncased/2/download/tokenizer.json...
100%|██████████| 547/547 [00:00<00:00, 1.46MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_small_en_uncased/2/download/assets/tokenizer/vocabulary.txt...
100%|██████████| 226k/226k [00:00<00:00, 1.05MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_small_en_uncased/2/download/config.json...
100%|██████████| 508/508 [00:00<00:00, 1.28MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_small_en_uncased/2/download/model.weights.h5...
100%|██████████| 110M/110M [00:04<00:00, 27.1MB/s]


[1m2984/2984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m857s[0m 266ms/step - loss: 0.4415 - sparse_categorical_accuracy: 0.7983 - val_loss: 0.2792 - val_sparse_categorical_accuracy: 0.8745


<keras.src.callbacks.history.History at 0x798170466800>

In [10]:
x_test = pd.read_csv("/content/sample_data/X_test_essays.csv")
x_test

Unnamed: 0.1,Unnamed: 0,text
0,15977,dear state senator presently electoral college determines election president united states system controversial ultimately believe fair representation people letter argue electoral college essenti...
1,77973,scientist nasa discusesing face someone thinks landform would present argument convince reality face mars merely natural landform alien face evidence points claim introduction 1976 nasa viking mis...
2,48941,venus hostile would hitty study trip mars mars orbit well neurovefrence humans entering new visitors venus impossible periditanlly conditions could cold cold zone happened common ended dozen natio...
3,36935,seagoing cowboys great opportunity travel help people go trips also fun helping countries food water clothing see kinds animals like whales fish done trip get nice things souvenirs like jewelry ma...
4,123863,dear principal think change grade policy b average less students playing sports participating school activities reason say many students averages like school activity make average kids play sports...
...,...,...
33149,94193,sitting car go somewhere question going answer text car moving cellphones useful used appropriate times cases even seen certain activities like driving school work let phones control us rather con...
33150,150248,believe people trust instincts follow beliefs sets apart others important confidence oneself make decisions based one thinks right rather swayed opinions others fear judgment firstly important fai...
33151,60292,facial action coding system enables computer identify human emotion think good idea tell someone upset intall technology see see sudents safe ever shows signs human emotion would tell someone waaa...
33152,152534,ask right people tell quote book called giver talking asking many different people advice heapful sometimes good ask one person everyone opinion things could help make choice easier asking people ...


# Evaluation

In [12]:
import numpy as np
predictions = classifier.predict(x_test["text"].to_numpy())
values = tf.nn.softmax(predictions)[:, 1].numpy()
predictions = np.where(values > 0.5, 1, 0)
predictions

[1m1037/1037[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 169ms/step


array([1, 1, 1, ..., 1, 1, 1])

In [13]:
y_test = pd.read_csv("/content/sample_data/Y_test_esssays.csv")
y_test = y_test["generated"].to_numpy()

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix_values = confusion_matrix(y_test, predictions)

# Accuracy, Precision, Recall

In [15]:
TP = confusion_matrix_values[0, 0]
FP = confusion_matrix_values[0, 1]
FN = confusion_matrix_values[1, 0]
TN = confusion_matrix_values[1, 1]

# Calculating metrics
accuracy = (TP + TN) / np.sum(confusion_matrix_values)
precision = TP / (TP + FP)
recall = TP / (TP + FN)

# Displaying the metrics in a fancy way
metrics_display = f"""
Precision: {precision:.2f}
Recall: {recall:.2f}
Accuracy: {accuracy:.2f}
"""

metrics_display


'\nPrecision: 0.58\nRecall: 0.78\nAccuracy: 0.87\n'