<a href="https://colab.research.google.com/github/edojatheophilus/AI-vs-Human-Text-Detection/blob/Development/RNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RNN

In [None]:
# Load the gzip-compressed pickle that holds the PCA features together with the preprocessed text
import pickle
import gzip
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project or come from a trusted source.
with gzip.open('PCA_features_with_text.pkl.gz', 'rb') as pickle_file:
    df_with_text = pickle.load(pickle_file)

# df_with_text now holds the unpickled DataFrame (the preview below shows its
# text, generated and PC1..PC200 columns).

In [None]:
# Bind the loaded frame to the name used in the rest of this notebook.
# This is a plain alias: both names refer to the same object, not a copy.
df_RNN = df_with_text

In [None]:
# Preview the first rows to check which columns are available.
df_RNN.head()

Unnamed: 0,text,generated,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,...,PC191,PC192,PC193,PC194,PC195,PC196,PC197,PC198,PC199,PC200
0,Cars. Cars have been around since they became ...,0.0,4.625756,-6.139121,-8.183926,-1.584481,0.898296,-0.745829,1.233761,-2.972068,...,-0.792867,0.592858,0.422301,0.835457,1.464719,-0.104244,1.001682,0.216649,0.407391,-0.430058
1,Transportation is a large necessity in most co...,0.0,8.42592,-6.75508,-7.253139,-1.88809,0.496328,-0.949115,2.146534,-4.172021,...,0.599653,2.219176,1.953602,1.491739,-0.537354,1.725558,1.439251,0.268123,1.51451,0.637347
2,"""Americas love affair with its vehicles seems ...",0.0,5.188977,-6.241041,-2.800961,-1.448331,1.135069,-0.518902,0.916541,-1.691716,...,-0.655248,-0.949802,-0.882912,0.389224,0.282645,-0.538652,1.104475,-0.86345,0.560406,-0.57551
3,How often do you ride in a car? Do you drive a...,0.0,4.84531,-7.421351,-6.481223,-2.967532,0.678225,1.111468,1.77579,-1.344532,...,0.139977,-0.887881,0.292654,-0.51349,0.659874,0.923782,1.119892,-2.126562,-0.276688,-0.407697
4,Cars are a wonderful thing. They are perhaps o...,0.0,4.480856,-6.581754,-8.50383,-3.141797,0.163962,-0.153618,1.306802,-4.705047,...,0.451199,-0.652226,-1.25535,-0.537535,-0.332195,1.6576,-0.542583,1.016093,1.414892,1.157998


In [None]:
# Prepare the raw inputs for the RNN. Tokenization and sequence padding follow
# in the next cells; this cell only shuffles the rows and sets aside a small
# hold-out sample.

# Shuffle every row with a fixed seed and rebuild a clean 0..n-1 index.
df_shuffled = df_RNN.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 100 shuffled rows form the "unseen" sample; everything after them
# is what the train/test split works on.
df_unseen = df_shuffled.iloc[:100]
df_rest = df_shuffled.iloc[100:]

# Text inputs and target labels for the modelling set.
texts, labels = df_rest['text'].to_numpy(), df_rest['generated'].to_numpy()

# The same two columns for the unseen sample.
unseen_texts, unseen_labels = (
    df_unseen['text'].to_numpy(),
    df_unseen['generated'].to_numpy(),
)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the tokenizer as `num_words`.
max_features = 10000

# NOTE(review): `texts` is tokenized before the train/test split performed in a
# later cell, so the vocabulary is built from rows that end up in the test
# split as well. Consider fitting on the training rows only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

# Encode the modelling texts and the unseen texts with the same fitted tokenizer.
sequences, unseen_sequences = (
    tokenizer.texts_to_sequences(batch) for batch in (texts, unseen_texts)
)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Every sequence is forced to exactly this many tokens.
maxlen = 20

# 'pre' is the Keras default for both padding and truncating; it is spelled out
# here because it means texts longer than `maxlen` keep only their LAST
# `maxlen` tokens, and shorter texts are zero-padded at the front.
X = pad_sequences(sequences, maxlen=maxlen, padding='pre', truncating='pre')
X_unseen = pad_sequences(
    unseen_sequences, maxlen=maxlen, padding='pre', truncating='pre'
)

In [None]:
# Hold back 20% of the padded sequences (and their labels) for testing; the
# fixed random_state keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Inspect the padded integer sequences that make up the training split.
X_train

array([[  76,   37,   14, ..., 1493,   10,   12],
       [   1,  401,  341, ...,   22,  127,  449],
       [ 769,    5,    2, ...,  925,  304, 4888],
       ...,
       [ 138,   39,   10, ...,    5,    1,  196],
       [  57,   44,   19, ...,   52,    3,  239],
       [  42,  118,    1, ...,  675, 1342,  223]], dtype=int32)

In [None]:
# (number of training rows, tokens per padded sequence)
X_train.shape

(389708, 20)

In [None]:
# (number of unseen rows, tokens per padded sequence)
X_unseen.shape

(100, 20)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Seed Python, NumPy and TensorFlow in one call so weight initialisation and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Model parameters
# The vocabulary size and sequence length are read back from the objects that
# produced the training data instead of being typed in a second time, so the
# embedding layer cannot drift out of sync with the tokenizer or the padding.
max_features = tokenizer.num_words  # Size of the vocabulary
maxlen = X_train.shape[1]  # Length of every padded sequence
embedding_dim = 32  # Dimensionality of the embedding layer

# Building the model
model = Sequential()
# NOTE(review): newer Keras releases (Keras 3) report `input_length` as
# deprecated; drop the argument when upgrading - TODO confirm against the
# installed version.
model.add(Embedding(max_features, embedding_dim, input_length=maxlen))
model.add(SimpleRNN(32))  # 32 units in the RNN layer
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compiling the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Model summary
model.summary()

# Train the model. validation_split=0.2 makes Keras set aside part of
# X_train/y_train for per-epoch validation, so the effective training set is
# smaller than X_train.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 32)            320000    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 322113 (1.23 MB)
Trainable params: 322113 (1.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Probability above which a sample is assigned to the positive class.
DECISION_THRESHOLD = 0.5


def report_binary_metrics(title, y_true, y_pred):
    """Print and return accuracy, precision, recall and F1 for binary labels.

    Args:
        title: Heading printed above the metrics (a trailing ":" is appended).
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, one per entry of ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` as computed by the
        corresponding ``sklearn.metrics`` functions.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"{title}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    return accuracy, precision, recall, f1


# Evaluate the model on the held-out test split
test_predictions = model.predict(X_test)
test_predicted_classes = (test_predictions > DECISION_THRESHOLD).astype(int)

test_accuracy, test_precision, test_recall, test_f1 = report_binary_metrics(
    "Test Set Performance", y_test, test_predicted_classes
)
print("\n")

# Evaluate the model on the 100-row unseen sample set aside before splitting
unseen_predictions = model.predict(X_unseen)
unseen_predicted_classes = (unseen_predictions > DECISION_THRESHOLD).astype(int)

unseen_accuracy, unseen_precision, unseen_recall, unseen_f1 = report_binary_metrics(
    "Unseen Dataset Performance", unseen_labels, unseen_predicted_classes
)

Test Set Performance:
Accuracy: 0.9754072279758178
Precision: 0.9813421090284733
Recall: 0.9516052945583706
F1 Score: 0.9662449635118763


Unseen Dataset Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [None]:
import pickle
import gzip


# Save the model to a file
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras/TensorFlow version; `model.save(...)` is the framework's own
# serialisation path and may be the more portable choice - TODO confirm.
with gzip.open('rnn_model.pkl.gz', 'wb') as f:
    pickle.dump(model, f)

# Save the fitted tokenizer next to the model. Without the exact word index
# used during training, new text cannot be encoded into the integer ids this
# model expects. It can be restored with
# tensorflow.keras.preprocessing.text.tokenizer_from_json.
with open('rnn_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer.to_json())
