In [None]:
import pandas as pd
# Read the cleaned dataset from disk and preview its first five rows
testdata = pd.read_csv(filepath_or_buffer="cleaned_dataset.csv")
testdata.head(n=5)


This shows the dataset is in the format required to begin feature extraction. In the answer column, each question's human response
is followed by its GPT response, and this pattern repeats for every question in the dataset. The `result` column holds a
binary label that identifies each response as human or GPT generated.

Result Column:
    0 - Human Response
    1 - GPT Generated Response

In [None]:
import pandas as pd
import tensorflow as tf

# Load the cleaned dataset; this cell reads its 'question' and 'answer' columns
df = pd.read_csv('cleaned_dataset.csv')

questions = df['question']
answers = df['answer']

# Initialize the Keras tokenizer with its default settings
tokenizer = tf.keras.preprocessing.text.Tokenizer()

# Fit the tokenizer on every question and every answer as separate texts.
# NOTE: `questions + answers` on two pandas Series is element-wise string
# concatenation, not a combined corpus. It fuses the last word of each
# question with the first word of its answer whenever no filtered punctuation
# separates them (e.g. "... python" + "It is ..." -> "pythonIt"). That adds
# bogus vocabulary entries, and the real boundary words may never be learned,
# in which case texts_to_sequences silently drops them. Stacking the two
# columns avoids this.
tokenizer.fit_on_texts(pd.concat([questions, answers], ignore_index=True))

# Convert each question and answer into a list of integer word indices
tokenized_questions = tokenizer.texts_to_sequences(questions)
tokenized_answers = tokenizer.texts_to_sequences(answers)

# Vocabulary size: +1 because Tokenizer word indices start at 1 and
# index 0 is reserved (it is used for padding later on)
vocab_size = len(tokenizer.word_index) + 1


The TensorFlow preprocessing Tokenizer successfully generates a tokenized version of each question and of each human and AI-generated response. The next step is to decide which features to use, and how to accurately extract them from these tokenized responses.

Possible Candidates for Features:
1. Word Embeddings with padded sequences
2. BERT pre-trained model 
3. Bag-of-Words Representation
4. N-gram features
5. Statistical features such as sequence length, average word length, etc.

From the research I have done so far on the project, I believe that using both word embeddings and BERT to capture contextual understanding, semantic similarity, sentence-level representation, and variability in the responses should be sufficient to train a model that detects, with reasonable accuracy, whether a response is human or GPT generated.

In order to use the tokenized responses in the model, the sequences need to be the same length.

In [None]:
import numpy as np

# Percentile of the token-length distribution used as the common padded length
LENGTH_PERCENTILE = 95

# Derive the padded length from the actual sequence lengths rather than from
# the vocabulary size: vocabulary size says nothing about how long a question
# or answer is, so `vocab_size / 30` can truncate most of the text or pad
# every row with a long run of zeros (and evaluates to 0 for a tiny vocabulary).
# A high percentile keeps nearly every sequence intact while preventing a few
# very long outliers from dictating the length. The same length is used for
# questions and answers, so it is computed over both.
sequence_lengths = [len(seq) for seq in tokenized_questions + tokenized_answers]
max_len = max(1, int(np.percentile(sequence_lengths, LENGTH_PERCENTILE)))

# padding='post' appends zeros to short sequences. Sequences longer than
# max_len are truncated; pad_sequences drops tokens from the front by default
# (truncating='pre').
padded_questions = tf.keras.preprocessing.sequence.pad_sequences(tokenized_questions, maxlen=max_len, padding='post')
padded_answers = tf.keras.preprocessing.sequence.pad_sequences(tokenized_answers, maxlen=max_len, padding='post')

# Questions and answers side by side in one array: shape (n_samples, 2 * max_len)
input_data = np.concatenate((padded_questions, padded_answers), axis=1)
# Binary target: 0 = human response, 1 = GPT-generated response
labels = df['result'].values

# Disabled alternative, kept for reference: a generator yielding shuffled batches
# def data_generator(padded_questions, padded_answers, labels, batch_size):
#     num_samples = len(labels)
#     indices = np.arange(num_samples)
#     np.random.shuffle(indices)  # Shuffle indices for randomness
#
#     while True:
#         for start_idx in range(0, num_samples, batch_size):
#             end_idx = min(start_idx + batch_size, num_samples)
#             batch_indices = indices[start_idx:end_idx]
#             yield {'question_input': padded_questions[batch_indices], 'answer_input': padded_answers[batch_indices]}, labels[batch_indices]


In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Two-branch classifier: the question and the answer are embedded and encoded
# by separate LSTMs, and the two encodings are concatenated to predict whether
# the answer is human (0) or GPT generated (1).

# Separate integer-sequence inputs for questions and answers
question_input = Input(shape=(max_len,), dtype='int32', name='question_input')
answer_input = Input(shape=(max_len,), dtype='int32', name='answer_input')

# Embedding layers for questions and answers.
# mask_zero=True marks index 0 (the padding value) as masked so the LSTMs skip
# padded timesteps. The inputs are post-padded, so without masking each LSTM's
# final state is computed after running over the trailing zeros.
# `input_length` is omitted: the Input layers already fix the sequence length,
# and the argument is deprecated in newer Keras releases.
embedding_dim = 50
question_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(question_input)
answer_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(answer_input)

# One LSTM encoder per branch; each returns its final 32-dimensional state
question_lstm = LSTM(32)(question_embedding)
answer_lstm = LSTM(32)(answer_embedding)

# Concatenate the two encodings into a single feature vector
concatenated = Concatenate()([question_lstm, answer_lstm])

# Sigmoid output: probability that the response is GPT generated
output = Dense(1, activation='sigmoid')(concatenated)

# Create the model
model = tf.keras.Model(inputs=[question_input, answer_input], outputs=output)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Define batch size and epochs
batch_size = 128  # Increased batch size for better GPU utilization
epochs = 3

# Stop as soon as validation loss fails to improve for one epoch, and roll the
# model back to the weights from the best epoch instead of keeping the last ones.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)]

# validation_split holds out the last 20% of the rows (taken before shuffling).
# The returned History object is kept so the training curves can be inspected.
history = model.fit({'question_input': padded_questions, 'answer_input': padded_answers}, labels,
                    batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=callbacks, verbose=1)