In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

data = pd.read_csv('../cleaned_data_final.csv')

# Transforming the data so that each row contains one response and one label
responses = data['Human_response'].tolist() + data['ChatGPT_3_5_response'].tolist()
labels = [0] * len(data['Human_response']) + [1] * len(data['ChatGPT_3_5_response'])
data_combined = pd.DataFrame({'response': responses, 'label': labels})

# Text tokenization
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data_combined['response'].values)
X = tokenizer.texts_to_sequences(data_combined['response'].values)
X = pad_sequences(X)

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data_combined['label'])
y = to_categorical(integer_encoded)

# Function to create the model, required for KerasClassifier
def create_model():
    embed_dim = 128
    lstm_out = 196
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Wrap the model
model = KerasClassifier(build_fn=create_model, epochs=7, batch_size=32, verbose=2)

# Perform cross-validation
cross_val_scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation

print("Cross-validation Scores:", cross_val_scores)
print("Average Cross-validation Accuracy:", cross_val_scores.mean())



  model = KerasClassifier(build_fn=create_model, epochs=7, batch_size=32, verbose=2)


Epoch 1/7
50/50 - 59s - loss: 0.3259 - accuracy: 0.8444 - 59s/epoch - 1s/step
Epoch 2/7
50/50 - 55s - loss: 0.0599 - accuracy: 0.9781 - 55s/epoch - 1s/step
Epoch 3/7
50/50 - 54s - loss: 0.0423 - accuracy: 0.9894 - 54s/epoch - 1s/step
Epoch 4/7
50/50 - 53s - loss: 0.0400 - accuracy: 0.9856 - 53s/epoch - 1s/step
Epoch 5/7
50/50 - 56s - loss: 0.0128 - accuracy: 0.9969 - 56s/epoch - 1s/step
Epoch 6/7
50/50 - 55s - loss: 0.0038 - accuracy: 0.9994 - 55s/epoch - 1s/step
Epoch 7/7
50/50 - 55s - loss: 0.0018 - accuracy: 0.9994 - 55s/epoch - 1s/step
13/13 - 4s - loss: 0.9105 - accuracy: 0.8375 - 4s/epoch - 270ms/step
Epoch 1/7
50/50 - 60s - loss: 0.3533 - accuracy: 0.8350 - 60s/epoch - 1s/step
Epoch 2/7
50/50 - 57s - loss: 0.0887 - accuracy: 0.9688 - 57s/epoch - 1s/step
Epoch 3/7
50/50 - 55s - loss: 0.0376 - accuracy: 0.9862 - 55s/epoch - 1s/step
Epoch 4/7
50/50 - 56s - loss: 0.0283 - accuracy: 0.9944 - 56s/epoch - 1s/step
Epoch 5/7
50/50 - 56s - loss: 0.0082 - accuracy: 0.9975 - 56s/epoch - 1s/