In [1]:
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, MaxPooling1D, Dense, Concatenate, GlobalMaxPooling1D, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import numpy as np

# Initialize the Okt tokenizer
okt = Okt()

# Example Korean texts
headline_texts = ["이것은 클릭베이트 감지 모델입니다.", "한글 텍스트 임베딩 예제입니다."]
body_texts = ["여기에서 모델의 성능을 테스트합니다.", "다양한 한국어 텍스트를 사용합니다."]
labels = [1, 0]  # Example binary labels

# Tokenize texts
headline_tokenized = [' '.join(okt.morphs(text)) for text in headline_texts]
body_tokenized = [' '.join(okt.morphs(text)) for text in body_texts]

# Initialize the Keras tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(headline_tokenized + body_tokenized)

# Convert texts to sequences
headline_sequences = tokenizer.texts_to_sequences(headline_tokenized)
body_sequences = tokenizer.texts_to_sequences(body_tokenized)

# Pad sequences
maxlen = 100  # Maximum length of sequences
headline_padded = pad_sequences(headline_sequences, maxlen=maxlen)
body_padded = pad_sequences(body_sequences, maxlen=maxlen)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Define model parameters
embedding_dim = 128  # Embedding dimensions
lstm_units = 64  # LSTM units
dropout_rate = 0.5  # Dropout rate
l2_lambda = 0.01  # L2 regularization factor

# Split the data into training and testing sets
X_headline_train, X_headline_test, X_body_train, X_body_test, y_train, y_test = train_test_split(
    headline_padded, body_padded, labels, test_size=0.2, random_state=42
)

# Headline Model
headline_input = Input(shape=(maxlen,), name='headline_input')
headline_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(headline_input)
headline_bilstm1 = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(headline_embedding)
headline_maxpool1 = MaxPooling1D(pool_size=2)(headline_bilstm1)
headline_dropout1 = Dropout(dropout_rate)(headline_maxpool1)
headline_bilstm2 = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(headline_dropout1)
headline_maxpool2 = MaxPooling1D(pool_size=2)(headline_bilstm2)  # Apply MaxPooling over the LSTM output
headline_global_maxpool = GlobalMaxPooling1D()(headline_maxpool2)
headline_dropout2 = Dropout(dropout_rate)(headline_global_maxpool)

# Body Model
body_input = Input(shape=(maxlen,), name='body_input')
body_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(body_input)
body_bilstm1 = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(body_embedding)
body_maxpool1 = MaxPooling1D(pool_size=2)(body_bilstm1)
body_dropout1 = Dropout(dropout_rate)(body_maxpool1)
body_bilstm2 = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(body_dropout1)
body_maxpool2 = MaxPooling1D(pool_size=2)(body_bilstm2)  # Apply MaxPooling over the LSTM output
body_global_maxpool = GlobalMaxPooling1D()(body_maxpool2)
body_dropout2 = Dropout(dropout_rate)(body_global_maxpool)

# Concatenate Headline and Body Models
concatenated = Concatenate()([headline_dropout2, body_dropout2])

# Fully Connected Layers with L2 Regularization
hidden1 = Dense(units=128, activation='relu', kernel_regularizer=l2(l2_lambda))(concatenated)
hidden_dropout1 = Dropout(dropout_rate)(hidden1)
hidden2 = Dense(units=64, activation='relu', kernel_regularizer=l2(l2_lambda))(hidden_dropout1)
hidden_dropout2 = Dropout(dropout_rate)(hidden2)

# Output Layer for binary classification
output = Dense(units=1, activation='sigmoid')(hidden_dropout2)

# Model
model = Model(inputs=[headline_input, body_input], outputs=output)

# Compile the Model with binary crossentropy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the Model
model.summary()

# Convert labels to numpy array
y_train = np.array(y_train)
y_test = np.array(y_test)

# Early Stopping Callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping
model.fit(
    [X_headline_train, X_body_train], y_train,
    epochs=50, validation_split=0.2, callbacks=[early_stopping]
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([X_headline_test, X_body_test], y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')


ValueError: Training data contains 1 samples, which is not sufficient to split it into a validation and training set as specified by `validation_split=0.2`. Either provide more data, or a different value for the `validation_split` argument.