<a href="https://www.kaggle.com/code/muhammadnadhifn/lmsys-chatbot-preference-predictions-dnn?scriptVersionId=203018137" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Task: Given a prompt and the responses of two models, classify which response is preferred (model A, model B, or a tie)
<br><br>
Deep Neural Network Solution
1. Bidirectional LSTM -> extracts features from the prompt and from each response
2. Attention Mechanism -> captures the relevance or correlation between the prompt and each response
3. Global Pooling -> reduces the sequence of attention-weighted features while retaining the most significant information, enabling the dense layers to make a final classification.
4. Dense/Hidden Layers -> perform the classification

In [None]:
import json

import keras
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load Train Data
df = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')

# Load the English stop-word list. The corpus is not shipped with the nltk
# package itself, so fetch it on demand when it is missing instead of
# stopping the notebook with a LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Define Function to Remove Stopwords
def remove_stopwords(text):
    """Return `text` with every stop word dropped.

    The text is split on whitespace, tokens whose lowercase form is found in
    the module-level `stop_words` set are discarded, and the remaining tokens
    are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Parse String and Ensure all text data is encoded to UTF-8
# Each text column stores a JSON array of conversation turns; only the first
# turn is kept.
for col in [i for i in df.columns if i in ['prompt', 'response_a', 'response_b']]:
    # json.loads replaces eval(): it never executes file contents as code, and
    # it reads JSON `null` natively, so the word "null" inside the actual text
    # is no longer rewritten by a blanket string replace.
    df[col] = df[col].map(lambda x: json.loads(x)[0])
    # A null turn becomes an empty string; everything else is coerced to str.
    df[col] = df[col].apply(lambda x: '' if x is None else str(x))
    # Drop characters that cannot be encoded as UTF-8 (e.g. unpaired surrogates).
    df[col] = df[col].apply(lambda x: x.encode('utf-8', errors='ignore').decode('utf-8'))
    # Lower Text
    df[col] = df[col].str.lower()
    # Remove Punctuation
    df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)
    # Remove Stopwords
    df[col] = df[col].apply(remove_stopwords)

# Show Sample
df.head()

In [None]:
# The three one-hot outcome columns, in the order of their integer class ids.
label_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']

# Column name -> class id (0 = model A wins, 1 = model B wins, 2 = tie).
label_map = {name: class_id for class_id, name in enumerate(label_columns)}

# For every row pick the outcome column holding the largest value, then
# translate that column name into its class id.
winning_column = df[label_columns].idxmax(axis=1)
df['label'] = winning_column.map(label_map)

In [None]:
# Build one shared vocabulary from every prompt and both response columns
# (all prompts first, then response_a, then response_b).
texts = [
    text
    for column in ('prompt', 'response_a', 'response_b')
    for text in df[column].tolist()
]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [None]:
def preprocess_fn(prompt, response_a, response_b, label=None, maxlen=512):
    """Encode the three text lists as fixed-length integer sequences.

    Args:
        prompt: list of prompt strings, one per example.
        response_a: list of model A response strings, aligned with `prompt`.
        response_b: list of model B response strings, aligned with `prompt`.
        label: optional labels; passed through unchanged next to the features.
        maxlen: length every sequence is padded or truncated to. Padding and
            truncation both happen at the end of the sequence.

    Returns:
        A dict with the keys 'prompt', 'response_a' and 'response_b' holding
        the padded id arrays, or a `(features, label)` tuple when `label` is
        given.
    """
    def _encode(texts):
        # Texts -> token ids via the module-level `tokenizer`, then pad/cut.
        return pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=maxlen,
            padding='post',
            truncating='post',
        )

    feature = {
        'prompt': _encode(prompt),
        'response_a': _encode(response_a),
        'response_b': _encode(response_b),
    }
    return (feature, label) if label is not None else feature

def build_dataset(prompt, response_a, response_b, label=None, batch_size=1, shuffle=True):
    """Create a batched `tf.data.Dataset` from raw text lists.

    The texts are encoded with `preprocess_fn`; when `label` is given the
    dataset yields `(features, label)` pairs, otherwise only the feature dict.
    With `shuffle=True` the examples are shuffled using a buffer as large as
    the number of prompts before they are batched.
    """
    tensors = preprocess_fn(prompt, response_a, response_b, label)
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if not shuffle:
        return dataset.batch(batch_size=batch_size)
    shuffled = dataset.shuffle(buffer_size=len(prompt))
    return shuffled.batch(batch_size=batch_size)

In [None]:
# Hold out 20% of the rows for validation; stratifying on the label keeps the
# class proportions the same in both parts.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df.label)


def _frame_to_dataset(frame, shuffle):
    """Turn one split of the data into a dataset with batches of 32."""
    return build_dataset(
        prompt=frame.prompt.values.tolist(),
        response_a=frame.response_a.values.tolist(),
        response_b=frame.response_b.values.tolist(),
        label=frame.label.values.tolist(),
        batch_size=32,
        shuffle=shuffle,
    )


# Only the training examples are shuffled.
train_ds = _frame_to_dataset(train_df, shuffle=True)
val_ds = _frame_to_dataset(val_df, shuffle=False)

In [None]:
# Sanity check: print the array shapes of the first training batch only.
for feature, label in train_ds.take(1):
    print(f"Feature 1: {feature['prompt'].numpy().shape}, Feature 2: {feature['response_a'].numpy().shape}, Feature 3: {feature['response_b'].numpy().shape}, Label: {label.numpy().shape}")

In [None]:
# Define input layers with integer-encoded sequences (512 token ids each)
prompt_input = tf.keras.Input(shape=(512,), dtype='int32', name='prompt')
response_a_input = tf.keras.Input(shape=(512,), dtype='int32', name='response_a')
response_b_input = tf.keras.Input(shape=(512,), dtype='int32', name='response_b')

# Token ids are arbitrary vocabulary indices, not magnitudes, so they are
# mapped to learned vectors before reaching the LSTM. Feeding the raw ids as a
# single numeric feature would make the network treat id 9000 as "larger"
# than id 9.
# Tokenizer indices start at 1 and pad_sequences pads with 0, hence the +1.
vocab_size = len(tokenizer.word_index) + 1
embedding = layers.Embedding(input_dim=vocab_size, output_dim=64)

# One embedding shared by all three inputs -> (batch_size, 512, 64)
prompt_embedded = embedding(prompt_input)
response_a_embedded = embedding(response_a_input)
response_b_embedded = embedding(response_b_input)

# Define LSTM layer (a single instance, so its weights are shared by the
# prompt and both responses)
bi_lstm = layers.Bidirectional(layers.LSTM(100, return_sequences=True))

# Apply LSTM to embedded inputs
prompt_encoded = bi_lstm(prompt_embedded)
response_a_encoded = bi_lstm(response_a_embedded)
response_b_encoded = bi_lstm(response_b_embedded)

# Attention with the prompt as query and each response as value
attention_a = layers.Attention()([prompt_encoded, response_a_encoded])
attention_b = layers.Attention()([prompt_encoded, response_b_encoded])

# Average over the sequence axis to get one vector per response
pool_a = layers.GlobalAveragePooling1D()(attention_a)
pool_b = layers.GlobalAveragePooling1D()(attention_b)

# Concatenate encoded outputs
x = layers.Concatenate()([pool_a, pool_b])
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(8, activation='relu')(x)

# Output layer: one probability per class (model A wins, model B wins, tie)
outputs = layers.Dense(3, activation='softmax')(x)

# Build and compile the model
model = tf.keras.Model(inputs=[prompt_input, response_a_input, response_b_input], outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    # Labels are integer class ids, so the sparse variant of the loss is used.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)
model.summary()

In [None]:
# NOTE(review): this runs after the model has already been built; it is
# normally called before constructing a model - confirm it is still wanted.
tf.keras.backend.clear_session()

# Stop once the validation loss has not improved for 10 epochs and roll back
# to the best weights seen. The callback is taken from tf.keras, the same
# namespace the model was built with, so both always come from one Keras
# implementation.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 128 is only an upper bound on the number of epochs; early stopping may end training sooner.
history = model.fit(train_ds, validation_data=val_ds, epochs=128, callbacks=[early_stopping])

In [None]:
# Draw the per-epoch training and validation loss curves on one figure.
loss_curves = (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
)
for metric_key, curve_label in loss_curves:
    plt.plot(history.history[metric_key], label=curve_label)

plt.title('Loss over epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Load Test Data
test_df = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

# Parse String and Ensure all text data is encoded to UTF-8
# The column filter reads test_df's own columns (it used to look at the
# training frame), so this cell only touches columns the test file really has.
for col in [i for i in test_df.columns if i in ['prompt', 'response_a', 'response_b']]:
    # json.loads replaces eval(): it never executes file contents as code, and
    # it reads JSON `null` natively, so the word "null" inside the actual text
    # is no longer rewritten by a blanket string replace.
    test_df[col] = test_df[col].map(lambda x: json.loads(x)[0])
    # A null turn becomes an empty string; everything else is coerced to str.
    test_df[col] = test_df[col].apply(lambda x: '' if x is None else str(x))
    # Drop characters that cannot be encoded as UTF-8 (e.g. unpaired surrogates).
    test_df[col] = test_df[col].apply(lambda x: x.encode('utf-8', errors='ignore').decode('utf-8'))
    # Lower Text
    test_df[col] = test_df[col].str.lower()
    # Remove Punctuation
    test_df[col] = test_df[col].str.replace(r'[^\w\s]', '', regex=True)
    # Remove Stopwords
    test_df[col] = test_df[col].apply(remove_stopwords)

# Show Sample
test_df.head()

In [None]:
# Encode the test texts the same way as the training data. No labels are
# passed and the row order is kept (no shuffling).
test_text_columns = {
    name: test_df[name].values.tolist()
    for name in ('prompt', 'response_a', 'response_b')
}
test_ds = build_dataset(**test_text_columns, batch_size=32, shuffle=False)

In [None]:
# Run the trained model over the batched test set. The model ends in a 3-unit
# softmax layer, so every row of `test_pred` holds three class probabilities.
test_pred = model.predict(test_ds)

In [None]:
# Submission layout: the test ids followed by one probability column per
# outcome, in the same order as the model's three outputs.
prediction_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']

submission_df = test_df[["id"]].copy()
submission_df[prediction_columns] = test_pred.tolist()

# Write the file without the DataFrame index, then preview the first rows.
submission_df.to_csv("submission.csv", index=False)
submission_df.head()