Imports and Hyperparameters

In [2]:
import io
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

import tensorflow as tf
from keras.models import Model
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input, Concatenate, Layer
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import accuracy_score, f1_score, recall_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Hyperparameters
MAX_VOCAB_SIZE = 10000        # passed to Tokenizer(num_words=...)
EMBEDDING_DIM = 100           # must equal the vector size stored in GLOVE_FILE
MAX_SEQUENCE_LENGTH = 100     # every premise/hypothesis is padded or truncated to this length
EPOCHS = 10
BATCH_SIZE = 16

# Seed the random number generators to decrease run-to-run randomness.
# TensorFlow and NumPy keep separate generators, so both are seeded from one constant.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# GloVe file to use, and the artefact names derived from it
MODEL_NAME = 'glove.6B.100d'
GLOVE_FILE = MODEL_NAME + '.txt'
TOKENIZER_NAME = 'tokenizer_bilstm_' + MODEL_NAME + '.json'
WEIGHTS_FILE = 'bilstm.' + MODEL_NAME + '.weights.h5'

Load CSV files and preprocess

In [4]:
# Read the training split and the development (validation) split from disk
train, valid = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv'))

In [5]:
# Clean inputs
def clean_input(document, char_filter = r"[^\w]"):
    """
    Normalise every sentence of a document before it is handed to the tokenizer.

    Each sentence is lower-cased, split with NLTK's ``word_tokenize``, re-joined
    with single spaces, has every match of ``char_filter`` replaced by a space,
    and is finally whitespace-collapsed and stripped.

    param document: iterable of sentences (e.g. a list or a pandas Series).
                    Missing values (None/NaN) are treated as the empty string and
                    any other non-string value is converted with ``str``, so a
                    single empty CSV cell does not abort the whole preprocessing.
    param char_filter: regex specifying characters that need to be removed
                       (default: every non-word character)

    return: list with one cleaned sentence per input sentence, in input order
    """

    # Compile once instead of re-parsing both patterns for every sentence
    unwanted_chars = re.compile(char_filter)
    whitespace_run = re.compile(r"\s+")

    cleaned = []

    # Goes through each sentence in the document
    for sentence in document:

        # pandas yields float NaN for empty cells; ``.lower()`` would raise on it
        if not isinstance(sentence, str):
            sentence = "" if pd.isna(sentence) else str(sentence)

        # convert all words to their lower case equivalent, then tokenise
        words = word_tokenize(sentence.lower())

        # join back words to get whole sentence
        sentence = " ".join(words)

        # replace unwanted characters as specified by char_filter with whitespace
        sentence = unwanted_chars.sub(" ", sentence)

        # collapse runs of whitespace and trim both ends
        sentence = whitespace_run.sub(" ", sentence).strip()

        cleaned.append(sentence)

    return cleaned

In [6]:
# Build the cleaned text lists and the label column for each split

# Training split
premise_train, hypothesis_train = (
    clean_input(train[column]) for column in ('premise', 'hypothesis')
)
label_train = train['label']

# Validation split
premise_valid, hypothesis_valid = (
    clean_input(valid[column]) for column in ('premise', 'hypothesis')
)
label_valid = valid['label']

In [7]:
# Create the tokenizer, capped at MAX_VOCAB_SIZE and with "<OOV>" as its out-of-vocabulary token
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_VOCAB_SIZE)

# Fit on the training premises and hypotheses together (validation text is not used for fitting)
combined = [*premise_train, *hypothesis_train]
tokenizer.fit_on_texts(combined)

# Persist the fitted tokenizer so later cells can restore it from disk.
# to_json() already returns a JSON string; it is JSON-encoded once more on write,
# which is the format the loading cells (json.load + tokenizer_from_json) expect.
tokenizer_json = tokenizer.to_json()
with io.open(TOKENIZER_NAME, 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

# Map every cleaned sentence to its list of word indices
(premise_sequence_train,
 hypothesis_sequence_train,
 premise_sequence_valid,
 hypothesis_sequence_valid) = [
    tokenizer.texts_to_sequences(texts)
    for texts in (premise_train, hypothesis_train, premise_valid, hypothesis_valid)
]

In [8]:
# Bring every index sequence to exactly MAX_SEQUENCE_LENGTH entries, padding at the end
(premise_padded_train,
 hypothesis_padded_train,
 premise_padded_valid,
 hypothesis_padded_valid) = [
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
    for sequences in (
        premise_sequence_train,
        hypothesis_sequence_train,
        premise_sequence_valid,
        hypothesis_sequence_valid,
    )
]

In [9]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line is read as a token followed by its whitespace-separated vector components.
embeddings_dictionary = dict()

# The context manager closes the file even if a line fails to parse
with open(GLOVE_FILE, encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# One row per entry of the tokenizer's word index, plus one extra row for index 0
vocab_length = len(tokenizer.word_index) + 1

# Create Embedding Matrix with EMBEDDING_DIM columns, holding the GloVe vector
# of every word in our corpus. Words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_length, EMBEDDING_DIM))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

Model

In [10]:
# Defining an attention class
class Attention(Layer):
    """
    Attention pooling that collapses axis 1 of its input.

    For every position along axis 1 a score tanh(inputs . W + b) is computed,
    the scores are soft-maxed along axis 1, and the inputs are summed over
    that axis using the resulting weights.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # W projects the last (feature) dimension to a single score; b is a scalar bias
        feature_dim = input_shape[-1]
        self.W = self.add_weight(name="att_weight", shape=(feature_dim, 1), initializer="normal", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(1,), initializer="zeros", trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        # Unnormalised score for each position along axis 1
        scores = K.tanh(K.dot(inputs, self.W) + self.b)
        # Normalise the scores along axis 1
        weights = K.softmax(scores, axis=1)
        # Weighted sum of the inputs over axis 1
        return K.sum(inputs * weights, axis=1)

In [11]:
# Define Input Layers for Premise and Hypothesis
# Each input is one padded sequence of MAX_SEQUENCE_LENGTH int32 word indices.
premise_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32", name="premise_input")
hypothesis_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32", name="hypothesis_input")

# Shared Embedding Layer
# Initialised from the GloVe-based embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(input_dim=vocab_length, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix], trainable=False)

# Embed Premise and Hypothesis with the same layer instance (shared weights)
premise_embedding = embedding_layer(premise_input)
hypothesis_embedding = embedding_layer(hypothesis_input)

# BiLSTM layer, also shared by both inputs.
# return_sequences=True keeps the output for every time step so attention can pool over them.
bilstm_layer = Bidirectional(LSTM(64, return_sequences=True))
premise_encoded = bilstm_layer(premise_embedding)
hypothesis_encoded = bilstm_layer(hypothesis_embedding)

# Attention layer (shared): collapses the time axis into one vector per input
attention = Attention()
premise_attention = attention(premise_encoded)
hypothesis_attention = attention(hypothesis_encoded)

# Merge Representations: concatenate the attention-pooled premise and hypothesis
# vectors (not the last hidden state)
merged = Concatenate()([premise_attention, hypothesis_attention])

# Fully Connected Layers
dense = Dense(64, activation="relu")(merged)
dropout = Dropout(0.2)(dense)
output = Dense(1, activation="sigmoid")(dropout)  # Single sigmoid unit for binary classification

# Define and Compile Model
model = Model(inputs=[premise_input, hypothesis_input], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Model Summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 premise_input (InputLayer)  [(None, 100)]                0         []                            
                                                                                                  
 hypothesis_input (InputLay  [(None, 100)]                0         []                            
 er)                                                                                              
                                                                                                  
 embedding (Embedding)       (None, 100, 100)             3307000   ['premise_input[0][0]',       
                                                                     'hypothesis_input[0][0]']    
                                                                                              

Train the Model

In [12]:
# Stop training once the validation loss has gone 3 epochs without improving
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min',
)

# Write weights only, and only when the validation accuracy improves
checkpoint_filepath = WEIGHTS_FILE
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
)

In [13]:
# Train on (premise, hypothesis) pairs; the validation split feeds both callbacks
history = model.fit(
    x=[premise_padded_train, hypothesis_padded_train],
    y=label_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[early_stopping, model_checkpoint_callback],
    validation_data=([premise_padded_valid, hypothesis_padded_valid], label_valid),
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


Sample Prediction

In [14]:
# New Premise & Hypothesis
new_premise = ["A child is playing soccer"]
new_hypothesis = ["An adult is not playing football"]

# Restore the weights saved by the checkpoint callback
model.load_weights(WEIGHTS_FILE)

# Open tokenizer file from json.
# The file was written as UTF-8, so read it as UTF-8 explicitly instead of relying
# on the platform default encoding. Only the read needs the file handle.
with open(TOKENIZER_NAME, encoding='utf-8') as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

# Apply the same cleaning as the training text, then convert to sequences
new_premise_seq = tokenizer.texts_to_sequences(clean_input(new_premise))
new_hypothesis_seq = tokenizer.texts_to_sequences(clean_input(new_hypothesis))

# Pad sequences
new_premise_padded = pad_sequences(new_premise_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
new_hypothesis_padded = pad_sequences(new_hypothesis_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

# Get Prediction
prediction = model.predict([new_premise_padded, new_hypothesis_padded])
print(prediction)

# Get Prediction Class
# prediction is a 2-D array holding one probability; index the scalar explicitly,
# because int() on a one-element array is deprecated in recent NumPy releases.
predicted_class = int(prediction[0, 0] > 0.5)  # Convert probability to 0 or 1
print("Predicted Class:", predicted_class)  # 0 = Contradiction, 1 = Entailment

[[0.03178217]]
Predicted Class: 0


Evaluation by predicting validation set

In [None]:
# Load dataset
file_path = "dev.csv"
df = pd.read_csv(file_path)

# Load trained model weights
model.load_weights(WEIGHTS_FILE)

# Open tokenizer file from json.
# The file was written as UTF-8, so read it as UTF-8 explicitly instead of relying
# on the platform default encoding. Only the read needs the file handle.
with open(TOKENIZER_NAME, encoding='utf-8') as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

# Clean the text exactly as the training data was cleaned, then convert to sequences
premise_sequences = tokenizer.texts_to_sequences(clean_input(df["premise"]))
hypothesis_sequences = tokenizer.texts_to_sequences(clean_input(df["hypothesis"]))

# Pad sequences
premise_padded = pad_sequences(premise_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
hypothesis_padded = pad_sequences(hypothesis_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

# Make predictions on the arrays built in this cell, so the cell does not depend
# on padded arrays left over from the training cells
predictions = model.predict([premise_padded, hypothesis_padded])
predicted_labels = (predictions > 0.5).astype(int)  # Convert probabilities to binary labels

# Save predictions to CSV
df_predictions = pd.DataFrame(predicted_labels, columns=['prediction'])
df_predictions.to_csv("Group_70_B.csv", index=False)



In [16]:
# Score the thresholded predictions against the gold labels from dev.csv.
# NOTE: support-weighted recall is arithmetically the same quantity as accuracy,
# so the "Recall" figure below repeats the accuracy value.
true_labels = df["label"]
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average="weighted")
recall = recall_score(true_labels, predicted_labels, average="weighted")

# Balance of predicted classes: number of 1 predictions relative to 0 predictions
entailment_count = np.sum(predicted_labels == 1)
contradiction_count = np.sum(predicted_labels == 0)
# The small constant keeps the division defined when no 0 is predicted
ratio = entailment_count / (contradiction_count + 1e-6)

# Report every metric with four decimals
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Entailment/Contradiction Ratio: {ratio:.4f}")

Accuracy: 0.7026
F1 Score: 0.7019
Recall: 0.7026
Entailment/Contradiction Ratio: 1.2386
