Imports and Hyperparameters

In [1]:
import io
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

import tensorflow as tf
from keras.models import Model
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input, Concatenate, Layer
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import accuracy_score, f1_score, precision_score,  recall_score, matthews_corrcoef

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Hyperparameters
MAX_VOCAB_SIZE = 10000
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 100
EPOCHS = 10
BATCH_SIZE = 16

# Set a seed to decrease randomness
tf.random.set_seed(42)

# Set the glove file to refer to
MODEL_NAME = 'glove.6B.100d'
GLOVE_FILE = MODEL_NAME + '.txt'
TOKENIZER_NAME = 'tokenizer_bilstm_' + MODEL_NAME + '.json'
WEIGHTS_FILE = 'bilstm.'+ MODEL_NAME +'.weights.h5'

Tokenizers

In [3]:
# Clean inputs
def clean_input(document, char_filter = r"[^\w]"):
    """
    param document: original document
    char_filter: regex specifying characters that need to be removed

    return: cleaned document
    """

    cleaned = []

    # Goes through each sentence in the document
    for sentence in document:

        # convert all words to their lower case equivalent
        sentence = sentence.lower()

        # tokenise
        words = word_tokenize(sentence)

        # join back words to get whole document
        sentence = " ".join(words)

        # replace unwanted characters as specified by char_filter (default: non-word characters) with whitespace
        sentence = re.sub(char_filter, " ", sentence)

        # replace multiple whitespaces with single whitespace
        sentence = re.sub(r"\s+", " ", sentence)

        # strip whitespace from document
        sentence = sentence.strip()

        # append the cleaned sentence to the new list
        cleaned.append(sentence)

    return cleaned

In [4]:
# Load CSV
train = pd.read_csv('train.csv')

# Training set
premise_train = clean_input(train['premise'])
hypothesis_train = clean_input(train['hypothesis'])

# Initialize Tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")

# Combine list of premises and hypothesis in training set and fit tokenizer on that
combined = premise_train + hypothesis_train
tokenizer.fit_on_texts(combined)

Embeddings

In [5]:
embeddings_dictionary = dict()
glove_file = open(GLOVE_FILE, encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

vocab_length = len(tokenizer.word_index) + 1

# Create Embedding Matrix having n columns
# Containing n-dimensional GloVe word embeddings for all words in our corpus.
embedding_matrix = np.zeros((vocab_length, EMBEDDING_DIM))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

Model Architecture

In [6]:
# Defining an attention class
class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(1,), initializer="zeros", trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, inputs):
        e = K.tanh(K.dot(inputs, self.W) + self.b)  # Compute attention scores
        a = K.softmax(e, axis=1)  # Softmax over time axis
        output = inputs * a  # Apply attention weights
        return K.sum(output, axis=1)  # Weighted sum over time axis

In [7]:
# Define Input Layers for Premise and Hypothesis
premise_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32", name="premise_input")
hypothesis_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32", name="hypothesis_input")

# Shared Embedding Layer
embedding_layer = Embedding(input_dim=vocab_length, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix], trainable=False)

# Encode Premise and Hypothesis
premise_embedding = embedding_layer(premise_input)
hypothesis_embedding = embedding_layer(hypothesis_input)

# BiLSTM layers
bilstm_layer = Bidirectional(LSTM(64, return_sequences=True))
premise_encoded = bilstm_layer(premise_embedding)
hypothesis_encoded = bilstm_layer(hypothesis_embedding)

# Attention layer
attention = Attention()
premise_attention = attention(premise_encoded)
hypothesis_attention = attention(hypothesis_encoded)

# Merge Representations (Use last hidden state)
merged = Concatenate()([premise_attention, hypothesis_attention])

# Fully Connected Layers
dense = Dense(64, activation="relu")(merged)
dropout = Dropout(0.2)(dense)
output = Dense(1, activation="sigmoid")(dropout)  # Sigmoid for binary classification

# Define and Compile Model
model = Model(inputs=[premise_input, hypothesis_input], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Model Summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 premise_input (InputLayer)  [(None, 100)]                0         []                            
                                                                                                  
 hypothesis_input (InputLay  [(None, 100)]                0         []                            
 er)                                                                                              
                                                                                                  
 embedding (Embedding)       (None, 100, 100)             3307000   ['premise_input[0][0]',       
                                                                     'hypothesis_input[0][0]']    
                                                                                              

Load Test Dataset

In [8]:
# Load dataset
file_path = "dev.csv"
test = pd.read_csv(file_path)

# Clean inputs
premise_test = clean_input(test['premise'])
hypothesis_test = clean_input(test['hypothesis'])

# Load trained model weights
model.load_weights(WEIGHTS_FILE)

# Open tokenizer file from json
with open(TOKENIZER_NAME) as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

    # Convert text to sequences
    premise_sequence_test = tokenizer.texts_to_sequences(premise_test)
    hypothesis_sequence_test = tokenizer.texts_to_sequences(hypothesis_test)

    # Pad sequences
    premise_padded_test = pad_sequences(premise_sequence_test, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
    hypothesis_padded_test = pad_sequences(hypothesis_sequence_test, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

    # Make predictions
    predictions = model.predict([premise_padded_test, hypothesis_padded_test])
    predicted_labels = (predictions > 0.5).astype(int)  # Convert probabilities to binary labels

    # Save predictions to CSV
    df_predictions = pd.DataFrame(predicted_labels, columns=['prediction'])
    df_predictions.to_csv("Group_70_B.csv", index=False)





Evaluation

In [9]:
# Evaluate model performance
accuracy = accuracy_score(test["label"], predicted_labels)
f1 = f1_score(test["label"], predicted_labels, average="weighted")
precision = precision_score(test["label"], predicted_labels)
recall = recall_score(test["label"], predicted_labels, average="weighted")
mcc = matthews_corrcoef(test["label"], predicted_labels)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"MCC: {mcc:.4f}")

Accuracy: 0.7026
F1 Score: 0.7019
Precision: 0.6979
Recall: 0.7026
MCC: 0.4043
