In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
cd drive/MyDrive/Colab Notebooks

[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks'
/content/drive/MyDrive/Colab Notebooks


Import library

In [36]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel
from torch import nn, optim
from torch.nn.functional import sigmoid

BERT model

In [37]:
# A custom dataset class for a Siamese network class utilizing a BERT model for embedding text pairs
class SiameseBERT(nn.Module):
    # Initialize the Siamese BERT model with a pretrained BERT model and additional layers
    def __init__(self, bert_model, hidden_size=768, output_size=1):
        super().__init__()
        self.bert = bert_model
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)

    # Applies mean pooling to the last hidden states of BERT outputs using an attention mask
    def mean_pooling(self, model_output, attention_mask):
        # Extract the last hidden states as token embeddings
        token_embeddings = model_output.last_hidden_state

        # Expand the attention mask for element-wise multiplication
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        # Sum the embddings while applying the mask
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)

        # Compute sum of the mask with a clamp
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        # Return the mean pooled embeddings
        return sum_embeddings / sum_mask

    # Defines the forward pass for the SiameseBERT network with two input sequences
    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        # Get the output from BERT for both sequences
        output_a = self.bert(input_ids_a, attention_mask=attention_mask_a)
        output_b = self.bert(input_ids_b, attention_mask=attention_mask_b)

        # Perform mean pooling on the outputs
        pooled_output_a = self.mean_pooling(output_a, attention_mask_a)
        pooled_output_b = self.mean_pooling(output_b, attention_mask_b)

        # Pass through the dense layer
        dense_output_a = torch.relu(self.dense(pooled_output_a))
        dense_output_b = torch.relu(self.dense(pooled_output_b))

        # Compute distance metric
        combined_output = torch.abs(dense_output_a - dense_output_b)
        logits = self.output(combined_output)

        return logits

In [38]:
# Load model
model = SiameseBERT(DistilBertModel.from_pretrained('distilbert-base-uncased'))
state_dict = torch.load("./NLU/bert.pth")
model.load_state_dict(state_dict)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

SiameseBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_f

Test data

In [39]:
# Load test data
test = pd.read_csv('./NLU/test.csv')

# Replace na values in text columns with an empty string
test['text_1'] = test['text_1'].fillna('')
test['text_2'] = test['text_2'].fillna('')

In [40]:
# A custom dataset class for test data
class TestDataset(Dataset):
    # Initialize the dataset with text pairs and a tokenizer
    def __init__(self, texts_a, texts_b, tokenizer, max_len=512):
        self.texts_a = texts_a
        self.texts_b = texts_b
        self.tokenizer = tokenizer
        self.max_len = max_len

    # Return the total number of items in the dataset
    def __len__(self):
        return len(self.texts_a)

    # Return the encoded pair of texts at the given index
    def __getitem__(self, idx):
        text_a = self.texts_a[idx]
        text_b = self.texts_b[idx]

        encoding_a = tokenizer.encode_plus(
            text_a,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        encoding_b = tokenizer.encode_plus(
            text_b,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids_a': encoding_a['input_ids'].squeeze(0),
            'attention_mask_a': encoding_a['attention_mask'].squeeze(0),
            'input_ids_b': encoding_b['input_ids'].squeeze(0),
            'attention_mask_b': encoding_b['attention_mask'].squeeze(0)
        }

In [41]:
# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

testset = TestDataset(test['text_1'].tolist(), test['text_2'].tolist(), tokenizer)
test_loader = DataLoader(testset, batch_size=32, shuffle=False)

In [42]:
# Generate predictions for the given data loader
def generate_predictions(model, data_loader):
    model.eval()
    predictions = []

    # Deactivate gradients for evaluation
    with torch.no_grad():
        for batch in data_loader:
            input_ids_a = batch['input_ids_a'].to(device)
            attention_mask_a = batch['attention_mask_a'].to(device)
            input_ids_b = batch['input_ids_b'].to(device)
            attention_mask_b = batch['attention_mask_b'].to(device)

            # Apply sigmoid to output logits
            outputs = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
            preds = sigmoid(outputs).squeeze().cpu().numpy()
            predictions.extend(preds)
    return predictions

Testing model on test data

In [43]:
# Generate predictions for the test dataset
predictions = generate_predictions(model, test_loader)
result_df = pd.DataFrame(predictions, columns=['prediction'])

# Convert probabilities to binary labels using a threshold = 0.5
best_threshold = 0.5
predicted_labels = (result_df['prediction'] > best_threshold).astype(int)

In [44]:
result_df = pd.DataFrame(predicted_labels, columns=['prediction'])
result_df.to_csv("./NLU/Group_21_C.csv", index=False)