In [24]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on Drive can be read and written through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
cd drive/MyDrive/Colab Notebooks

[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks'
/content/drive/MyDrive/Colab Notebooks


Import libraries

In [26]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch import nn, optim
from torch.cuda.amp import autocast, GradScaler
from torch.nn.functional import sigmoid

Train data

In [27]:
# Data loading and preprocessing
# Read the training pairs and, in the same step, replace NaN entries in the
# two text columns with empty strings.
train_data = pd.read_csv('./NLU/AV/train.csv').fillna({'text_1': '', 'text_2': ''})

In [28]:
# Dataset and DataLoader implementation
class AuthorshipDataset(Dataset):
    """Labelled text pairs that are tokenized on access, one pair at a time.

    Args:
        texts_a: Indexable collection holding the first text of every pair.
        texts_b: Indexable collection holding the second text of every pair.
        labels: Indexable collection with one label per pair; its length
            defines the length of the dataset.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts_a, texts_b, labels, tokenizer, max_len=256):
        self.texts_a = texts_a
        self.texts_b = texts_b
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of labelled pairs."""
        return len(self.labels)

    def _tokenize(self, text):
        """Tokenize one text; return ``(input_ids, attention_mask)`` with the leading axis squeezed."""
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` together with its label as a float tensor."""
        ids_a, mask_a = self._tokenize(self.texts_a[idx])
        ids_b, mask_b = self._tokenize(self.texts_b[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.float)

        return {
            'input_ids_a': ids_a,
            'attention_mask_a': mask_a,
            'input_ids_b': ids_b,
            'attention_mask_b': mask_b,
            'labels': label,
        }

In [29]:
# Pretrained uncased DistilBERT vocabulary; the same tokenizer object is
# handed to the dataset so every text pair is encoded with it.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset = AuthorshipDataset(
    texts_a=train_data['text_1'].tolist(),
    texts_b=train_data['text_2'].tolist(),
    labels=train_data['label'].tolist(),
    tokenizer=tokenizer,
)

In [31]:
class SiameseBERT(nn.Module):
    """Siamese pair classifier built around one shared encoder.

    Both inputs go through the same encoder, are mean-pooled over their
    unmasked tokens, and are projected by a shared dense layer with ReLU.
    The element-wise absolute difference of the two projections is mapped to
    logits by a final linear layer.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids,
            attention_mask=...)``; its result must expose ``last_hidden_state``.
        hidden_size: Width of the encoder's hidden states and of the dense layer.
        output_size: Number of logits produced per pair.
    """

    def __init__(self, bert_model, hidden_size=768, output_size=1):
        super().__init__()
        self.bert = bert_model
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)

    def mean_pooling(self, model_output, attention_mask):
        """Average ``last_hidden_state`` over dimension 1, counting only positions where the mask is non-zero."""
        hidden_states = model_output.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        masked_total = (hidden_states * mask).sum(dim=1)
        # Clamp so an all-zero mask cannot cause a division by zero.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / token_counts

    def _embed(self, input_ids, attention_mask):
        """Encode one side of the pair: encoder -> masked mean pooling -> dense + ReLU."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        pooled = self.mean_pooling(encoded, attention_mask)
        return torch.relu(self.dense(pooled))

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        """Return the logits for a batch of text pairs."""
        embedding_a = self._embed(input_ids_a, attention_mask_a)
        embedding_b = self._embed(input_ids_b, attention_mask_b)

        # The absolute difference is symmetric in the two inputs.
        distance = (embedding_a - embedding_b).abs()
        return self.output(distance)

In [32]:
# Seed PyTorch's random number generators before anything stochastic runs
# (initialisation of the new linear layers, DataLoader shuffling, dropout),
# so that a fresh "Restart & Run All" starts from the same random state.
SEED = 42
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Wrap the pretrained DistilBERT encoder in the Siamese classifier.
model = SiameseBERT(DistilBertModel.from_pretrained('distilbert-base-uncased'))
model.to(device)

SiameseBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_f

In [33]:
# Shuffled mini-batches of 32 training pairs
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Gradient scaler for mixed-precision training, and Adam over all model parameters
scaler = GradScaler()
optimizer = optim.Adam(params=model.parameters(), lr=5e-5)

In [34]:
def train_epoch(model, data_loader, optimizer, device, scaler):
    """Run one training pass over ``data_loader`` with mixed precision.

    Args:
        model: Module called as ``model(input_ids_a, attention_mask_a,
            input_ids_b, attention_mask_b)`` and returning one logit per pair.
        data_loader: Iterable of dict batches with the keys ``input_ids_a``,
            ``attention_mask_a``, ``input_ids_b``, ``attention_mask_b`` and
            ``labels``.
        optimizer: Optimizer over the model's parameters.
        device: Device every batch tensor is moved to.
        scaler: Gradient scaler used to scale the loss and step the optimizer.

    Returns:
        ``(avg_loss, accuracy)``: the loss averaged over batches and the
        fraction of pairs whose thresholded prediction equals the label.
        Both are ``0.0`` when ``data_loader`` yields no batches.
    """
    model.train()
    # The loss module holds no per-batch state, so build it once, not per batch.
    criterion = nn.BCEWithLogitsLoss()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    # Counted here rather than via len(), so iterables without __len__ also work.
    num_batches = 0

    for batch in data_loader:
        input_ids_a = batch['input_ids_a'].to(device)
        attention_mask_a = batch['attention_mask_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)
        attention_mask_b = batch['attention_mask_b'].to(device)
        # Add a trailing axis so the labels line up with the (batch, 1) logits.
        labels = batch['labels'].to(device).float().unsqueeze(1)

        optimizer.zero_grad()

        with autocast():
            logits = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
            loss = criterion(logits, labels)

        # Scale the loss before backward, then let the scaler drive the step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        # Metric bookkeeping needs no autograd graph.
        with torch.no_grad():
            predictions = torch.sigmoid(logits) > 0.5
            correct_predictions += predictions.eq(labels).sum().item()
        total_predictions += labels.size(0)
        num_batches += 1

    if num_batches == 0:
        # Nothing was processed: report zeros instead of dividing by zero.
        return 0.0, 0.0

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

In [35]:
# Run the training loop
# Each iteration is one full pass over train_loader; the average loss and
# accuracy returned by train_epoch are printed after every pass.
epochs = 5
for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, device, scaler)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

Epoch 1/5, Train Loss: 0.6170, Train Accuracy: 0.6316
Epoch 2/5, Train Loss: 0.4867, Train Accuracy: 0.7604
Epoch 3/5, Train Loss: 0.3172, Train Accuracy: 0.8669
Epoch 4/5, Train Loss: 0.1530, Train Accuracy: 0.9472
Epoch 5/5, Train Loss: 0.0726, Train Accuracy: 0.9768


In [36]:
# Save model
# Only the parameter tensors (the state_dict) are written, not the module
# object itself; to reload, build a SiameseBERT first and call
# load_state_dict on it.
torch.save(model.state_dict(), './NLU/bert.pth')

Development data

In [37]:
# Read the development pairs and, in the same step, replace NaN entries in
# the two text columns with empty strings.
dev_data = pd.read_csv('./NLU/AV/dev.csv').fillna({'text_1': '', 'text_2': ''})

# Gold labels, kept aside for the evaluation metrics
dev_labels = dev_data['label'].values

In [38]:
class DevDataset(Dataset):
    """Unlabelled text pairs that are tokenized on access, for inference.

    Args:
        texts_a: Indexable collection holding the first text of every pair;
            its length defines the length of the dataset.
        texts_b: Indexable collection holding the second text of every pair.
        tokenizer: Callable tokenizer used for both texts of a pair.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts_a, texts_b, tokenizer, max_len=512):
        self.texts_a = texts_a
        self.texts_b = texts_b
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.texts_a)

    def _encode(self, text):
        """Tokenize one text with the tokenizer given to the constructor."""
        # Use self.tokenizer, not a module-level `tokenizer`: the dataset must
        # honour the tokenizer it was built with and must not depend on a
        # global that happens to exist in the notebook.
        return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` with the leading axis squeezed from each tensor."""
        encoding_a = self._encode(self.texts_a[idx])
        encoding_b = self._encode(self.texts_b[idx])

        return {
            'input_ids_a': encoding_a['input_ids'].squeeze(0),
            'attention_mask_a': encoding_a['attention_mask'].squeeze(0),
            'input_ids_b': encoding_b['input_ids'].squeeze(0),
            'attention_mask_b': encoding_b['attention_mask'].squeeze(0)
        }

In [39]:
# Wrap the development pairs for batched inference; shuffle stays off so the
# predictions come out in the same order as the rows of dev_data.
# NOTE(review): DevDataset defaults to max_len=512 whereas the training
# dataset was built with AuthorshipDataset's default of 256 - confirm that
# this difference between training and evaluation is intended.
dev_dataset = DevDataset(
    texts_a=dev_data['text_1'].tolist(),
    texts_b=dev_data['text_2'].tolist(),
    tokenizer=tokenizer,
)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=32, shuffle=False)

In [40]:
# Function to generate predictions
def generate_predictions(model, data_loader, device=None):
    """Return the sigmoid probability for every pair yielded by ``data_loader``.

    Args:
        model: Module called as ``model(input_ids_a, attention_mask_a,
            input_ids_b, attention_mask_b)`` and returning logits.
        data_loader: Iterable of dict batches with the keys ``input_ids_a``,
            ``attention_mask_a``, ``input_ids_b`` and ``attention_mask_b``.
        device: Device the batch tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters).

    Returns:
        A flat list with one probability per pair, in loader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids_a = batch['input_ids_a'].to(device)
            attention_mask_a = batch['attention_mask_a'].to(device)
            input_ids_b = batch['input_ids_b'].to(device)
            attention_mask_b = batch['attention_mask_b'].to(device)

            outputs = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
            # Flatten explicitly: a bare squeeze() turns a batch of size 1
            # into a 0-d array, which list.extend() cannot iterate over.
            preds = torch.sigmoid(outputs).reshape(-1).cpu().numpy()
            predictions.extend(preds)
    return predictions

In [41]:
# Score every development pair with the fine-tuned model
dev_predictions = generate_predictions(model, dev_loader)

# Probabilities strictly above the threshold become label 1, everything else 0
best_threshold = 0.5
result_df = pd.DataFrame(dev_predictions, columns=['prediction'])
dev_predicted_labels = (result_df['prediction'] > best_threshold).astype(int)

# Write the binary labels (with the row index) as a single 'prediction' column
result_df = dev_predicted_labels.to_frame(name='prediction')
result_df.to_csv("./NLU/bert_result.csv")

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [43]:
# Compare the gold development labels with the thresholded predictions and
# print each metric on its own line.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(dev_labels, dev_predicted_labels))

Accuracy: 0.7246666666666667
Precision: 0.7163323782234957
Recall: 0.7472600464961807
F1 Score: 0.7314694408322496
