In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaTokenizer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from torch.optim.lr_scheduler import StepLR
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [None]:
# Hyperparameters
# These mirror the literal values the notebook passes when it builds the model and the
# label tensors, so this cell documents the configuration that is actually trained.
EMBEDDING_DIM = 768  # size of each token embedding fed to the LSTM
HIDDEN_DIM = 128  # LSTM hidden size (per direction)
NUM_LABELS = 12  # width of the multi-hot label vector (multi-label classification)
BATCH_SIZE = 32

In [None]:
# Define BiLSTM Model for Classification
class BiLSTMForClassification(nn.Module):
    """Embedding + (Bi)LSTM encoder with a two-layer head that returns per-label probabilities.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        vocab_size: Number of rows in the embedding table.
        num_labels: Number of output units (one probability per label).
        dropout: Dropout probability used on the embeddings and, when
            ``num_layers > 1``, between stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_labels, dropout=0.5, bidirectional=True, num_layers=2):
        super(BiLSTMForClassification, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM only accepts a non-zero dropout when there is more than one layer.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc1 = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(128, num_labels)
        self.bn2 = nn.BatchNorm1d(num_labels)
        self.sigmoid = nn.Sigmoid()  # Use sigmoid instead of softmax for multi-label classification

    def forward(self, text, text_lengths):
        """Encode a padded batch of token ids and return sigmoid probabilities.

        Args:
            text: Integer token ids; indexed as (batch, sequence) because the LSTM
                is built with ``batch_first=True``.
            text_lengths: Number of real (unpadded) tokens in each row of ``text``.

        Returns:
            Tensor with one row per input sequence and ``num_labels`` columns. The
            values have already been passed through a sigmoid, so they are
            probabilities rather than logits.
        """
        embedded = self.embedding(text)
        embedded = self.dropout1(embedded)
        # Lengths must live on the CPU for packing; enforce_sorted=False lets batches
        # arrive in arbitrary length order.
        packed_embedded = pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            # The last two slices of the final hidden state belong to the top layer:
            # forward direction then backward direction.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Single direction: only the top layer's state matches fc1's input width.
            hidden = hidden[-1, :, :]

        hidden = self.fc1(hidden)
        hidden = self.bn1(hidden)
        hidden = self.relu(hidden)

        outputs = self.fc2(hidden)
        outputs = self.bn2(outputs)
        outputs = self.sigmoid(outputs)  # Apply sigmoid to the final output
        return outputs

# Data Preparation
def encode_labels(label):
    """Translate a category string into its integer code.

    'none' is code 0 and the eleven named categories are codes 1-11, in the order
    listed below. Any value that is not one of these exact strings yields None.
    """
    # Position in this tuple is the integer code.
    ordered_categories = (
        'none',
        '1.1 threats of harm',
        '1.2 incitement and encouragement of harm',
        '2.1 descriptive attacks',
        '2.2 aggressive and emotive attacks',
        '2.3 dehumanising attacks & overt sexual objectification',
        '3.1 casual use of gendered slurs, profanities, and insults',
        '3.2 immutable gender differences and gender stereotypes',
        '3.3 backhanded gendered compliments',
        '3.4 condescending explanations or unwelcome advice',
        '4.1 supporting mistreatment of individual women',
        '4.2 supporting systemic discrimination against women as a group',
    )
    code_by_name = {name: code for code, name in enumerate(ordered_categories)}
    return code_by_name.get(label)

def load_and_prepare_data(file_path, tokenizer, max_length=256, batch_size=32, shuffle=True, num_labels=12):
    """Read a labelled CSV, tokenize its text and wrap everything in a DataLoader.

    The CSV must provide a 'text' column and a 'label_vector' column. Rows whose
    label is not recognised by ``encode_labels`` are dropped.

    Args:
        file_path: Path of the CSV file to read.
        tokenizer: Object exposing ``encode_plus`` that returns 'input_ids' and
            'attention_mask' tensors (called with ``return_tensors='pt'``).
        max_length: Length every sequence is padded or truncated to.
        batch_size: Number of samples per batch.
        shuffle: Whether the DataLoader reshuffles the samples on every pass.
            Defaults to True; pass False for evaluation splits when a stable
            sample order is wanted.
        num_labels: Width of the multi-hot label vector. Defaults to 12.

    Returns:
        A DataLoader whose batches are
        ``(input_ids, attention_masks, labels, seq_lengths)``.
    """
    raw = pd.read_csv(file_path)

    # Build the cleaned frame with assign() rather than writing a column into the
    # result of dropna(), which avoids chained-assignment warnings.
    data = raw.assign(encoded_label=raw['label_vector'].apply(encode_labels))
    data = data.dropna(subset=['encoded_label'])
    data = data.assign(encoded_label=data['encoded_label'].astype(int))

    # Tokenization: every text is padded/truncated to max_length, so the per-row
    # (1, max_length) tensors can be stacked along dim 0.
    encodings = [
        tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='pt')
        for text in data['text']
    ]
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)

    # Multi-hot encoding of labels: code k > 0 switches on column k - 1, while code 0
    # ('none') leaves the whole row at zero.
    label_codes = torch.tensor(data['encoded_label'].to_numpy(), dtype=torch.long)
    labels = torch.zeros((len(data), num_labels), dtype=torch.float)
    positive_rows = torch.nonzero(label_codes, as_tuple=True)[0]
    labels[positive_rows, label_codes[positive_rows] - 1] = 1

    # Sequence lengths (for packing): number of attended positions in each row.
    seq_lengths = attention_masks.sum(dim=1)

    # Create a TensorDataset
    dataset = TensorDataset(input_ids, attention_masks, labels, seq_lengths)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Tokenizer: the pretrained RoBERTa tokenizer supplies the token ids, and its length
# is used as the vocabulary size of the model's embedding table.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
VOCAB_SIZE = len(tokenizer)

# Build one DataLoader per split, in train / validation / test order.
train_dataloader, validation_dataloader, test_dataloader = [
    load_and_prepare_data(split_csv, tokenizer)
    for split_csv in ('train.csv', 'validate.csv', 'test.csv')
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Initialize Model, Optimizer, Loss Function, and Scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMForClassification(embedding_dim=768, hidden_dim=128, vocab_size=VOCAB_SIZE, num_labels=12).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005)
# NOTE(review): the visible training loop never calls scheduler.step(), so this
# schedule currently has no effect on the learning rate - confirm whether that is intended.
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
# The model's forward() already ends in a sigmoid, so its outputs are probabilities.
# nn.BCELoss expects probabilities; nn.BCEWithLogitsLoss would apply a second sigmoid
# on top of them and distort both the loss and the gradients.
loss_fn = nn.BCELoss()



In [None]:

# Training Loop
num_epochs = 7

for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    running_loss = 0
    running_accuracy = 0

    for train_batch in train_dataloader:
        batch_input_ids, batch_masks, batch_labels, batch_lengths = [t.to(device) for t in train_batch]

        # Clear gradients left over from the previous step before the new forward pass.
        model.zero_grad()
        outputs = model(batch_input_ids, batch_lengths)
        loss = loss_fn(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Accuracy is the share of individual label slots where the rounded output
        # equals the target, averaged over the whole batch matrix.
        rounded = outputs.detach().round().cpu().numpy()
        targets = batch_labels.cpu().numpy()
        running_accuracy += (rounded == targets).mean()

    n_train_batches = len(train_dataloader)
    avg_train_loss = running_loss / n_train_batches
    train_accuracy = running_accuracy / n_train_batches
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # ---- validation pass: no gradient tracking, layers in inference mode ----
    model.eval()
    running_val_loss = 0
    running_val_accuracy = 0

    with torch.no_grad():
        for val_batch in validation_dataloader:
            batch_input_ids, batch_masks, batch_labels, batch_lengths = [t.to(device) for t in val_batch]

            outputs = model(batch_input_ids, batch_lengths)
            running_val_loss += loss_fn(outputs, batch_labels).item()

            rounded = outputs.detach().round().cpu().numpy()
            targets = batch_labels.cpu().numpy()
            running_val_accuracy += (rounded == targets).mean()

    n_val_batches = len(validation_dataloader)
    avg_val_loss = running_val_loss / n_val_batches
    val_accuracy = running_val_accuracy / n_val_batches
    print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Epoch 1 - Train Loss: 0.9409, Train Accuracy: 0.7284
Epoch 1 - Validation Loss: 1.0220, Validation Accuracy: 0.6141
Epoch 2 - Train Loss: 0.9044, Train Accuracy: 0.8519
Epoch 2 - Validation Loss: 0.8599, Validation Accuracy: 0.9321
Epoch 3 - Train Loss: 0.8739, Train Accuracy: 0.8847
Epoch 3 - Validation Loss: 0.8653, Validation Accuracy: 0.8915
Epoch 4 - Train Loss: 0.8483, Train Accuracy: 0.9083
Epoch 4 - Validation Loss: 0.8499, Validation Accuracy: 0.8861
Epoch 5 - Train Loss: 0.8272, Train Accuracy: 0.9190
Epoch 5 - Validation Loss: 0.8368, Validation Accuracy: 0.8839
Epoch 6 - Train Loss: 0.8108, Train Accuracy: 0.9225
Epoch 6 - Validation Loss: 0.8019, Validation Accuracy: 0.9337
Epoch 7 - Train Loss: 0.7959, Train Accuracy: 0.9281
Epoch 7 - Validation Loss: 0.7902, Validation Accuracy: 0.9314


In [None]:
# Collect the BiLSTM's outputs on the training set together with the matching label
# rows; later cells fit tree-based classifiers on these pairs.
# The model contains dropout and batch-norm layers, so switch to inference mode
# explicitly instead of relying on whichever mode an earlier cell left behind.
model.eval()
all_train_outputs = []
all_train_labels = []
with torch.no_grad():
    for batch in train_dataloader:
        # Use the notebook-wide `device` so the cell also runs on CPU-only machines.
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.to(device) for t in batch)
        outputs = model(batch_input_ids, batch_lengths)
        # extend() appends one row per sample, keeping outputs and labels aligned.
        all_train_outputs.extend(outputs.cpu().numpy())
        all_train_labels.extend(batch_labels.cpu().numpy())


In [None]:
from sklearn.tree import DecisionTreeClassifier  # Import DecisionTree

# Initialize the Decision Tree Classifier with manually specified hyperparameters
decision_tree = DecisionTreeClassifier(max_depth=10, min_samples_split=2, min_samples_leaf=1)

# Train the Decision Tree classifier on the entire training data
decision_tree.fit(all_train_outputs, all_train_labels)

# Score the tree on every split with one shared loop. The BiLSTM only produces the
# feature vectors here, so it runs in inference mode with gradients disabled.
model.eval()
split_accuracy = {}
for split_name, loader in (
    ("Training", train_dataloader),
    ("Validation", validation_dataloader),
    ("Test", test_dataloader),
):
    running_accuracy = 0
    with torch.no_grad():
        for batch in loader:
            # Use the notebook-wide `device` so the cell also runs on CPU-only machines.
            batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.to(device) for t in batch)

            # Get outputs from BiLSTM model
            outputs = model(batch_input_ids, batch_lengths)

            # Predict using the Decision Tree classifier
            preds = decision_tree.predict(outputs.cpu().numpy())

            # Element-wise match rate between predictions and label rows, per batch
            running_accuracy += (preds == batch_labels.cpu().numpy()).mean()

    # Mean of the per-batch accuracies.
    split_accuracy[split_name] = running_accuracy / len(loader)
    print(f"Decision Tree {split_name} Accuracy: {split_accuracy[split_name]:.4f}")

# Keep the per-split results available under the names this cell has always defined.
train_accuracy = split_accuracy["Training"]
val_accuracy = split_accuracy["Validation"]
test_accuracy = split_accuracy["Test"]


Decision Tree Training Accuracy: 0.9803
Decision Tree Validation Accuracy: 0.9788
Decision Tree Test Accuracy: 0.9791


In [None]:
from sklearn.ensemble import RandomForestClassifier  # Import RandomForest

# Initialize the Random Forest Classifier with manually specified hyperparameters
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1)

# Train the Random Forest classifier on the entire training data
random_forest.fit(all_train_outputs, all_train_labels)

# Score the forest on every split with one shared loop. The BiLSTM only produces the
# feature vectors here, so it runs in inference mode with gradients disabled.
model.eval()
split_accuracy = {}
for split_name, loader in (
    ("Training", train_dataloader),
    ("Validation", validation_dataloader),
    ("Test", test_dataloader),
):
    running_accuracy = 0
    with torch.no_grad():
        for batch in loader:
            # Use the notebook-wide `device` so the cell also runs on CPU-only machines.
            batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.to(device) for t in batch)

            # Get outputs from BiLSTM model
            outputs = model(batch_input_ids, batch_lengths)

            # Predict using the Random Forest classifier
            preds = random_forest.predict(outputs.cpu().numpy())

            # Element-wise match rate between predictions and label rows, per batch
            running_accuracy += (preds == batch_labels.cpu().numpy()).mean()

    # Mean of the per-batch accuracies.
    split_accuracy[split_name] = running_accuracy / len(loader)
    print(f"Random Forest {split_name} Accuracy: {split_accuracy[split_name]:.4f}")

# Keep the per-split results available under the names this cell has always defined.
train_accuracy = split_accuracy["Training"]
val_accuracy = split_accuracy["Validation"]
test_accuracy = split_accuracy["Test"]


Random Forest Training Accuracy: 0.9798
Random Forest Validation Accuracy: 0.9799
Random Forest Test Accuracy: 0.9798


In [None]:

# Initialize the Random Forest Classifier with manually specified hyperparameters
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1)

# Train the Random Forest classifier on the entire training data
random_forest.fit(all_train_outputs, all_train_labels)

# Evaluate on validation data
# The BiLSTM only supplies feature vectors here: inference mode, no gradients.
model.eval()
total_val_accuracy = 0
with torch.no_grad():
    for batch in validation_dataloader:
        # Use the notebook-wide `device` so the cell also runs on CPU-only machines.
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.to(device) for t in batch)
        outputs = model(batch_input_ids, batch_lengths)
        preds = random_forest.predict(outputs.cpu().numpy())
        # Element-wise match rate between predictions and label rows, per batch
        total_val_accuracy += (preds == batch_labels.cpu().numpy()).mean()

# Mean of the per-batch accuracies.
val_accuracy = total_val_accuracy / len(validation_dataloader)
print(f"Random Forest Validation Accuracy: {val_accuracy:.4f}")

Random Forest Validation Accuracy: 0.9799


In [None]:
# Evaluate on Test Set
# Scores the most recently fitted `random_forest` on BiLSTM outputs for the test split.
model.eval()
total_test_accuracy = 0
with torch.no_grad():
    for batch in test_dataloader:
        # Use the notebook-wide `device` so the cell also runs on CPU-only machines.
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.to(device) for t in batch)

        outputs = model(batch_input_ids, batch_lengths)
        preds = random_forest.predict(outputs.cpu().numpy())
        # Element-wise match rate between predictions and label rows, per batch
        total_test_accuracy += (preds == batch_labels.cpu().numpy()).mean()

# Mean of the per-batch accuracies.
test_accuracy = total_test_accuracy / len(test_dataloader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9798


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier with manually specified hyperparameters
decision_tree = DecisionTreeClassifier(max_depth=10, min_samples_split=2, min_samples_leaf=1)

# Train the Decision Tree classifier on the entire training data
decision_tree.fit(all_train_outputs, all_train_labels)

# Score the tree on the validation and test splits with one shared loop. The BiLSTM
# only produces the feature vectors here: inference mode, no gradients.
model.eval()
split_accuracy = {}
for split_name, loader in (
    ("Validation", validation_dataloader),
    ("Test", test_dataloader),
):
    running_accuracy = 0
    with torch.no_grad():
        for batch in loader:
            # Use the notebook-wide `device` so the cell also runs on CPU-only machines.
            batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.to(device) for t in batch)

            # Get outputs from BiLSTM model
            outputs = model(batch_input_ids, batch_lengths)

            # Predict using the Decision Tree classifier
            preds = decision_tree.predict(outputs.cpu().numpy())

            # Element-wise match rate between predictions and label rows, per batch
            running_accuracy += (preds == batch_labels.cpu().numpy()).mean()

    # Mean of the per-batch accuracies.
    split_accuracy[split_name] = running_accuracy / len(loader)
    print(f"Decision Tree {split_name} Accuracy: {split_accuracy[split_name]:.4f}")

# Keep the per-split results available under the names this cell has always defined.
val_accuracy = split_accuracy["Validation"]
test_accuracy = split_accuracy["Test"]


Decision Tree Validation Accuracy: 0.9792
Decision Tree Test Accuracy: 0.9793


ORIGINAL: Training

In [None]:
# Earlier ("ORIGINAL") training cell: trains `model` for num_epochs epochs and prints
# the mean per-batch loss and argmax accuracy for the training and validation splits.
# NOTE(review): `criterion` is not assigned anywhere in the visible notebook (the loss
# object created above is named `loss_fn`); presumably it came from a cell that has
# since been removed - confirm before re-running.
# NOTE(review): accuracy here compares argmax class indices with `labels`, whereas the
# loaders built by load_and_prepare_data yield multi-hot float label rows; the recorded
# output presumably comes from an earlier single-label setup - confirm.
# DecisionTreeClassifier and np are imported here but not used inside this cell.
from sklearn.tree import DecisionTreeClassifier
import numpy as np
num_epochs = 7

for epoch in range(num_epochs):
    # Training pass: layers in training mode, one optimizer step per batch.
    model.train()
    total_loss, total_accuracy = 0, 0

    for batch in train_dataloader:
        # Move every tensor of the batch to the notebook-wide device.
        input_ids, attention_masks, labels, seq_lengths = [t.to(device) for t in batch]

        optimizer.zero_grad()
        outputs = model(input_ids, seq_lengths)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest output in each row.
        preds = torch.argmax(outputs, dim=1)
        total_accuracy += (preds == labels).cpu().numpy().mean()

    # Both metrics are means of per-batch values.
    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_accuracy / len(train_dataloader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Validation
    # Inference mode with gradient tracking disabled; no parameter updates happen here.
    model.eval()
    total_val_loss, total_val_accuracy = 0, 0
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids, attention_masks, labels, seq_lengths = [t.to(device) for t in batch]

            outputs = model(input_ids, seq_lengths)
            val_loss = criterion(outputs, labels)
            total_val_loss += val_loss.item()

            val_preds = torch.argmax(outputs, dim=1)
            total_val_accuracy += (val_preds == labels).cpu().numpy().mean()

    avg_val_loss = total_val_loss / len(validation_dataloader)
    val_accuracy = total_val_accuracy / len(validation_dataloader)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

Epoch 1 - Train Loss: 1.8853, Train Accuracy: 0.7576
Epoch 1 - Validation Loss: 1.8657, Validation Accuracy: 0.7684
Epoch 2 - Train Loss: 1.8807, Train Accuracy: 0.7556
Epoch 2 - Validation Loss: 1.8659, Validation Accuracy: 0.7664
Epoch 3 - Train Loss: 1.8858, Train Accuracy: 0.7466
Epoch 3 - Validation Loss: 1.8537, Validation Accuracy: 0.7768
Epoch 4 - Train Loss: 1.8703, Train Accuracy: 0.7658
Epoch 4 - Validation Loss: 1.8516, Validation Accuracy: 0.7812
Epoch 5 - Train Loss: 1.8630, Train Accuracy: 0.7695
Epoch 5 - Validation Loss: 1.8467, Validation Accuracy: 0.7817
Epoch 6 - Train Loss: 1.8581, Train Accuracy: 0.7710
Epoch 6 - Validation Loss: 1.8500, Validation Accuracy: 0.7763
Epoch 7 - Train Loss: 1.8509, Train Accuracy: 0.7780
Epoch 7 - Validation Loss: 1.8491, Validation Accuracy: 0.7758


ORIGINAL: Evaluation

In [None]:
# Evaluate on Test Set
# Earlier ("ORIGINAL") evaluation cell: argmax accuracy of the BiLSTM on the test split.
# NOTE(review): this compares argmax class indices with `batch_labels`, whereas the
# loaders built by load_and_prepare_data yield multi-hot float label rows; the recorded
# output presumably comes from an earlier single-label setup - confirm before re-running.
model.eval()
total_test_accuracy = 0
with torch.no_grad():
    for batch in test_dataloader:
        # Use the notebook-wide `device`, as the matching training cell does, so the
        # cell also runs on CPU-only machines.
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.to(device) for t in batch)

        outputs = model(batch_input_ids, batch_lengths)
        # Predicted class = index of the largest output in each row.
        preds = torch.argmax(outputs, dim=1).flatten()
        total_test_accuracy += (preds == batch_labels).cpu().numpy().mean()

# Mean of the per-batch accuracies.
test_accuracy = total_test_accuracy / len(test_dataloader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7670
