In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import RobertaTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
from transformers import RobertaTokenizer

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Now you can get the vocab size
EMBEDDING_DIM = 200
VOCAB_SIZE = len(tokenizer)  # Size of the tokenizer vocabulary
HIDDEN_DIM = 256
NUM_LABELS = 5  # Number of labels (multi-label classification)
BATCH_SIZE = 32





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [4]:


class BiLSTMForClassification(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_labels, dropout=0.3, bidirectional=True, num_layers=1):
        super(BiLSTMForClassification, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        embedded = self.dropout(embedded)
        packed_embedded = pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = self.dropout(hidden)
        outputs = self.fc(hidden)
        return outputs  # Raw logits for BCEWithLogitsLoss

# Loss Function
loss_fn = nn.BCEWithLogitsLoss()


# Data Preparation
def encode_labels(label):
    label_dict = {
        '1. threats': 0,
        '2. derogation': 1,
        '3. animosity': 2,
        '4. prejudiced discussions': 3,
        'none': 4
    }
    return label_dict.get(label, None)

def load_and_prepare_data(file_path, tokenizer, max_length=256):
    data = pd.read_csv(file_path)
    data['encoded_label'] = data['label_category'].apply(encode_labels)
    data = data.dropna(subset=['encoded_label'])

    # Convert labels to multi-hot encoding
    num_labels = 5  # Number of classes
    data['multi_hot_labels'] = data['encoded_label'].apply(
        lambda x: [1 if i == x else 0 for i in range(num_labels)]
    )

    # Tokenization
    tokenized = data['text'].apply(
        lambda x: tokenizer.encode_plus(
            x,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
    )
    input_ids = torch.cat([item['input_ids'] for item in tokenized], dim=0)
    attention_masks = torch.cat([item['attention_mask'] for item in tokenized], dim=0)
    labels = torch.tensor(list(data['multi_hot_labels'].values))

    # Sequence lengths (for dynamic padding)
    seq_lengths = torch.tensor([torch.sum(mask).item() for mask in attention_masks])

    # Create a TensorDataset
    dataset = TensorDataset(input_ids, attention_masks, labels, seq_lengths)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize tokenizer from transformers
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
VOCAB_SIZE = len(tokenizer)

# Load Data
train_dataloader = load_and_prepare_data('train.csv', tokenizer)
validation_dataloader = load_and_prepare_data('validate.csv', tokenizer)
test_dataloader = load_and_prepare_data('test.csv', tokenizer)


In [5]:
import torch
# Initialize the model
model = BiLSTMForClassification(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    num_labels=NUM_LABELS
)

model.cuda()  # Move the model to GPU if available


# Set the device (use GPU if available, otherwise fallback to CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BiLSTMForClassification(
  (embedding): Embedding(50265, 200)
  (lstm): LSTM(200, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=5, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [6]:
import torch.optim as optim

from torch.optim import Adam
from sklearn.ensemble import RandomForestClassifier

# Assuming model is defined and outputs logits (not probabilities)

# Define the loss function (BCE with logits)
loss_fn = nn.BCEWithLogitsLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss, total_accuracy = 0, 0

    for batch in train_dataloader:
        # Move batch to GPU
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.cuda() for t in batch)

        optimizer.zero_grad()

        # Forward pass
        logits = model(batch_input_ids, batch_lengths)

        # Compute loss
        loss = loss_fn(logits, batch_labels.float())  # Ensure labels are float for BCEWithLogitsLoss
        total_loss += loss.item()

        # Backward pass and optimizer step
        loss.backward()
        optimizer.step()

        # Calculate predictions and accuracy
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).float()  # Threshold for binary classification
        total_accuracy += (preds.cpu() == batch_labels.cpu()).float().mean()

    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_accuracy / len(train_dataloader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Validation
    model.eval()
    total_val_loss, total_val_accuracy = 0, 0
    with torch.no_grad():
        for batch in validation_dataloader:
            batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.cuda() for t in batch)

            logits = model(batch_input_ids, batch_lengths)
            loss = loss_fn(logits, batch_labels.float())
            total_val_loss += loss.item()

            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()
            total_val_accuracy += (preds.cpu() == batch_labels.cpu()).float().mean()

    avg_val_loss = total_val_loss / len(validation_dataloader)
    val_accuracy = total_val_accuracy / len(validation_dataloader)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


# Train Random Forest Classifier
all_train_outputs = []
all_train_labels = []
with torch.no_grad():
    for batch in train_dataloader:
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.cuda() for t in batch)

        outputs = model(batch_input_ids, batch_lengths)
        all_train_outputs.extend(outputs.cpu().detach().numpy())
        all_train_labels.extend(batch_labels.cpu().numpy())


# Initialize the Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1)

# Now you can call random_forest.fit() as intended
random_forest.fit(all_train_outputs, all_train_labels)


# Evaluate Random Forest on Validation Data
total_val_accuracy = 0
with torch.no_grad():
    for batch in validation_dataloader:
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.cuda() for t in batch)

        outputs = model(batch_input_ids, batch_lengths)
        preds = random_forest.predict(outputs.cpu().detach().numpy())
        total_val_accuracy += (preds == batch_labels.cpu().numpy()).mean()

val_accuracy = total_val_accuracy / len(validation_dataloader)
print(f"Random Forest Validation Accuracy: {val_accuracy:.4f}")

Epoch 1 - Train Loss: 0.2956, Train Accuracy: 0.9051
Epoch 1 - Validation Loss: 0.2567, Validation Accuracy: 0.9095
Epoch 2 - Train Loss: 0.2542, Train Accuracy: 0.9098
Epoch 2 - Validation Loss: 0.2541, Validation Accuracy: 0.9081
Epoch 3 - Train Loss: 0.2397, Train Accuracy: 0.9118
Epoch 3 - Validation Loss: 0.2299, Validation Accuracy: 0.9156
Epoch 4 - Train Loss: 0.2260, Train Accuracy: 0.9160
Epoch 4 - Validation Loss: 0.2414, Validation Accuracy: 0.9151
Epoch 5 - Train Loss: 0.2162, Train Accuracy: 0.9188
Epoch 5 - Validation Loss: 0.2295, Validation Accuracy: 0.9166
Random Forest Validation Accuracy: 0.9178


In [7]:
total_val_accuracy += (preds == batch_labels.cpu().numpy()).mean()
print(f" Validation Accuracy: {total_val_accuracy:.4f}")

 Validation Accuracy: 57.8063


RANDOM FOREST


------------------------------------------------------

In [16]:
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.tree import DecisionTreeClassifier
import numpy as np

num_epochs = 7

criterion = nn.BCEWithLogitsLoss()

for epoch in range(num_epochs):
    model.train()
    total_loss, total_accuracy = 0, 0

    for batch in train_dataloader:
        input_ids, attention_masks, labels, seq_lengths = [t.to(device) for t in batch]

        optimizer.zero_grad()

        outputs = model(input_ids, seq_lengths)

        loss = criterion(outputs, labels.float())
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).float()  # Threshold at 0.5
        total_accuracy += (preds.cpu() == labels.cpu()).float().mean()

    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_accuracy / len(train_dataloader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Validation
    model.eval()
    total_val_loss, total_val_accuracy = 0, 0
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids, attention_masks, labels, seq_lengths = [t.to(device) for t in batch]

            outputs = model(input_ids, seq_lengths)
            val_loss = criterion(outputs, labels.float())
            total_val_loss += val_loss.item()

            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).float()
            total_val_accuracy += (preds.cpu() == labels.cpu()).float().mean()

    avg_val_loss = total_val_loss / len(validation_dataloader)
    val_accuracy = total_val_accuracy / len(validation_dataloader)
    print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Epoch 1 - Train Loss: 0.2063, Train Accuracy: 0.9214
Epoch 1 - Validation Loss: 0.2222, Validation Accuracy: 0.9210
Epoch 2 - Train Loss: 0.1977, Train Accuracy: 0.9252
Epoch 2 - Validation Loss: 0.2249, Validation Accuracy: 0.9213
Epoch 3 - Train Loss: 0.1893, Train Accuracy: 0.9262
Epoch 3 - Validation Loss: 0.2353, Validation Accuracy: 0.9151
Epoch 4 - Train Loss: 0.1807, Train Accuracy: 0.9301
Epoch 4 - Validation Loss: 0.2252, Validation Accuracy: 0.9230
Epoch 5 - Train Loss: 0.1727, Train Accuracy: 0.9334
Epoch 5 - Validation Loss: 0.2291, Validation Accuracy: 0.9242
Epoch 6 - Train Loss: 0.1640, Train Accuracy: 0.9351
Epoch 6 - Validation Loss: 0.2480, Validation Accuracy: 0.9245
Epoch 7 - Train Loss: 0.1574, Train Accuracy: 0.9375
Epoch 7 - Validation Loss: 0.2419, Validation Accuracy: 0.9236


In [17]:
# Evaluate on Test Set
model.eval()
total_test_accuracy = 0
with torch.no_grad():
    for batch in test_dataloader:
        batch_input_ids, batch_masks, batch_labels, batch_lengths = tuple(t.cuda() for t in batch)

        outputs = model(batch_input_ids, batch_lengths)
        preds = random_forest.predict(outputs.cpu().detach().numpy())
        total_test_accuracy += (preds == batch_labels.cpu().numpy()).mean()

test_accuracy = total_test_accuracy / len(test_dataloader)
print(f"Test Accuracy: {test_accuracy:.4f}")


Test Accuracy: 0.9178


In [None]:
import torch
import numpy as np

def predict_comment_classification(comment, model, tokenizer, device):
    model.eval()  # Set model to evaluation mode

    # Tokenize the comment
    encoded = tokenizer.encode_plus(
        comment,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    seq_length = torch.tensor([torch.sum(attention_mask).item()]).to(device)

    with torch.no_grad():
        logits = model(input_ids, seq_length)  # Get raw logits from BiLSTM model
        probabilities = torch.sigmoid(logits).cpu().numpy().flatten()  # Convert to probabilities

    label_dict = {
        0: '1. threats',
        1: '2. derogation',
        2: '3. animosity',
        3: '4. prejudiced discussions',
        4: 'none'
    }

    # Map probabilities to labels
    result = {label_dict[i]: prob for i, prob in enumerate(probabilities)}

    return result

# Example usage
comment = "Lol - a story from another culture. 2000 years, all Eves destroy all edens. 4000 years, (epic of Gilgamesh) all women belittle men."
probabilities = predict_comment_classification(comment, model, tokenizer, device)

print("Classification Probabilities:")
for label, prob in probabilities.items():
    print(f"{label}: {prob:.4f}")


Classification Probabilities:
1. threats: 0.0006
2. derogation: 0.1610
3. animosity: 0.1291
4. prejudiced discussions: 0.0312
none: 0.6237
