In [1]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BertModel

from torchmultimodal.modules.encoders.bert_text_encoder import bert_text_encoder

import os
import json
import yaml
from tqdm import tqdm

import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "bert-base-uncased" checkpoint and print its module tree for inspection.
# NOTE(review): `text_enc` is not referenced by any later cell visible here — confirm it is still needed.
text_enc = BertModel.from_pretrained("bert-base-uncased")
print(text_enc)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [51]:
# opening the config file and extracting the parameters
with open("cfg.yaml", "r") as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser message, then stop: carrying on would only fail a few
        # lines later with a NameError on `config` that hides the real cause.
        print(exc)
        raise

# An empty or scalar-only file parses to something other than a mapping.
if not isinstance(config, dict):
    raise ValueError("cfg.yaml must contain a top-level mapping with 'text' and 'training' sections")

# text
max_len = config["text"]["max_len"]
hidden_size = config["text"]["hidden_size"]

#training
batch_size = config["training"]["batch_size"]
lr = config["training"]["lr"]
epochs = config["training"]["epochs"]

In [None]:
class PredictionModel(nn.Module):
    """Text classifier: an encoder, mean pooling over tokens, and an MLP head.

    Args:
        text_encoder: Module called as ``text_encoder(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        output_dim: Number of units produced by the head (each passed through a sigmoid).
        hidden_dim: Width of the two hidden layers. Defaults to the notebook-level
            ``hidden_size`` read from the config.
        encoder_dim: Feature size of the encoder output. When omitted it is read from
            ``text_encoder.output_dim`` or, failing that, ``text_encoder.config.hidden_size``.

    Raises:
        AttributeError: If ``encoder_dim`` is not given and cannot be found on the encoder.
    """

    def __init__(self, text_encoder, output_dim, hidden_dim=None, encoder_dim=None):
        super().__init__()
        self.text_encoder = text_encoder

        if hidden_dim is None:
            hidden_dim = hidden_size  # notebook-level value from the config cell
        if encoder_dim is None:
            encoder_dim = self._infer_encoder_dim(text_encoder)

        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid(),
        )
        # Only the head is re-initialised; the encoder keeps its own weights.
        self.classifier.apply(self._init_weights)

    @staticmethod
    def _infer_encoder_dim(text_encoder):
        """Look up the encoder's output width from the attributes it may expose."""
        width = getattr(text_encoder, "output_dim", None)
        if width is None:
            encoder_config = getattr(text_encoder, "config", None)
            width = getattr(encoder_config, "hidden_size", None)
        if width is None:
            raise AttributeError(
                "text_encoder exposes neither 'output_dim' nor 'config.hidden_size'; "
                "pass encoder_dim explicitly"
            )
        return width

    def _init_weights(self, module):
        """Initialise one sub-module: N(0, 0.02) weights, zero biases, unit LayerNorm scale."""
        with torch.no_grad():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.LayerNorm):
                nn.init.zeros_(module.bias)
                nn.init.ones_(module.weight)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.padding_idx is not None:
                    module.weight[module.padding_idx].zero_()

    def forward(self, input_ids, attention_mask):
        """Encode the tokens, average them over the sequence axis and classify.

        Positions whose ``attention_mask`` entry is 0 are left out of the average, so
        padding does not dilute the pooled features. With ``attention_mask=None`` every
        position is averaged.
        Assumes ``last_hidden_state`` is laid out as (batch, seq_len, dim) — TODO confirm
        for encoders other than the one used in this notebook.
        """
        hidden = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        if attention_mask is None:
            text_features = hidden.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
            # clamp avoids 0/0 for a row whose mask is entirely zero
            text_features = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        return self.classifier(text_features)
    
# Build the text encoder with bert_text_encoder's default arguments and wrap it in the classifier.
# NOTE(review): PredictionModel reads `text_encoder.output_dim` — confirm the object returned by
# bert_text_encoder() exposes that attribute. Presumably the default call gives freshly
# initialised (not pretrained) weights — TODO confirm.
text_encoder = bert_text_encoder()
model = PredictionModel(text_encoder, output_dim=1) # can be changed to the number of classes (types of hate speech)

In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint; tokenize_text reads it as a global.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize_text(text, max_len=512):
    """Encode ``text`` with the notebook-level ``tokenizer``.

    The text is truncated or padded to exactly ``max_len`` tokens and returned as
    PyTorch tensors.

    Returns:
        ``(input_ids, attention_mask)``, each with its leading axis squeezed away.
    """
    encoding = tokenizer(
        text,
        max_length=max_len,    # target sequence length
        truncation=True,       # cut anything beyond max_len
        padding="max_length",  # fill shorter inputs up to max_len
        return_tensors="pt",   # PyTorch tensors
    )
    ids, mask = (encoding[field].squeeze(0) for field in ("input_ids", "attention_mask"))
    return ids, mask

In [54]:
def collate_fn(batch):
    """Merge ``(input_ids, attention_mask, label)`` samples into batch tensors.

    The per-sample id and mask tensors must share one shape so they can be stacked
    along a new leading axis; the labels are gathered into a single tensor.
    """
    ids_per_sample, masks_per_sample, label_per_sample = zip(*batch)
    return (
        torch.stack(ids_per_sample),
        torch.stack(masks_per_sample),
        torch.tensor(label_per_sample),
    )

In [55]:
class AudioTextDataset(Dataset):
    """Dataset of ``(input_ids, attention_mask, label)`` triples built from raw text.

    Args:
        text_data: Sequence (list, pandas Series, ...) of text strings.
        labels: Sequence of labels aligned with ``text_data``.
        train: When true, each access drops a random number of leading words from the text.
        text_transform: Callable ``(text, max_len=...) -> (input_ids, attention_mask)``.
            It is required by ``__getitem__``.
        max_len: Forwarded to ``text_transform``.
    """

    def __init__(self, text_data, labels, train, text_transform=None, max_len=512):
        self.text_data = text_data
        self.labels = labels
        self.text_transform = text_transform
        self.max_len = max_len
        self.train = train

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _item_at(container, idx):
        """Return the element at position ``idx``.

        pandas objects are read through ``.iloc`` so the lookup is positional even
        when their index is not 0..n-1.
        """
        positional = getattr(container, "iloc", None)
        return container[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return the tokenised text and label at position ``idx``.

        Raises:
            ValueError: If the dataset was created without a ``text_transform``.
        """
        if not self.text_transform:
            raise ValueError("AudioTextDataset needs a text_transform to turn text into tensors")

        text = self._item_at(self.text_data, idx)
        if self.train:
            words = text.split()
            # randint(0, 0) raises, so empty or whitespace-only text is passed through uncropped
            if words:
                divider = np.random.randint(0, len(words))
                text = " ".join(words[divider:])

        input_ids, attention_mask = self.text_transform(text, max_len=self.max_len)
        label = self._item_at(self.labels, idx)
        return input_ids, attention_mask, label

In [None]:
# Share of rows held out for validation, and the seed that makes the split repeatable.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

df = pd.read_csv("data/data_2.csv")
text_data = df["text"]    # Series of text strings
labels = df["label"]      # Series of labels
print(text_data[0], labels[0])

# Shuffle once and carve out a validation slice, so validation metrics are computed on
# rows the model was not trained on (previously both loaders wrapped the same rows).
shuffled_df = df.sample(frac=1.0, random_state=SPLIT_SEED)
n_val = max(1, int(len(shuffled_df) * VAL_FRACTION))
# The dataset indexes its inputs with plain integers, so each part gets a fresh 0..n-1 index.
val_df = shuffled_df.iloc[:n_val].reset_index(drop=True)
train_df = shuffled_df.iloc[n_val:].reset_index(drop=True)
assert len(train_df) > 0, "not enough rows left for training after the validation split"

# Instantiate dataset and dataloaders
train_dataset = AudioTextDataset(train_df["text"], train_df["label"], text_transform=tokenize_text, max_len=max_len, train=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

val_dataset = AudioTextDataset(val_df["text"], val_df["label"], text_transform=tokenize_text, max_len=max_len, train=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

As of March 13th , 2014 , the booklet had been downloaded over 18,300 times and counting . 0


In [57]:
def train(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Every batch is moved to ``device``, pushed through ``model``, scored with
    ``criterion`` and used for a single optimizer step.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for batch in tqdm(dataloader):
        ids, mask, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        predictions = model(ids, mask)
        batch_loss = criterion(predictions, targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, criterion, device, threshold=0.5):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        criterion: Callable ``(outputs, labels) -> loss tensor``.
        device: Device every batch is moved to.
        threshold: Cut-off applied when the model emits a single value per sample.
            Outputs at or above it count as class 1. The default suits probabilities
            such as sigmoid outputs.

    Returns:
        ``(mean batch loss, accuracy)``.
    """
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(dataloader):
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask)
            # Compute and accumulate loss
            total_loss += criterion(outputs, labels).item()

            if outputs.dim() > 1 and outputs.size(-1) > 1:
                # one score per class: pick the highest-scoring class
                _, predicted = torch.max(outputs, 1)
            else:
                # a single score per sample: argmax over one column would always be 0
                predicted = (outputs.reshape(-1) >= threshold).long()

            # flatten so (batch, 1) labels cannot broadcast against (batch,) predictions
            targets = labels.reshape(-1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

In [58]:
# Move model to device
# NOTE(review): the last recorded run of this cell ended in a CUDA out-of-memory error;
# presumably batch_size / max_len in cfg.yaml are too large for the GPU, or earlier
# allocations were still held by the kernel — confirm before re-running.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
optimizer = Adam(model.parameters(), lr=lr)

# The model ends in nn.Sigmoid and is built with output_dim=1, i.e. it emits one
# probability per sample. Binary cross-entropy is the matching loss; CrossEntropyLoss
# would treat the single column as a one-class problem.
_bce_loss = nn.BCELoss()


def criterion(outputs, labels):
    """Binary cross-entropy between per-sample probabilities and 0/1 labels.

    Both arguments are flattened and the labels cast to the outputs' dtype, because
    nn.BCELoss requires matching shapes and floating-point targets.
    """
    flat_outputs = outputs.reshape(-1)
    return _bce_loss(flat_outputs, labels.reshape(-1).to(flat_outputs.dtype))


# Train and evaluate
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 7.75 GiB of which 22.06 MiB is free. Including non-PyTorch memory, this process has 7.71 GiB memory in use. Of the allocated memory 7.55 GiB is allocated by PyTorch, and 51.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)