In [30]:
import pickle
from ast import literal_eval

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Cell output: name of the first GPU, or "cpu" when no CUDA device is visible
# (torch.cuda.get_device_name raises if there is no GPU to query).
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

'NVIDIA GeForce GTX 1070 Ti'

In [3]:
# Load the cleaned comments and drop the 'Unnamed: 0' column (presumably a saved index).
# NOTE(review): the path uses a Windows-style separator.
df = pd.read_csv(r'Archive\clean_data.csv').drop(columns='Unnamed: 0')
# Shuffle rows with a fixed seed so a fresh run reproduces the same order
# (and therefore the same split and the same singleton indices further down).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Every column from the third onward is treated as a label column.
label_cols = list(df.columns[2:])
# One label vector (numpy array over label_cols) per row.
df['one_hot_labels'] = list(df[label_cols].values)

In [5]:
# Plain Python lists, aligned by row: raw comment text and its label vector.
comments = df['comment_text'].tolist()
labels = df['one_hot_labels'].tolist()

In [9]:
# Pretrained RoBERTa tokenizer.
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base',
    do_lower_case=False,
)
# Token budget per comment, used when encoding below.
max_length = 100

In [10]:
# Encode every comment to exactly `max_length` tokens: longer texts are truncated,
# shorter ones padded. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True`, and calling the tokenizer directly replaces the
# deprecated `batch_encode_plus`.
encodings = tokenizer(
    comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)

print('tokenizer outputs: ', encodings.keys())


tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [11]:
# Pull the two encoder inputs out of the tokenizer result.
input_ids, attention_masks = encodings['input_ids'], encodings['attention_mask']

In [12]:
# A stratified split cannot place a label combination that occurs only once,
# so find the row indices of those singleton combinations here.
label_keys = df['one_hot_labels'].astype(str)  # string form makes the label vectors countable
label_counts = label_keys.value_counts()
one_freq = label_counts[label_counts == 1].keys()
# Descending order, so the rows can later be popped without shifting the remaining indices.
one_freq_idxs = sorted(df.index[label_keys.isin(one_freq)], reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [120946, 72800]


In [13]:
# Gathering single instance inputs to force into the training set after stratified split
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

In [14]:
# The three parallel lists must still have the same length after the pops.
for seq in (input_ids, labels, attention_masks):
    print(len(seq))

157570
157570
157570


In [21]:
# Use train_test_split to split our data into train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(
    input_ids, labels, attention_masks,
    random_state=42, test_size=0.1, stratify=labels
)

# Add one frequency data to train data
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)

# Convert lists of numpy arrays to single numpy arrays
train_inputs = np.array(train_inputs)
train_labels = np.array(train_labels)
train_masks = np.array(train_masks)

validation_inputs = np.array(validation_inputs)
validation_labels = np.array(validation_labels)
validation_masks = np.array(validation_masks)

# Convert numpy arrays to torch tensors
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [22]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [23]:
# torch.save(validation_dataloader,'validation_data_loader')
# torch.save(train_dataloader,'train_data_loader')

# Load Model & Set Params

In [28]:
# RoBERTa encoder with a freshly initialised classification head, one output per label column.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_cols))
# Move to the device chosen earlier instead of calling .cuda(), which fails on a CPU-only machine.
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [36]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [39]:
# Use PyTorch's AdamW: the transformers AdamW class is deprecated in favour of it.
# eps=1e-6 and weight_decay=0.0 keep the defaults of the class it replaces, so a
# parameter group that does not set its own 'weight_decay' still gets no decay.
# Bias correction is always applied by torch.optim.AdamW.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-6, weight_decay=0.0)

# Weighted BCE

In [37]:
# 'Balanced' class weights computed separately for every label column.
# class_weights_per_label: column -> {class value: weight}
# class_weights_tensors:   column -> tensor [weight for class 0, weight for class 1] on `device`
class_weights_per_label = {}
class_weights_tensors = {}

# Iterate label_cols (the columns the label vectors are built from) so the weight
# order always matches the order of the label vectors.
for column in label_cols:
    y = df[column]
    classes = np.unique(y)
    class_weights = compute_class_weight('balanced', classes=classes, y=y)
    class_weights_dict = dict(zip(classes, class_weights))
    class_weights_per_label[column] = class_weights_dict

    # Index 0 holds the weight for class 0 and index 1 the weight for class 1, so the
    # tensor can be indexed directly with 0/1 targets; a class absent from the data gets 1.0.
    class_weights_tensor = torch.tensor(
        [class_weights_dict.get(0, 1.0), class_weights_dict.get(1, 1.0)],
        dtype=torch.float32,
    )
    class_weights_tensors[column] = class_weights_tensor.to(device)


# Train Model
**Class weights per label:** Balanced class weights are computed separately for each label and stored as tensors, so the loss function can use them to counter class imbalance.
**Weighted loss:** During training, `BCEWithLogitsLoss` is applied to each label in turn, weighting every sample by the class weight that matches its target for that label; the per-label losses are then summed. This way, the model learns to handle class imbalance better.

In [40]:
# Fine-tune the model for `epochs` passes over the training data, reporting the mean
# training loss and the validation micro-F1 / exact-match accuracy after each epoch.

# Per-step training losses, kept for plotting
train_loss_set = []

# Number of training epochs
epochs = 3

# Label order is taken from the weights dict (insertion-ordered), so position i of the
# logits/labels is paired with the i-th label the class weights were computed for.
weighted_columns = list(class_weights_tensors)

for _ in trange(epochs, desc="Epoch"):

    # Training
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()

        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]

        # Sum of the weighted BCE losses, one term per label
        loss = 0
        for i, column in enumerate(weighted_columns):
            targets = b_labels[:, i]
            # Per-sample weight: the class-0 or class-1 weight, chosen by each sample's target
            sample_weights = class_weights_tensors[column][targets.long()]
            loss_func = BCEWithLogitsLoss(weight=sample_weights)
            loss = loss + loss_func(logits[:, i], targets.type_as(logits))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Read the scalar loss once per step
        step_loss = loss.item()
        train_loss_set.append(step_loss)
        tr_loss += step_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss / nb_tr_steps))

    ###########################################################################

    # Validation
    model.eval()
    logit_preds, true_labels, pred_labels = [], [], []

    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            # Independent per-label probabilities
            pred_label = torch.sigmoid(b_logit_pred)

            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.cpu().numpy()
            b_labels = b_labels.cpu().numpy()

        logit_preds.append(b_logit_pred)
        pred_labels.append(pred_label)
        true_labels.append(b_labels)

    # Flatten the per-batch arrays into one row per validation example
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Threshold the probabilities, then score. accuracy_score on these multi-label rows
    # counts an example as correct only when every label matches.
    threshold = 0.50
    pred_bools = [pl > threshold for pl in pred_labels]
    true_bools = [tl == 1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools, pred_bools, average='micro') * 100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools) * 100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 1.2333967415985756


Epoch:  33%|███▎      | 1/3 [32:10<1:04:21, 1930.68s/it]

F1 Validation Accuracy:  50.440511759748716
Flat Validation Accuracy:  82.6489814050898
Train loss: 0.8746429313275129


Epoch:  67%|██████▋   | 2/3 [1:04:01<31:58, 1918.99s/it]

F1 Validation Accuracy:  45.33892572047084
Flat Validation Accuracy:  81.25277654375833
Train loss: 0.7434111261641143


Epoch: 100%|██████████| 3/3 [1:35:48<00:00, 1916.05s/it]

F1 Validation Accuracy:  51.57381177211205
Flat Validation Accuracy:  84.7179031541537





In [41]:
# Persist only the learned parameters (the state dict), not the whole model object.
torch.save(
    obj=model.state_dict(),
    f='roberta_model_weighted_comment',
)