In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_labels = pd.read_csv('test_labels.csv')

In [3]:
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])
#train['comment_text'] = train['comment_text'].str.lower()

In [4]:
import re

# Define a function to remove punctuation using regular expressions
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# Apply the function to the 'text' column
train['comment_text'] = train['comment_text'].apply(remove_punctuation)
test['comment_text'] = test['comment_text'].apply(remove_punctuation)

In [5]:
# Define a function to remove special characters using regular expressions
def remove_special_characters(text):
    # Define a regular expression pattern to match special characters
    pattern = r'[^a-zA-Z0-9\s]'  # This pattern matches any character that is not a letter, digit, or whitespace
    return re.sub(pattern, '', text)

# Apply the function to the 'text' column
train['comment_text'] = train['comment_text'].apply(remove_special_characters)
test['comment_text'] = test['comment_text'].apply(remove_special_characters)

In [6]:
train

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,\nMore\nI cant make any real suggestions on im...,0,0,0,0,0,0
4,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...
159566,And for the second time of asking when your vi...,0,0,0,0,0,0
159567,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,Spitzer \n\nUmm theres no actual article for p...,0,0,0,0,0,0
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [7]:
from transformers import BertTokenizer, BertModel, BertPreTrainedModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize Comments - using max token length along with attention mask
tokenized_data = train['comment_text'].apply(lambda x: tokenizer(x, padding = 'max_length', max_length = 512, truncation = True, return_tensors='pt'))
tokenized_eval_data = test['comment_text'].apply(lambda x: tokenizer(x, padding = 'max_length', max_length = 512, truncation = True, return_tensors='pt'))

In [8]:
# Extract the labels
labels = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
eval_labels = test_labels[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
# Convert to a PyTorch tensor
labels_tensor = torch.tensor(labels, dtype=torch.float32)
eval_labels_tensor = torch.tensor(eval_labels, dtype=torch.float32)

In [9]:
# Custom dataset class
class ToxicCommentsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


In [10]:
# Prepare the encodings
encodings = {
    'input_ids': torch.cat([x['input_ids'] for x in tokenized_data]),
    'attention_mask': torch.cat([x['attention_mask'] for x in tokenized_data])
}
# Create the dataset
dataset = ToxicCommentsDataset(encodings, labels_tensor)

In [11]:
# Prepare the encodings
eval_encodings = {
    'input_ids': torch.cat([x['input_ids'] for x in tokenized_eval_data]),
    'attention_mask': torch.cat([x['attention_mask'] for x in tokenized_eval_data])
}
# Create the dataset
eval_dataset = ToxicCommentsDataset(eval_encodings, eval_labels_tensor)

In [12]:
import torch.nn as nn

In [13]:
# Define a custom model for multi-label classification
class BertForMultiLabelClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        
        return (loss, logits) if loss is not None else logits

In [14]:
from transformers import Trainer, TrainingArguments, BertConfig

In [None]:
# Define the number of labels
num_labels = 6

# Create a configuration object with the appropriate number of labels
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Initialize the model with the configuration
model = BertForMultiLabelClassification.from_pretrained('bert-base-uncased', config=config)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()

Some weights of BertForMultiLabelClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss



KeyboardInterrupt

