# Jigsaw Rate Severity of Toxic Comments | Checkpoint 2


### Imports

In [35]:
import pandas as pd
import numpy as np
import re
import gc
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from tqdm import tqdm
import torch.cuda.amp as amp

##### Checking for cuda and clearing cash

In [36]:
# Check CUDA availability and set device
print("CUDA Available:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
gc.collect()

CUDA Available: True
Device: cuda


40

#### Loading the Training Data

In [37]:
# Load the data
train_df = pd.read_csv("train.csv")
print("Train Data Sample:")
print(train_df.head())
print(train_df.info())

Train Data Sample:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count  

#### Cleaning text | Optional

In [38]:
# Cleaning of the text
def clean_text(text):
    text = text.strip()
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text) # REmoves special characters
    return text

train_df["clean_text"] = train_df["comment_text"].apply(clean_text)

## Creating Pytorch Dataset for Training

In [39]:
from torch.utils.data import Dataset, DataLoader

label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

class ToxicCommentsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.texts = dataframe["clean_text"].tolist()
        # Convert labels to float tensor for multi-label classification
        self.labels = dataframe[label_cols].values.astype(float)
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        # Tokenize the text
        tokens = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Squeeze to remove extra dimension
        input_ids = tokens["input_ids"].squeeze(0)
        attention_mask = tokens["attention_mask"].squeeze(0)
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": label
        }


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Create Dataset and DataLoader
train_dataset = ToxicCommentsDataset(train_df, tokenizer, max_length=256)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, pin_memory=True)

## Model Setup

In [40]:
# Initialize the model for multi-label classification (6 labels)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
model.to(device)

# loss function BCEWithLogitsLoss for multi-label classification
loss_fn = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Enable automatic mixed precision
scaler = amp.GradScaler()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = amp.GradScaler()


## Training Loop

In [41]:
num_epochs = 1 

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    
    for batch in loop:
        optimizer.zero_grad()
        
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        with amp.autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  
            loss = loss_fn(logits, labels)
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        epoch_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    
    print(f"Epoch {epoch+1} completed. Average Loss: {epoch_loss/len(train_dataloader):.4f}")

print("Training complete!")

  with amp.autocast():
Epoch 1: 100%|███████████████████████████████████████████████████████| 9974/9974 [41:29<00:00,  4.01it/s, loss=0.00098]

Epoch 1 completed. Average Loss: 0.0501
Training complete!





### Prediction and Submission for Kaggle

In [43]:
def predict_score(text):
    model.eval()
    tokens = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256,
        return_tensors="pt"
    )
    tokens = {key: val.to(device) for key, val in tokens.items()}
    
    with torch.no_grad():
        outputs = model(**tokens)
        logits = outputs.logits
        probabilities = torch.sigmoid(logits).squeeze().tolist()  # Convert to list

    # Assign different weights to different toxicity labels
    weights = [1.0, 1.7, 1.3, 1.4, 1.2, 1.5]  # Adjust as needed

    # Compute weighted sum instead of simple sum
    score = sum(p * w for p, w in zip(probabilities, weights))

    return score

# Load test data for scoring
test_df = pd.read_csv("comments_to_score.csv")
print("Test Data Sample:")
print(test_df.head())
print(test_df.info())

# Apply prediction function
print("Predicting toxicity scores...")
test_df["score"] = test_df["text"].apply(predict_score)

# Create the submission file
submission_df = test_df[["comment_id", "score"]]
submission_df.to_csv("submission1.csv", index=False)
print("Submission file saved as submission.csv")

Test Data Sample:
   comment_id                                               text
0      114890  "\n \n\nGjalexei, you asked about whether ther...
1      732895  Looks like be have an abuser , can you please ...
2     1139051  I confess to having complete (and apparently b...
3     1434512  "\n\nFreud's ideas are certainly much discusse...
4     2084821  It is not just you. This is a laundry list of ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7537 entries, 0 to 7536
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  7537 non-null   int64 
 1   text        7537 non-null   object
dtypes: int64(1), object(1)
memory usage: 117.9+ KB
None
Predicting toxicity scores...
Submission file saved as submission.csv
