<a href="https://colab.research.google.com/github/charaannn/snapsort/blob/main/snapsort.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer

# Load the training data. The path points at /content, Colab's default upload
# directory; change it if train.csv lives somewhere else.
df = pd.read_csv("/content/train.csv")

# Display dataset info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (doing so appended a stray "None" line).
print("Dataset Info:")
df.info()

# Show first few rows
print("\nSample Data:")
print(df.head())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Drop rows with missing comments, then renumber the rows 0..n-1 so that the
# index labels keep matching the positional indices a DataLoader sampler emits.
df = df.dropna(subset=["comment_text"]).reset_index(drop=True)

# Define preprocessing function
def clean_text(text):
    """Normalise a raw comment.

    Lower-cases the text, replaces every run of non-word characters
    (regex ``\\W+``) with a single space and strips leading/trailing
    whitespace.

    Args:
        text: The raw comment. ``None`` and float NaN (pandas' marker for a
            missing value) yield an empty string; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned comment, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Collapse runs of special characters into one space
    return text.strip()  # Remove leading/trailing spaces

# Store the normalised text next to the raw comment
df["clean_comment"] = df["comment_text"].map(clean_text)

# Preview raw vs. cleaned text side by side
print("\nCleaned Text Samples:")
print(df.loc[:, ["comment_text", "clean_comment"]].head())

# WordPiece tokenizer matching the uncased BERT checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode the first cleaned comment as a quick smoke test
sample_text = df["clean_comment"].iloc[0]
tokens = tokenizer(
    sample_text,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

print("\nTokenized Sample Comment:")
print(tokens)

# Collapse the six per-category flags into one binary target:
# a row is toxic (1) as soon as any category is flagged, otherwise 0.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
df["toxic_label"] = df[label_cols].max(axis=1)

# How many rows fall into each class
print("\nLabel Distribution:")
print(df["toxic_label"].value_counts())

# Persist the preprocessed frame (without the index column)
df.to_csv("cleaned_train.csv", index=False)
print("\nPreprocessed dataset saved as 'cleaned_train.csv'.")


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None

Sample Data:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Tokenized Sample Comment:
{'input_ids': tensor([[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
         18442, 13076, 12392,  2050,  5470,  2020, 16407,  2027,  4694,  1056,
          3158,  9305, 22556,  2074,  8503,  2006,  2070,  3806,  2044,  1045,
          5444,  2012,  2047,  2259, 14421,  6904,  2278,  1998,  3531,  2123,
          1056,  6366,  1996, 23561,  2013,  1996,  2831,  3931,  2144,  1045,
          1049,  3394,  2085,  6486, 16327,  4229,  2676,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

Label Distribution:
toxic_label
0    143346
1     1

In [None]:
# Column dtypes of the working frame, including the clean_comment and
# toxic_label columns added during preprocessing.
print(df.dtypes)

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
clean_comment    object
toxic_label       int64
dtype: object


In [None]:
def __getitem__(self, index):
    """Build one tokenised example for the row at position ``index``.

    NOTE(review): this function is defined at module level, so it is not
    attached to any class; it looks like a draft of
    ``ToxicCommentDataset.__getitem__`` - confirm whether it is still needed.

    Args:
        self: Object exposing ``data`` (a DataFrame with ``comment_text`` and
            the six label columns), ``tokenizer`` (callable) and
            ``max_length``.
        index: Zero-based row position.

    Returns:
        dict: ``input_ids`` and ``attention_mask`` with the leading batch
        dimension squeezed away, plus ``labels`` as a float tensor holding
        the six label values.
    """
    # Positional lookup: ``.loc`` is label-based and raises KeyError once the
    # frame's index is no longer 0..n-1 (e.g. after dropna or a split).
    row = self.data.iloc[index]
    comment = str(row["comment_text"])  # Use the correct text column

    # Select only numerical label columns
    label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    labels = row[label_cols].values.astype(float)  # Ensure labels are numeric

    # Tokenize the comment, padding/truncating to exactly max_length tokens
    tokens = self.tokenizer(
        comment,
        padding="max_length",
        truncation=True,
        max_length=self.max_length,
        return_tensors="pt"
    )

    return {
        "input_ids": tokens["input_ids"].squeeze(0),
        "attention_mask": tokens["attention_mask"].squeeze(0),
        "labels": torch.tensor(labels, dtype=torch.float)
    }


In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel (a bare "!pip" may target a different Python). Run this cell
# before any cell that imports these packages on a fresh runtime.
%pip install -q transformers torch



In [None]:
# Sanity check of the working frame before it is wrapped in a Dataset.
print(df.head())  # Check column names
print(df.dtypes)  # Check data types

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  \
0             0        0       0       0              0   
1             0        0       0       0              0   
2             0        0       0       0              0   
3             0        0       0       0              0   
4             0        0       0       0              0   

                                       clean_comment  toxic_label  
0  explanation why the edits made under my userna...            0  
1  d aww he matches th

In [None]:
import torch
from torch.utils.data import Dataset

class ToxicCommentDataset(Dataset):
    """Map-style dataset yielding tokenised comments with six float labels.

    Args:
        dataframe: Frame with a ``comment_text`` column and the six label
            columns listed in ``label_columns``. It is copied, never modified.
        tokenizer: Callable accepting ``(text, max_length=..., padding=...,
            truncation=..., return_tensors="pt")`` and returning a mapping
            with ``input_ids`` and ``attention_mask`` tensors that carry a
            leading batch dimension of 1.
        max_length: Length every comment is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        # Define the label columns explicitly
        self.label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

        # reset_index returns a new frame, so the float conversion below no
        # longer leaks into the caller's DataFrame, and the fresh 0..n-1 index
        # makes positional access safe even after rows were dropped upstream.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Ensure labels are floats (for PyTorch)
        self.data[self.label_columns] = self.data[self.label_columns].astype(float)

    def __len__(self):
        """Return the number of comments."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the example at zero-based position ``index``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` with the batch
            dimension squeezed away, and ``labels`` as a float32 tensor with
            one entry per label column.
        """
        row = self.data.iloc[index]  # positional, independent of index labels
        comment = str(row["comment_text"])  # Get text
        labels = torch.tensor(row[self.label_columns].values.astype(float), dtype=torch.float)

        # Tokenize text
        encoding = self.tokenizer(
            comment,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Build the dataset from the preprocessed frame (max_length defaults to 128)
dataset = ToxicCommentDataset(df, tokenizer)

# Check a sample: prints the dict of input_ids, attention_mask and labels for item 0
print(dataset[0])


{'input_ids': tensor([  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
        18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
         1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
         3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
         1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
         1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
         6486,  1012, 16327,  1012,  4229,  1012,  2676,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [None]:
from torch.utils.data import DataLoader

batch_size = 16  # Samples per mini-batch; adjust based on your system's memory
# Shuffle so that every epoch visits the comments in a different order
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Check one batch: print the first collated batch, then stop iterating
for batch in train_dataloader:
    print(batch)
    break


{'input_ids': tensor([[ 101, 1000, 1045,  ...,    0,    0,    0],
        [ 101, 2017, 2064,  ...,    0,    0,    0],
        [ 101, 1045, 6592,  ...,    0,    0,    0],
        ...,
        [ 101, 2313, 3985,  ...,    0,    0,    0],
        [ 101, 4487, 9284,  ...,    0,    0,    0],
        [ 101, 2175, 3805,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[1., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
    

In [None]:
from transformers import DistilBertModel
import torch.nn as nn
import torch

class ToxicCommentClassifier(nn.Module):
    """DistilBERT encoder topped with dropout and a linear head.

    Args:
        num_labels: Number of output logits produced per input sequence.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits for a batch of token sequences."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0, i.e. the [CLS] token
        cls_vector = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls_vector))


In [None]:
num_labels = 6  # One logit per label column (toxic ... identity_hate); adjust based on your dataset
# Instantiating the classifier loads the pretrained DistilBERT weights
model = ToxicCommentClassifier(num_labels)


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
import torch.optim as optim
from torch.nn import BCEWithLogitsLoss

# BCEWithLogitsLoss expects raw logits (it applies the sigmoid itself) and
# treats every label as an independent binary target.
criterion = BCEWithLogitsLoss()  # For multi-label classification
# AdamW over all parameters of the current `model`
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)


In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to that device; as the cell's last expression this also displays the module tree
model.to(device)


ToxicCommentClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

In [None]:
import torch
print(torch.cuda.is_available())  # Should print True on a GPU runtime
# get_device_name() raises when no CUDA device is present, so only query it
# on GPU runtimes instead of aborting the cell on CPU-only ones.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Should show "Tesla T4" or similar
else:
    print("No CUDA device available; running on CPU.")

True
Tesla T4


In [None]:
# Import necessary libraries
import torch
from transformers import DistilBertForSequenceClassification

# Define the model: DistilBERT with a built-in classification head of 6 outputs.
# NOTE(review): this rebinds `model`. The `optimizer` created in an earlier cell
# was built from the previous model's parameters, so it has to be re-created
# before this model is trained.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6)

# Move model to GPU if available (as the last expression this also displays the module tree)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Move every batch onto the target device.
# NOTE(review): the loop only rebinds the three names, so after it finishes
# they hold the tensors of the *last* batch - confirm this is intended.
for batch in train_dataloader:
    if isinstance(batch, dict):
        # Iterating a dict yields its keys (strings), not the tensors, so
        # dict batches must be read by name.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
    else:
        # Tuple/list batches (e.g. from a TensorDataset) unpack positionally.
        input_ids, attention_mask, labels = [t.to(device) for t in batch]

In [None]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    """Map-style dataset over parallel input_ids / attention_mask / labels.

    The three sequences can be handed to the constructor. For backward
    compatibility any sequence that is omitted is looked up, at access time,
    in the module-level variable of the same name - which is what the
    original implementation always did.

    Args:
        input_ids: Indexable collection of token-id rows (optional).
        attention_mask: Indexable collection of mask rows (optional).
        labels: Indexable collection of labels (optional).
    """

    def __init__(self, input_ids=None, attention_mask=None, labels=None):
        self._input_ids = input_ids
        self._attention_mask = attention_mask
        self._labels = labels

    def _source(self, name):
        """Return the sequence called ``name``: constructor value, else module global."""
        value = getattr(self, "_" + name)
        return globals()[name] if value is None else value

    @staticmethod
    def _to_tensor(value):
        """Copy ``value`` into a new tensor without the copy-construct warning."""
        if isinstance(value, torch.Tensor):
            # Equivalent to torch.tensor(value) but without the UserWarning.
            return value.clone().detach()
        return torch.tensor(value)

    def __len__(self):
        # Needed by DataLoader's default samplers; the class had no __len__ before.
        return len(self._source("input_ids"))

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a dict of tensors."""
        return {
            'input_ids': self._to_tensor(self._source("input_ids")[idx]),
            'attention_mask': self._to_tensor(self._source("attention_mask")[idx]),
            'labels': self._to_tensor(self._source("labels")[idx])
        }


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
import torch.nn as nn

# Synthetic stand-in data (swap in your real dataset):
# 100 samples of 128 random token ids each, drawn from [0, 30522)
train_inputs = torch.randint(low=0, high=30522, size=(100, 128))
# All-ones attention masks, i.e. no position is masked out
train_masks = torch.ones((100, 128))
# One random class id in [0, 6) per sample
train_labels = torch.randint(low=0, high=6, size=(100,))

# TensorDataset yields (inputs, mask, label) tuples rather than dicts
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)

# Shuffled mini-batches of 16 samples
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Define a simple model for testing (replace with your model)
class SimpleModel(nn.Module):
    """Smoke-test model: one linear layer from a 128-wide row to class logits.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.fc = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, input_ids, attention_mask):
        """Cast ``input_ids`` to float and project them; ``attention_mask`` is ignored."""
        features = input_ids.float()
        return self.fc(features)

model = SimpleModel()  # fresh, untrained instance

# AdamW over every parameter of the model
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Multi-class cross-entropy computed on raw logits
criterion = nn.CrossEntropyLoss()

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: prints the loss summed over all batches of each epoch
num_epochs = 3
for epoch in range(num_epochs):
    model.train()  # enable training-mode behaviour
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        # TensorDataset batches are (inputs, mask, labels); move each to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Forward pass and loss
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

print("Training Complete!")


Epoch 1/3 - Loss: 69949.0659
Epoch 2/3 - Loss: 64656.6528
Epoch 3/3 - Loss: 62143.0940
Training Complete!


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
import torch.nn as nn

# Synthetic stand-in data (swap in your real dataset), stored as float32 so it
# can be fed straight into a linear layer
train_inputs = torch.randint(low=0, high=30522, size=(100, 128)).to(torch.float32)
train_masks = torch.ones((100, 128), dtype=torch.float32)
# Class ids stay integer-typed
train_labels = torch.randint(low=0, high=6, size=(100,))

# Bundle the three tensors into (inputs, mask, label) samples
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)

# Shuffled mini-batches of 16 samples
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Define a simple model for testing (replace with your model)
class SimpleModel(nn.Module):
    """Smoke-test model: one linear layer from a 128-wide float row to class logits.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.fc = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, input_ids, attention_mask):
        """Project ``input_ids`` to raw logits (no softmax); ``attention_mask`` is ignored."""
        logits = self.fc(input_ids)
        return logits

model = SimpleModel()  # fresh, untrained instance

# AdamW over every parameter of the model
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Multi-class cross-entropy computed on raw logits
criterion = nn.CrossEntropyLoss()

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: prints the loss summed over all batches of each epoch
num_epochs = 3
for epoch in range(num_epochs):
    model.train()  # enable training-mode behaviour
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        # Move the (inputs, mask, labels) triple to the device ...
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # ... and make sure the two model inputs are float tensors
        input_ids, attention_mask = input_ids.float(), attention_mask.float()

        # Forward pass and loss
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

print("Training Complete!")


Epoch 1/3 - Loss: 79135.0391
Epoch 2/3 - Loss: 78494.4404
Epoch 3/3 - Loss: 85267.2095
Training Complete!


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
import torch.nn as nn

# Synthetic stand-in data (swap in the real dataset), stored as float32
train_inputs = torch.randint(low=0, high=30522, size=(100, 128)).to(torch.float32)
train_masks = torch.ones((100, 128), dtype=torch.float32)
# Integer class ids in [0, 6)
train_labels = torch.randint(low=0, high=6, size=(100,))

# Bundle the three tensors into (inputs, mask, label) samples
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)

# Shuffled mini-batches of 16 samples
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Define a better model
class ImprovedModel(nn.Module):
    """Two-layer perceptron: 128 inputs -> 256 hidden units (ReLU) -> class logits.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        layers = [
            nn.Linear(128, 256),          # hidden projection
            nn.ReLU(),
            nn.Linear(256, num_classes),  # output layer
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask):
        """Return raw logits (no softmax) for ``input_ids``; ``attention_mask`` is ignored."""
        logits = self.fc(input_ids)
        return logits

model = ImprovedModel()  # fresh, untrained instance

# Optimiser and loss: AdamW plus multi-class cross-entropy on raw logits
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
criterion = nn.CrossEntropyLoss()

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: prints the loss summed over all batches of each epoch
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        # Move the (inputs, mask, labels) triple to the device ...
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # ... then fix the dtypes: float model inputs, int64 class ids
        input_ids, attention_mask = input_ids.float(), attention_mask.float()
        labels = labels.long()

        # Forward pass and loss
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

print("Training Complete!")


Epoch 1/3 - Loss: 49406.1885
Epoch 2/3 - Loss: 46548.6963
Epoch 3/3 - Loss: 38341.0500
Training Complete!


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
import torch.nn as nn
import torch.nn.functional as F

# Synthetic stand-in data (swap in the real dataset), stored as float32
train_inputs = torch.randint(low=0, high=30522, size=(100, 128)).to(torch.float32)
train_masks = torch.ones((100, 128), dtype=torch.float32)
# Integer class ids in [0, 6)
train_labels = torch.randint(low=0, high=6, size=(100,))

# Bundle the three tensors into (inputs, mask, label) samples
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)

# Shuffled mini-batches, now 32 samples each
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# Define a better model
class ImprovedModel(nn.Module):
    """MLP head: Linear(128, 256) -> LayerNorm -> ReLU -> Dropout(0.3) -> Linear.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        layers = [
            nn.Linear(128, 256),
            nn.LayerNorm(256),            # normalise the hidden activations
            nn.ReLU(),
            nn.Dropout(0.3),              # regularisation, active in train mode only
            nn.Linear(256, num_classes),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask):
        """Return raw logits for ``input_ids``; ``attention_mask`` is ignored."""
        logits = self.fc(input_ids)
        return logits

# Fresh, untrained model instance
model = ImprovedModel()

# AdamW with a learning rate of 1e-4 plus multi-class cross-entropy on raw logits
optimizer = AdamW(model.parameters(), lr=1e-4, eps=1e-8)
criterion = nn.CrossEntropyLoss()

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: prints the loss summed over all batches of each epoch
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        # Move the (inputs, mask, labels) triple to the device ...
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # ... then fix the dtypes: float model inputs, int64 class ids
        input_ids, attention_mask = input_ids.float(), attention_mask.float()
        labels = labels.long()

        # Forward pass and loss
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

print("Training Complete!")


Epoch 1/5 - Loss: 7.4001
Epoch 2/5 - Loss: 7.4776
Epoch 3/5 - Loss: 7.1901
Epoch 4/5 - Loss: 6.9545
Epoch 5/5 - Loss: 7.2168
Training Complete!


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel

# WordPiece tokenizer matching the uncased BERT checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Five toy comments (swap in the real dataset)
comments = [
    "This is the best day ever!",
    "I hate this so much.",
    "You are amazing!",
    "This is terrible, never again!",
    "Neutral opinion here.",
]

# Encode all comments at once, padded to the longest one and capped at 128 tokens
encodings = tokenizer(
    comments,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
train_inputs = encodings["input_ids"]
train_masks = encodings["attention_mask"]

# One label per comment, in the same order (1 = toxic, 0 = neutral)
train_labels = torch.tensor([0, 1, 0, 1, 0])

# Bundle into (inputs, mask, label) samples
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)

# Shuffled mini-batches of 2 comments
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2)

# Define Model with BERT
class BertClassifier(nn.Module):
    """BERT encoder with dropout and a linear classification head.

    Args:
        num_classes: Number of output logits per input sequence.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        # Read the encoder width from the loaded config instead of hard-coding
        # 768, so the head always matches the checkpoint that was loaded.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape [batch_size, num_classes]."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [batch_size, hidden_size]
        x = self.dropout(pooled_output)
        logits = self.fc(x)  # [batch_size, num_classes]
        return logits

# Fresh classifier with the default two output classes
model = BertClassifier()

# AdamW with a small weight decay
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=1e-4)

# Multi-class cross-entropy computed on raw logits
criterion = nn.CrossEntropyLoss()

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: per epoch, prints the summed batch loss and the training accuracy
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = correct = total = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        # Batches are (inputs, mask, labels); move each tensor to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask)  # logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Accuracy bookkeeping: predicted class is the index of the largest logit
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f} - Accuracy: {accuracy:.2f}%")

print("Training Complete!")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/5 - Loss: 1.9835 - Accuracy: 40.00%
Epoch 2/5 - Loss: 1.8305 - Accuracy: 60.00%
Epoch 3/5 - Loss: 1.2910 - Accuracy: 100.00%
Epoch 4/5 - Loss: 0.9263 - Accuracy: 100.00%
Epoch 5/5 - Loss: 1.2160 - Accuracy: 100.00%
Training Complete!


In [None]:
import pandas as pd

# Peek at the columns of the test file.
# NOTE(review): this rebinds `df`, which held the preprocessed training frame
# up to this point - confirm no later cell still expects the training data in `df`.
df = pd.read_csv("/content/test.csv")
print(df.columns)

Index(['id', 'comment_text'], dtype='object')


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertTokenizer, BertModel

# Define the BERT-based classifier
class BertClassifier(nn.Module):
    """BERT encoder with dropout and a linear classification head.

    Args:
        num_classes: Number of output logits per input sequence.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        # Read the encoder width from the loaded config instead of hard-coding
        # 768, so the head always matches the checkpoint that was loaded.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape [batch_size, num_classes]."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the pooler output for classification
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertClassifier(num_classes=2)

# Optionally load your trained weights:
# model.load_state_dict(torch.load("path_to_your_trained_model.pt"))
# NOTE: while the line above stays commented out, the classification head is
# freshly initialised, so the predictions below carry no meaning yet.

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode (disables dropout)

# Load your test CSV (ensure the file is in the same directory or provide full path)
df_test = pd.read_csv("test.csv")

# Check the columns to be sure - expected columns: "id" and "comment_text"
print("Test CSV Columns:", df_test.columns)

# Get the comments as plain strings; astype(str) keeps missing values (NaN)
# from reaching the tokenizer, which expects text input.
comments = df_test["comment_text"].astype(str).tolist()

# Tokenise and classify the comments in small chunks. Pushing the whole file
# through a single forward pass asked the GPU for tens of GiB and ended in
# "CUDA out of memory".
inference_batch_size = 32  # lower this if the GPU still runs out of memory
batch_preds = []
with torch.no_grad():
    for start in range(0, len(comments), inference_batch_size):
        chunk = comments[start:start + inference_batch_size]
        encodings = tokenizer(chunk, padding=True, truncation=True, max_length=128, return_tensors="pt")
        input_ids = encodings["input_ids"].to(device)
        attention_mask = encodings["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask)
        # For binary classification, argmax gives the predicted label (0 or 1);
        # results are moved to the CPU right away to keep GPU memory flat.
        batch_preds.append(torch.argmax(outputs, dim=1).cpu())

# One prediction per comment, in file order (empty tensor for an empty file)
preds = torch.cat(batch_preds) if batch_preds else torch.empty(0, dtype=torch.long)

# Display predictions for each comment
# NOTE(review): this prints every row of the test file; consider slicing.
for comment, pred in zip(comments, preds.cpu().numpy()):
    print(f"Comment: {comment}\nPrediction: {pred}\n")


Test CSV Columns: Index(['id', 'comment_text'], dtype='object')


OutOfMemoryError: CUDA out of memory. Tried to allocate 56.09 GiB. GPU 0 has a total capacity of 14.74 GiB of which 11.07 GiB is free. Process 4102 has 3.66 GiB memory in use. Of the allocated memory 3.31 GiB is allocated by PyTorch, and 233.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset

# Define the BERT-based classifier
class BertClassifier(nn.Module):
    """BERT encoder with dropout and a linear classification head.

    Args:
        num_classes: Number of output logits per input sequence.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        # Read the encoder width from the loaded config instead of hard-coding
        # 768, so the head always matches the checkpoint that was loaded.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape [batch_size, num_classes]."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [batch_size, hidden_size]
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)  # [batch_size, num_classes]
        return logits

# Tokenizer plus a two-class classifier
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertClassifier(num_classes=2)

# Run on the GPU when one is available; eval mode switches dropout off
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Read the test file and show which columns it provides
df_test = pd.read_csv("test.csv")  # expected to contain 'id' and 'comment_text'
print("Columns in test CSV:", df_test.columns)

# Raw comment strings, in file order
comments = df_test["comment_text"].tolist()

# Encode every comment up front (tensors stay on the CPU for now)
encodings = tokenizer(
    comments,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]

# Serve the encoded comments in order, 16 at a time
test_dataset = TensorDataset(input_ids, attention_mask)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=16)

# Batched inference without gradient tracking
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch_input_ids, batch_attention_mask = (t.to(device) for t in batch)
        outputs = model(batch_input_ids, batch_attention_mask)
        preds = outputs.argmax(dim=1)  # index of the largest logit per row
        predictions += list(preds.cpu().numpy())

# Attach the predicted class to each row of the test frame
df_test['predicted_label'] = predictions

# Show ids, comments and predictions for verification
print(df_test[['id', 'comment_text', 'predicted_label']])


Columns in test CSV: Index(['id', 'comment_text'], dtype='object')


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset

# Define the BERT-based classifier
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification head.

    Args:
        num_classes: Number of output logits produced per input sequence.
        pretrained_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to ``"bert-base-uncased"``.
        dropout: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, num_classes=2, pretrained_name="bert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from its config rather than hard-coding 768,
        # so the head still matches when a different checkpoint is selected.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the encoder's pooled output.

        Args:
            input_ids: Token ids, forwarded unchanged to the encoder.
            attention_mask: Attention mask, forwarded unchanged to the encoder.

        Returns:
            Output of the linear head, one row of ``num_classes`` logits per
            input sequence.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.fc(pooled_output)

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertClassifier(num_classes=2)
# NOTE(review): no checkpoint is loaded into `model` in this cell, so the
# linear head `fc` still has its initial weights. Call
# `model.load_state_dict(...)` with fine-tuned weights before relying on the
# predictions written below.
model.to(device)
model.eval()  # Evaluation mode: disables dropout so inference is deterministic

# Load test CSV
df_test = pd.read_csv("test.csv")  # CSV should have columns like 'id' and 'comment_text'
print("✅ Loaded test.csv successfully!")

# Ensure column 'comment_text' exists
if "comment_text" not in df_test.columns:
    raise KeyError("Column 'comment_text' not found in the CSV file!")

# Get comments from 'comment_text' column; astype(str) guards against
# non-string cells (e.g. NaN), which the tokenizer would reject.
comments = df_test["comment_text"].astype(str).tolist()

# Tokenize the comments (padded/truncated to at most 128 tokens)
encodings = tokenizer(comments, padding=True, truncation=True, max_length=128, return_tensors="pt")
input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]

# Create a TensorDataset and DataLoader for inference.
# shuffle=False keeps predictions aligned with the DataFrame's row order.
# Pinned host memory only helps host->GPU copies, so enable it only on CUDA
# (on CPU-only runs it has no benefit and triggers a warning).
test_dataset = TensorDataset(input_ids, attention_mask)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=2,
    pin_memory=(device.type == "cuda"),
)

# Run inference in batches
predictions = []
print("✅ Starting inference...")

with torch.no_grad():  # no autograd bookkeeping needed for inference
    for batch_idx, batch in enumerate(test_dataloader):
        batch_input_ids, batch_attention_mask = [tensor.to(device) for tensor in batch]

        outputs = model(batch_input_ids, batch_attention_mask)
        preds = torch.argmax(outputs, dim=1)  # index of the highest logit per row

        predictions.extend(preds.cpu().tolist())

        # Print progress every 10 batches. Report the running total actually
        # processed: `batch_idx * batch_len` excluded the current batch and
        # was wrong whenever the final batch is shorter than the others.
        if batch_idx % 10 == 0:
            print(f"✅ Processed {len(predictions)} comments...")

# Add predictions to the DataFrame (one label per input row, same order)
df_test["predicted_label"] = predictions

# Save predictions to a new CSV file
df_test.to_csv("test_predictions.csv", index=False)

print("✅ Inference complete! Predictions saved to 'test_predictions.csv'.")


✅ Loaded test.csv successfully!
✅ Starting inference...
✅ Processed 0 comments...
✅ Processed 40 comments...
✅ Processed 80 comments...
✅ Processed 120 comments...
✅ Processed 160 comments...
✅ Processed 200 comments...
✅ Processed 240 comments...
✅ Processed 280 comments...
✅ Processed 320 comments...
✅ Processed 360 comments...
✅ Processed 400 comments...
✅ Processed 440 comments...
✅ Processed 480 comments...
✅ Processed 520 comments...
✅ Processed 560 comments...
✅ Processed 600 comments...
✅ Processed 640 comments...
✅ Processed 680 comments...
✅ Processed 720 comments...
✅ Processed 760 comments...
✅ Processed 800 comments...
✅ Processed 840 comments...
✅ Processed 880 comments...
✅ Processed 920 comments...
✅ Processed 960 comments...
✅ Processed 1000 comments...
✅ Processed 1040 comments...
✅ Processed 1080 comments...
✅ Processed 1120 comments...
✅ Processed 1160 comments...
✅ Processed 1200 comments...
✅ Processed 1240 comments...
✅ Processed 1280 comments...
✅ Processed 1320

In [None]:
import pandas as pd
# Read the saved predictions back from disk and preview the first 20 rows.
df = pd.read_csv("test_predictions.csv")
print(df.iloc[:20])

                  id                                       comment_text  \
0   00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...   
1   0000247867823ef7  == From RfC == \n\n The title is fine as it is...   
2   00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...   
3   00017563c3f7919a  :If you have a look back at the source, the in...   
4   00017695ad8997eb          I don't anonymously edit articles at all.   
5   0001ea8717f6de06  Thank you for understanding. I think very high...   
6   00024115d4cbde0f  Please do not add nonsense to Wikipedia. Such ...   
7   000247e83dcc1211                   :Dear god this site is horrible.   
8   00025358d4737918  " \n Only a fool can believe in such numbers. ...   
9   00026d1092fe71cc  == Double Redirects == \n\n When fixing double...   
10  0002eadc3b301559  I think its crap that the link to roggenbier i...   
11  0002f87b16116a7f  "::: Somebody will invariably try to add Relig...   
12  0003806b11932181  , 2

In [None]:
# Serialize only the model's parameters (its state_dict) to model.pth.
torch.save(obj=model.state_dict(), f="model.pth")

In [None]:
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this cell elsewhere.
from google.colab import files
# Hand "model.pth" (the file written by the torch.save cell) to the Colab
# download helper.
files.download("model.pth")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>