In [None]:
import pandas as pd

# Read the preprocessed train/test splits from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_preprocess.csv', 'test_preprocess.csv')
)

# Print the first rows of each frame as a quick sanity check.
for preview_frame in (df_train, df_test):
    print(preview_frame.head())

In [None]:
# Per-column count of missing values, train first and then test
for null_check_frame in (df_train, df_test):
    print(null_check_frame.isnull().sum())


In [3]:
# Discard rows that have no comment text, in both splits
df_train, df_test = (
    frame.dropna(subset=['comment_text']) for frame in (df_train, df_test)
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Column sums per label, largest first
# (these are positive-example counts if the columns are 0/1 indicators)
train_labels = df_train[labels].sum().sort_values(ascending=False)

# Horizontal bars: sums along x, label names along y
label_axes = sns.barplot(x=train_labels.values, y=train_labels.index)
label_axes.set_title('Training Data Label Distribution')
label_axes.set_xlabel('Number of Instances')
label_axes.set_ylabel('Labels')
plt.show()

In [None]:
def drop_non_string_comments(frame, column='comment_text'):
    """Report the value types found in ``frame[column]`` and drop non-string rows.

    Prints the per-type counts, the number of non-string entries before
    cleaning, and the number still present afterwards.

    Args:
        frame: DataFrame holding the text column.
        column: Name of the column to inspect.

    Returns:
        A new DataFrame restricted to the rows whose ``column`` value is a
        ``str``, re-indexed from 0.
    """
    values = frame[column]

    # Get a count of different data types within the column
    print(f"\nData type counts in '{column}':")
    print(values.apply(type).value_counts())

    # Build the mask once and reuse it for both counting and filtering.
    # astype(bool) keeps `~` valid even when the frame has no rows.
    is_string = values.apply(lambda x: isinstance(x, str)).astype(bool)

    # Display the number of non-string entries
    non_string_count = int((~is_string).sum())
    print(f"\nNumber of non-string '{column}' entries: {non_string_count}")

    # Remove rows where the column is not a string
    cleaned = frame[is_string].reset_index(drop=True)

    # Confirm removal by re-measuring on the cleaned frame
    still_string = cleaned[column].apply(lambda x: isinstance(x, str)).astype(bool)
    non_string_count_after = int((~still_string).sum())
    print(f"Number of non-string '{column}' entries after cleaning: {non_string_count_after}")

    return cleaned


# For train data
train_df_cleaned = drop_non_string_comments(df_train)

# For test data
df_test_cleaned = drop_non_string_comments(df_test)

In [None]:
# Splitting Data to train and validation
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Comment strings and the matching label matrix, as NumPy arrays
X = train_df_cleaned['comment_text'].values
y = train_df_cleaned[labels].values

mss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=41)

# n_splits=1, so only the first (train, validation) index pair is needed
train_index, val_index = next(mss.split(X, y))
X_train, y_train = X[train_index], y[train_index]
X_val, y_val = X[val_index], y[val_index]

print(f'Training samples: {len(X_train)}')
print(f'Validation samples: {len(X_val)}')


In [None]:
from transformers import BertTokenizer
import re
import torch
from tqdm import tqdm

# Load the tokenizer once; it is reused for the train, validation and test texts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Maximum sequence length passed to the tokenizer (truncation / padding target)
MAX_LENGTH = 128
# Number of texts handed to the tokenizer per call; adjust based on your system's memory
BATCH_SIZE = 1000

# No text cleaning is applied here: the comments are only converted to plain
# Python lists before tokenization. list() accepts both an array and a list,
# so re-running this cell is harmless.
X_train = list(X_train)
X_val = list(X_val)
test_comments = df_test_cleaned['comment_text'].tolist()

def batch_tokenize(texts, tokenizer, max_length, batch_size):
    """Tokenize ``texts`` in fixed-size chunks and concatenate the results.

    Args:
        texts: Sliceable sequence of strings (list, tuple, array, ...).
        tokenizer: Callable accepting a list of strings plus the keyword
            arguments ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors``, and returning a mapping that contains
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is truncated/padded to.
        batch_size: Number of texts passed to the tokenizer per call.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'``, each the
        concatenation of the per-chunk tensors along the first dimension. For
        empty ``texts`` both are empty ``(0, max_length)`` long tensors.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # torch.cat() rejects an empty list, so answer the empty case directly.
    if len(texts) == 0:
        return {
            'input_ids': torch.empty((0, max_length), dtype=torch.long),
            'attention_mask': torch.empty((0, max_length), dtype=torch.long),
        }

    input_id_chunks = []
    attention_mask_chunks = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        # list() lets array-like inputs through; a list slice is unaffected.
        chunk = list(texts[start:start + batch_size])
        chunk_encodings = tokenizer(
            chunk,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        input_id_chunks.append(chunk_encodings['input_ids'])
        attention_mask_chunks.append(chunk_encodings['attention_mask'])

    # Concatenate all batches
    return {
        'input_ids': torch.cat(input_id_chunks),
        'attention_mask': torch.cat(attention_mask_chunks),
    }

# Tokenize the training, validation and test texts with identical settings
train_encodings, val_encodings, test_encodings = (
    batch_tokenize(split_texts, tokenizer, MAX_LENGTH, BATCH_SIZE)
    for split_texts in (X_train, X_val, test_comments)
)

In [None]:
# Report whether CUDA is usable. The device name is only queried when it is,
# because torch.cuda.get_device_name raises on a machine without CUDA.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Prints your GPU name
else:
    print("No CUDA device available")


In [None]:
# Creating Pytorch Datasets
import torch.utils
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from tqdm.auto import tqdm
from transformers import EarlyStoppingCallback
import numpy as np

class ToxicDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional label vectors.

    Args:
        encodings: Mapping from field name (must include ``'input_ids'``) to an
            indexable of per-example values, e.g. tensors or nested lists.
        labels: Optional indexable of per-example label vectors; ``None`` for
            unlabeled (test) data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels  # None for test data

    @staticmethod
    def _to_tensor(value, dtype=None):
        """Return an independent tensor copy of ``value``, optionally cast to ``dtype``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning recommending
        ``clone().detach()``, so values that are already tensors take that path.
        """
        if isinstance(value, torch.Tensor):
            copied = value.clone().detach()
            return copied if dtype is None else copied.to(dtype)
        return torch.tensor(value, dtype=dtype)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field, plus float ``labels`` when present."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self._to_tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])
    
# The labeled splits carry their label arrays; the test split has none
train_dataset = ToxicDataset(encodings=train_encodings, labels=y_train)
val_dataset = ToxicDataset(encodings=val_encodings, labels=y_val)
test_dataset = ToxicDataset(encodings=test_encodings)

from sklearn.utils.class_weight import compute_class_weight

# One weight per label: log1p of the "balanced" weight that
# compute_class_weight returns for class 1 in that label's column of y_train.
class_weights = {
    label_name: np.log1p(
        compute_class_weight(
            class_weight="balanced",
            classes=np.array([0, 1]),
            y=y_train[:, column],
        )[1]
    )
    for column, label_name in enumerate(labels)
}

# Same weights as a list, in the order of `labels`
class_weights_list = [class_weights[label_name] for label_name in labels]
print("Class Weights:", class_weights_list)

# Pretrained BERT with one output per label, configured for multi-label classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labels),
    problem_type="multi_label_classification",
)

# Use the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# 7. Define Metrics
def compute_metrics(pred):
    """Compute macro-averaged evaluation metrics from raw logits.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (logits).

    Returns:
        dict with ``f1``, ``precision`` and ``recall`` computed on predictions
        thresholded at 0.5, and ``auc`` and ``average_precision`` computed on
        the sigmoid probabilities.
    """
    y_true = pred.label_ids
    scores = torch.sigmoid(torch.tensor(pred.predictions)).numpy()
    y_hat = (scores >= 0.5).astype(int)

    metrics = {}
    # Threshold-based metrics use the hard 0/1 predictions
    metrics['f1'] = f1_score(y_true, y_hat, average='macro', zero_division=0)
    metrics['precision'] = precision_score(y_true, y_hat, average='macro', zero_division=0)
    metrics['recall'] = recall_score(y_true, y_hat, average='macro', zero_division=0)
    # Ranking metrics use the probabilities
    metrics['auc'] = roc_auc_score(y_true, scores, average='macro')
    metrics['average_precision'] = average_precision_score(y_true, scores, average='macro')
    return metrics

# 8. Set Up Training Arguments
training_args = TrainingArguments(
    output_dir='./results',                   # Output directory
    num_train_epochs=7,                       # Number of training epochs
    per_device_train_batch_size=32,           # Batch size per device during training
    per_device_eval_batch_size=32,            # Batch size for evaluation
    learning_rate=2e-5,                       # Lower learning rate for fine-tuning
    weight_decay=0.01,                        # Strength of weight decay
    logging_dir='./logs',                     # Directory for storing logs
    logging_steps=10,                         # Log every 10 steps
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",              # Evaluate at the end of each epoch
    save_strategy="epoch",                    # Save checkpoint at each epoch
    load_best_model_at_end=True,              # Load the best model when finished training
    metric_for_best_model="f1",               # Best model is determined by the F1 score
    greater_is_better=True,                   # Whether a higher metric score is better
    save_total_limit=2,                       # Limit the total amount of checkpoints
    seed=42,                                  # Random seed for reproducibility
    # Mixed precision needs a CUDA device; the notebook also supports a CPU
    # fallback, so only enable fp16 when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    max_grad_norm=1.0,
)


import torch.nn as nn

class WeightedBCELoss(nn.Module):
    """Binary cross-entropy on logits with a fixed multiplicative weight per class.

    Args:
        class_weights: Sequence (or tensor) holding one weight per output
            column of the logits.
    """

    def __init__(self, class_weights):
        super().__init__()
        weights = torch.as_tensor(class_weights, dtype=torch.float32).detach().clone()
        # Registered as a buffer so the weights follow the module through
        # .to()/.cuda(); persistent=False keeps state_dict() empty, as before.
        self.register_buffer("class_weights", weights, persistent=False)

    def forward(self, logits, targets):
        """Return the mean of the element-wise BCE-with-logits loss scaled per class.

        Args:
            logits: Raw model outputs; the class weights are broadcast against
                the last dimension.
            targets: Float tensor with the same shape as ``logits``.

        Returns:
            Scalar tensor: mean over all elements of ``bce * class_weights``.
        """
        # Align devices without mutating module state on every call
        weights = self.class_weights.to(logits.device)
        # Element-wise loss (no reduction) so each class column can be scaled
        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            logits, targets, reduction='none'
        )
        return (bce_loss * weights).mean()


class WeightedTrainer(Trainer):
    """Trainer variant whose loss is a per-class weighted BCE on the model logits."""

    def __init__(self, *args, class_weights=None, **kwargs):
        """Initialize the base Trainer, then attach the weighted loss module.

        Args:
            class_weights: One weight per label, passed to ``WeightedBCELoss``.

        Raises:
            ValueError: if ``class_weights`` is not supplied (checked after the
                base initializer has run).
        """
        super().__init__(*args, **kwargs)
        if class_weights is None:
            raise ValueError("class_weights must be provided")
        loss_module = WeightedBCELoss(class_weights)
        self.weighted_loss = loss_module.to(self.model.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and score it with the weighted loss.

        Only tensor-valued entries of ``inputs`` are kept; they are moved to
        the model's device, and ``labels`` is removed before the forward call.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        on_device = {}
        for name, value in inputs.items():
            if isinstance(value, torch.Tensor):
                on_device[name] = value.to(model.device)

        targets = on_device.pop("labels")
        outputs = model(**on_device)
        loss = self.weighted_loss(outputs.logits, targets)

        if return_outputs:
            return (loss, outputs)
        return loss

In [None]:
from transformers import Trainer
# 8. Assemble the trainer from the model, arguments, datasets, metric function and class weights
trainer = WeightedTrainer(
    class_weights=class_weights_list,        # Per-label weights for the custom loss
    compute_metrics=compute_metrics,         # Function to compute metrics
    eval_dataset=val_dataset,                # Evaluation dataset
    train_dataset=train_dataset,             # Training dataset
    args=training_args,                      # Training arguments
    model=model,                             # The instantiated Transformers model to be trained
)

# 10. Start Training
trainer.train()

In [None]:
# 11. Evaluate Model and print each returned metric with four decimals
results = trainer.evaluate()
print("Evaluation Results:")
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
MODEL_DIR = "./model"

# Persist the trained model, then the tokenizer, into the same directory
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Run the trainer over the validation dataset
outputs = trainer.predict(val_dataset)

# Raw logits and the true labels stored with the dataset
predictions, true_labels = outputs.predictions, outputs.label_ids

# Logits -> probabilities -> 0/1 decisions at a 0.5 threshold
probabilities = torch.sigmoid(torch.tensor(predictions)).numpy()
binary_predictions = (probabilities >= 0.5).astype(int)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One confusion matrix per label, keyed by label name
confusion_matrices = {
    label_name: confusion_matrix(true_labels[:, column], binary_predictions[:, column])
    for column, label_name in enumerate(labels)
}

# Draw each matrix as an annotated heatmap in its own figure
class_ticks = ["Negative", "Positive"]
for label, cm in confusion_matrices.items():
    plt.figure(figsize=(5, 5))
    heatmap_axes = sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_ticks,
        yticklabels=class_ticks,
    )
    heatmap_axes.set_title(f"Confusion Matrix for {label}")
    heatmap_axes.set_xlabel("Predicted")
    heatmap_axes.set_ylabel("Actual")
    plt.show()

from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 for each label column, printed one line per label
for column, label in enumerate(labels):
    actual, predicted = true_labels[:, column], binary_predictions[:, column]
    precision, recall, f1 = (
        scorer(actual, predicted, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    print(f"Metrics for {label}: Precision={precision:.2f}, Recall={recall:.2f}, F1={f1:.2f}")

