<a href="https://colab.research.google.com/github/eteitelbaum/code-satp/blob/Fall-2024/training-actiontype-distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive paths used by
# later cells for the CSV and the saved model are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.metrics import classification_report, hamming_loss, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
# Prepare Dataset
class MultiLabelDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label vector.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of per-example label vectors, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments; its result must
            expose ``input_ids`` and ``attention_mask`` entries.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One item per text.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for item ``idx``."""
        raw_text = self.texts[idx]
        raw_label = self.labels[idx]
        tokenizer_options = dict(
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)
        # squeeze(0) removes the leading size-1 axis so each field is per-example.
        item = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        # Float labels: the model is later configured for multi-label classification.
        item['labels'] = torch.tensor(raw_label, dtype=torch.float)
        return item


In [None]:

# Label columns, in the order the model's output logits will follow.
# Defined once so the model head size and the label matrix cannot drift apart.
label_columns = ['armed_assault', 'arrest', 'bombing', 'infrastructure',
                 'surrender', 'seizure', 'abduction']

# Initialize Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),  # One logit per label column
    problem_type="multi_label_classification"
)

data = pd.read_csv('/content/drive/MyDrive/SATP_data/action_type.csv')

# Prepare Data: free-text summaries as input, one 0/1 column per action type as target
X = data['incident_summary']
y = data[label_columns].values

# Fixed random_state keeps the 80/20 split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = MultiLabelDataset(X_train.tolist(), y_train, tokenizer, max_len=128)
test_dataset = MultiLabelDataset(X_test.tolist(), y_test, tokenizer, max_len=128)

# Shuffle only the training batches; evaluation order stays fixed
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Training Function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Callable model; called with the input ids positionally plus
            ``attention_mask`` and ``labels`` keywords, and expected to return an
            object with a ``loss`` attribute.
        data_loader: Sized iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        device: Target passed to each batch tensor's ``.to``.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        loss = model(ids, attention_mask=mask, labels=targets).loss
        running_loss += loss.item()
        loss.backward()
        optimizer.step()

    return running_loss / len(data_loader)

# Evaluation Function
def evaluate(model, data_loader, device):
    """Collect sigmoid probabilities and reference labels for every example.

    Args:
        model: Callable model; called with the input ids positionally plus an
            ``attention_mask`` keyword, and expected to return an object with a
            ``logits`` attribute.
        data_loader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: Target passed to each batch tensor's ``.to``.

    Returns:
        ``(preds, true_labels)``: two lists with one NumPy row per example, the
        first holding sigmoid outputs and the second the labels from the batch.
    """
    model.eval()
    probabilities = []
    references = []
    # No gradients are needed while scoring.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, attention_mask=mask).logits
            # Move to CPU before converting; each batch contributes one row per example.
            probabilities += list(torch.sigmoid(logits).cpu().numpy())
            references += list(targets.cpu().numpy())

    return probabilities, references


In [None]:

# Train the Model
# Use the GPU when torch can see one; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Fine-tune for a fixed number of passes, printing the mean batch loss of each pass.
epochs = 3
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}")


Epoch 1/3, Train Loss: 0.2008
Epoch 2/3, Train Loss: 0.0765
Epoch 3/3, Train Loss: 0.0544


In [None]:

# Evaluate the Model
predictions, true_labels = evaluate(model, test_loader, device)

# Convert predictions to binary (0/1) by thresholding each per-example probability row
threshold = 0.5
binary_preds = [(pred > threshold).astype(int) for pred in predictions]

# Report names must follow the column order used to build `y`.
# `data.columns[1:]` drops the first label and labels the last class "incident_summary",
# so every row of the report ends up under the wrong name.
target_names = ['armed_assault', 'arrest', 'bombing', 'infrastructure',
                'surrender', 'seizure', 'abduction']

# Evaluate Results
print("Hamming Loss:", hamming_loss(true_labels, binary_preds))
print("Subset Accuracy:", accuracy_score(true_labels, binary_preds))
# zero_division=0 matches the metric functions defined later and avoids the
# undefined-metric warning when an example has no predicted or true labels.
print("Classification Report:\n", classification_report(
    true_labels, binary_preds, target_names=target_names, zero_division=0))


Hamming Loss: 0.023893486865779058
Subset Accuracy: 0.8690176322418136
Classification Report:
                   precision    recall  f1-score   support

          arrest       0.96      0.93      0.95       723
         bombing       0.98      0.98      0.98       589
  infrastructure       0.87      0.95      0.91       226
       surrender       0.94      0.81      0.87       233
         seizure       0.99      0.99      0.99       142
       abduction       0.87      0.88      0.87       418
incident_summary       0.85      0.86      0.86       110

       micro avg       0.94      0.93      0.93      2441
       macro avg       0.92      0.92      0.92      2441
    weighted avg       0.94      0.93      0.93      2441
     samples avg       0.94      0.94      0.94      2441



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, hamming_loss, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# =======================
# Generalized Dataset Class
# =======================
class MultiLabelDataset(Dataset):
    """Pairs each text with its label vector and tokenizes on access.

    NOTE(review): this repeats the ``MultiLabelDataset`` defined in an earlier
    cell of this notebook; the later definition is the one in effect afterwards.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of label vectors aligned with ``texts``.
        tokenizer: Callable taking the text plus ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keywords.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Build the model inputs and float label tensor for example ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) strips the leading size-1 axis from each encoded field.
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        label_tensor = torch.tensor(sample_label, dtype=torch.float)
        return dict(input_ids=input_ids, attention_mask=attention_mask, labels=label_tensor)

# =======================
# Function to Compute Metrics
# =======================
def compute_metrics(eval_pred):
    """Summarise multi-label predictions with micro-averaged metrics.

    NOTE(review): the ``Trainer`` built in ``train_transformer_model`` is given
    ``compute_metricss`` rather than this function.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``labels`` must support ``astype``.

    Returns:
        Dict with ``hamming_loss``, ``subset_accuracy`` and the micro-averaged
        ``precision_micro``, ``recall_micro`` and ``f1_micro``.
    """
    logits, labels = eval_pred
    # A label counts as predicted when its sigmoid probability exceeds 0.5.
    probabilities = torch.sigmoid(torch.tensor(logits))
    predictions = (probabilities > 0.5).numpy()
    labels = labels.astype(int)

    metrics = {
        "hamming_loss": hamming_loss(labels, predictions),
        "subset_accuracy": accuracy_score(labels, predictions),
    }
    micro_avg = classification_report(
        labels, predictions, output_dict=True, zero_division=0
    )["micro avg"]
    metrics["precision_micro"] = micro_avg["precision"]
    metrics["recall_micro"] = micro_avg["recall"]
    metrics["f1_micro"] = micro_avg["f1-score"]
    return metrics

from sklearn.metrics import classification_report, hamming_loss, accuracy_score

def compute_metricss(eval_pred):
    """
    Compute evaluation metrics for multi-label classification.
    Includes Hamming Loss, Subset Accuracy, and Classification Report for all labels.

    Class names are read from the notebook-level ``data`` frame: every column
    except ``incident_summary``, in frame order. That is the same column set and
    order ``train_transformer_model`` uses to build its label matrix, so names and
    label columns stay aligned wherever the text column sits in the frame.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``labels`` must support ``astype``.

    Returns:
        Dict with Hamming loss, subset accuracy, and micro/macro precision,
        recall and F1. The full per-class report is printed as a side effect.
    """
    logits, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).numpy()  # Apply threshold
    labels = labels.astype(int)

    # Hamming Loss
    hamming = hamming_loss(labels, predictions)

    # Subset Accuracy
    subset_acc = accuracy_score(labels, predictions)

    # One name list for both reports. Previously the dict report used
    # data.columns[1:] and the printed one data.columns[:-1], which disagree and
    # (for the former) attach the wrong name to every class.
    label_names = [col for col in data.columns if col != "incident_summary"]
    report_options = dict(target_names=label_names, zero_division=0)

    # Classification Report (dict form feeds the summary metrics below)
    report = classification_report(labels, predictions, output_dict=True, **report_options)

    # Print complete report for reference
    print("\nFull Classification Report:")
    print(classification_report(labels, predictions, **report_options))

    # Summary Metrics for Trainer
    return {
        "hamming_loss": hamming,
        "subset_accuracy": subset_acc,
        "precision_micro": report["micro avg"]["precision"],
        "recall_micro": report["micro avg"]["recall"],
        "f1_micro": report["micro avg"]["f1-score"],
        "precision_macro": report["macro avg"]["precision"],
        "recall_macro": report["macro avg"]["recall"],
        "f1_macro": report["macro avg"]["f1-score"],
    }


# =======================
# Reusable Training Function
# =======================
def train_transformer_model(model_name, data, max_len=512, batch_size=40, epochs=3):
    """
    Fine-tune a pre-trained transformer for multi-label classification.

    The frame is split 70/15/15 into train, validation and test parts (both
    splits use ``random_state=42``). Training evaluates on the validation part
    each epoch, reloads the checkpoint with the best ``f1_micro`` at the end, and
    then scores the test part.

    Args:
        model_name: Name of the pre-trained model (e.g., "bert-base-uncased", "distilbert-base-uncased").
        data: Pandas DataFrame with an "incident_summary" text column; every
            other column is treated as a label.
        max_len: Maximum sequence length handed to the dataset/tokenizer.
        batch_size: Per-device batch size for training and evaluation.
        epochs: Number of training epochs.

    Returns:
        ``(trainer, test_results)``: the fitted ``Trainer`` and the metrics dict
        from evaluating on the test split.
    """
    # Tokenizer and classification head sized to the label columns
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    label_count = data.shape[1] - 1  # all columns except "incident_summary"
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=label_count,
        problem_type="multi_label_classification",
    )
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(target_device)

    # Inputs and label matrix
    summaries = data["incident_summary"]
    label_matrix = data.drop('incident_summary', axis=1).values

    # 30% held out, then halved into validation and test
    train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
        summaries, label_matrix, test_size=0.3, random_state=42
    )
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        holdout_texts, holdout_labels, test_size=0.5, random_state=42
    )

    def build_dataset(text_series, label_rows):
        # Texts go in as a plain list; labels stay as the array slice.
        return MultiLabelDataset(text_series.tolist(), label_rows, tokenizer, max_len)

    train_dataset = build_dataset(train_texts, train_labels)
    val_dataset = build_dataset(val_texts, val_labels)
    test_dataset = build_dataset(test_texts, test_labels)

    # NOTE(review): `evaluation_strategy` here and `tokenizer=` on Trainer may be
    # renamed in newer transformers releases — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./results",
        logging_dir="./logs",
        report_to="none",
        # Schedule and optimisation
        num_train_epochs=epochs,
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_steps=10,
        # Evaluate and checkpoint together so the best epoch can be restored
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro",
        greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metricss,
    )

    # Fit, then score the held-out test split
    trainer.train()
    test_results = trainer.evaluate(test_dataset)
    print("Test Set Results:", test_results)

    return trainer, test_results


In [None]:
import pandas as pd

# Load the action-type CSV from the mounted Drive. Both the training calls in
# the following cells and compute_metricss read this module-level `data` frame.
data = pd.read_csv('/content/drive/MyDrive/SATP_data/action_type.csv')


In [None]:

# Train using BERT
# Fine-tunes bert-base-uncased with the function's default max_len, batch_size and epochs.
trainer_bert, results_bert = train_transformer_model("bert-base-uncased", data)


In [None]:

# Train using DistilBERT 1
# Fine-tunes distilbert-base-uncased with the function's defaults and keeps the
# fitted Trainer plus its test-split metrics.
trainer_distilbert, results_distilbert = train_transformer_model("distilbert-base-uncased", data)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Runtime,Samples Per Second,Steps Per Second
1,0.1029,0.08839,0.02381,0.869624,0.954519,0.907499,0.930415,21.4264,69.447,4.34
2,0.0705,0.068383,0.021985,0.877016,0.949887,0.923372,0.936442,21.4854,69.256,4.329
3,0.0492,0.065309,0.019873,0.890457,0.948505,0.937603,0.943022,21.4478,69.378,4.336


Test Set Results: {'eval_loss': 0.07131125032901764, 'eval_hamming_loss': 0.021874700182289167, 'eval_subset_accuracy': 0.8817998656816656, 'eval_precision_micro': 0.9439822518025514, 'eval_recall_micro': 0.9305631492618918, 'eval_f1_micro': 0.9372246696035242, 'eval_runtime': 22.7003, 'eval_samples_per_second': 65.594, 'eval_steps_per_second': 4.141, 'epoch': 3.0}


In [None]:

# Train using DistilBERT 2
# NOTE(review): same call as the previous DistilBERT cell; it rebinds
# trainer_distilbert / results_distilbert, so this run's trainer is the one the
# save cell writes out.
trainer_distilbert, results_distilbert = train_transformer_model("distilbert-base-uncased", data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Precision Macro,Recall Macro,F1 Macro,Runtime,Samples Per Second,Steps Per Second
1,0.1548,0.13982,0.026978,0.849462,0.962874,0.880131,0.919645,0.951544,0.832531,0.882356,20.8199,71.47,1.825
2,0.0901,0.092384,0.021793,0.876344,0.959242,0.914614,0.936397,0.943639,0.896611,0.918414,20.8668,71.309,1.821



Full Classification Report:
                  precision    recall  f1-score   support

          arrest       0.97      0.90      0.94       539
         bombing       0.99      0.98      0.98       455
  infrastructure       0.91      0.84      0.87       176
       surrender       0.94      0.70      0.80       172
         seizure       0.99      0.99      0.99        96
       abduction       0.94      0.87      0.91       311
incident_summary       0.91      0.55      0.69        78

       micro avg       0.96      0.88      0.92      1827
       macro avg       0.95      0.83      0.88      1827
    weighted avg       0.96      0.88      0.92      1827
     samples avg       0.94      0.90      0.91      1827


Full Classification Report:
                  precision    recall  f1-score   support

          arrest       0.98      0.93      0.95       539
         bombing       0.99      0.98      0.98       455
  infrastructure       0.91      0.92      0.92       176
       sur

Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Precision Macro,Recall Macro,F1 Macro,Runtime,Samples Per Second,Steps Per Second
1,0.1548,0.13982,0.026978,0.849462,0.962874,0.880131,0.919645,0.951544,0.832531,0.882356,20.8199,71.47,1.825
2,0.0901,0.092384,0.021793,0.876344,0.959242,0.914614,0.936397,0.943639,0.896611,0.918414,20.8668,71.309,1.821
3,0.0711,0.084608,0.021025,0.881048,0.9563,0.922277,0.93898,0.942423,0.90602,0.923075,20.8859,71.244,1.819



Full Classification Report:
                  precision    recall  f1-score   support

          arrest       0.98      0.93      0.96       539
         bombing       0.99      0.98      0.98       455
  infrastructure       0.92      0.93      0.92       176
       surrender       0.93      0.78      0.85       172
         seizure       0.99      0.99      0.99        96
       abduction       0.91      0.90      0.91       311
incident_summary       0.88      0.83      0.86        78

       micro avg       0.96      0.92      0.94      1827
       macro avg       0.94      0.91      0.92      1827
    weighted avg       0.96      0.92      0.94      1827
     samples avg       0.95      0.94      0.94      1827




Full Classification Report:
                  precision    recall  f1-score   support

          arrest       0.96      0.93      0.95       550
         bombing       0.99      0.97      0.98       430
  infrastructure       0.84      0.91      0.87       170
       surrender       0.94      0.73      0.82       165
         seizure       1.00      0.99      1.00       113
       abduction       0.91      0.87      0.89       325
incident_summary       0.94      0.76      0.84        76

       micro avg       0.95      0.91      0.93      1829
       macro avg       0.94      0.88      0.91      1829
    weighted avg       0.95      0.91      0.93      1829
     samples avg       0.95      0.93      0.93      1829

Test Set Results: {'eval_loss': 0.09056778252124786, 'eval_hamming_loss': 0.025040775208673125, 'eval_subset_accuracy': 0.865010073875084, 'eval_precision_micro': 0.948512585812357, 'eval_recall_micro': 0.9065062875888463, 'eval_f1_micro': 0.927033827229522, 'eval_precisi

Full Classification Report:
                  precision    recall  f1-score   support

          arrest       0.96      0.93      0.95       550
         bombing       0.99      0.97      0.98       430
  infrastructure       0.84      0.91      0.87       170
       surrender       0.94      0.73      0.82       165
         seizure       1.00      0.99      1.00       113
       abduction       0.91      0.87      0.89       325
incident_summary       0.94      0.76      0.84        76

       micro avg       0.95      0.91      0.93      1829
       macro avg       0.94      0.88      0.91      1829
    weighted avg       0.95      0.91      0.93      1829
     samples avg       0.95      0.93      0.93      1829

   armed_assault
          arrest
         bombing
  infrastructure
       surrender
         seizure
       abduction


In [None]:

import os

# Save the model and tokenizer
model_path = "/content/drive/MyDrive/saved_models/actiontype/distilbert_model"  # Specify your desired path

# exist_ok=True creates the directory tree if needed without a separate
# existence check (and without failing if it appears in between).
os.makedirs(model_path, exist_ok=True)

trainer_distilbert.save_model(model_path)

# Recent transformers releases warn that `Trainer.tokenizer` is deprecated in
# favour of `Trainer.processing_class`; prefer the new attribute and fall back to
# the old one so the cell also runs on releases that predate it.
tokenizer_to_save = getattr(trainer_distilbert, "processing_class", None)
if tokenizer_to_save is None:
    tokenizer_to_save = trainer_distilbert.tokenizer
tokenizer_to_save.save_pretrained(model_path)

print(f"Model saved to: {model_path}")
print(f"Tokenizer saved to: {model_path}")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Model saved to: /content/drive/MyDrive/saved_models/actiontype/distilbert_model
Tokenizer saved to: /content/drive/MyDrive/saved_models/actiontype/distilbert_model
