<a href="https://colab.research.google.com/github/eteitelbaum/code-satp/blob/Fall-2024/training-targettype-distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, hamming_loss, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# =======================
# Generalized Dataset Class
# =======================
class MultiLabelDataset(Dataset):
    """Dataset that pairs raw texts with multi-label target vectors.

    Tokenization happens lazily in ``__getitem__``: each text is padded or
    truncated to ``max_len`` tokens and returned together with its label
    vector as a float tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per text.
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text, target = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) removes dim 0 when it has size 1 (presumably the batch
        # dim added by return_tensors="pt" for a single text).
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        # Float targets, as stored for the multi-label setup used in this file.
        item["labels"] = torch.tensor(target, dtype=torch.float)
        return item

# =======================
# Function to Compute Metrics
# =======================
def compute_metrics(eval_pred):
    """Summarise multi-label predictions for the Trainer.

    ``eval_pred`` unpacks to ``(logits, labels)``. Logits go through a sigmoid
    and are thresholded at 0.5 to obtain per-label binary predictions.

    Returns:
        Dict with Hamming loss, subset accuracy and micro-averaged
        precision / recall / F1.
    """
    raw_logits, gold = eval_pred
    probabilities = torch.sigmoid(torch.tensor(raw_logits))
    predicted = (probabilities > 0.5).numpy()
    gold = gold.astype(int)

    summary = {
        "hamming_loss": hamming_loss(gold, predicted),
        "subset_accuracy": accuracy_score(gold, predicted),
    }

    # Only the micro-averaged row of the report is surfaced here.
    micro_row = classification_report(
        gold, predicted, output_dict=True, zero_division=0
    )["micro avg"]
    summary["precision_micro"] = micro_row["precision"]
    summary["recall_micro"] = micro_row["recall"]
    summary["f1_micro"] = micro_row["f1-score"]
    return summary

from sklearn.metrics import classification_report, hamming_loss, accuracy_score

def compute_metricss(eval_pred):
    """
    Compute evaluation metrics for multi-label classification.

    Logits are passed through a sigmoid and thresholded at 0.5. The full
    per-label classification report is printed, and a summary dict is returned
    for the Trainer.

    Per-label names come from the notebook-level ``data`` frame: every column
    except "incident_summary", in frame order. This matches how
    ``train_transformer_model`` builds the label matrix
    (``data.drop('incident_summary', axis=1)``), so the names stay aligned
    with the label columns wherever the text column sits. If the number of
    names does not match the number of label columns in ``labels``, the
    reports fall back to sklearn's default names rather than mislabelling
    classes.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        Dict with Hamming loss, subset accuracy, and micro- and macro-averaged
        precision / recall / F1.
    """
    logits, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).numpy()  # Apply threshold
    labels = labels.astype(int)

    # Resolve label names once so both reports below use the same names.
    label_names = [str(col) for col in data.columns if col != "incident_summary"]
    if len(label_names) != labels.shape[-1]:
        label_names = None

    # Hamming Loss
    hamming = hamming_loss(labels, predictions)

    # Subset Accuracy
    subset_acc = accuracy_score(labels, predictions)

    # Classification Report (dict form, used for the summary metrics)
    report = classification_report(
        labels, predictions, target_names=label_names, zero_division=0, output_dict=True
    )

    # Print complete report for reference
    print("\nFull Classification Report:")
    print(classification_report(labels, predictions, target_names=label_names, zero_division=0))

    # Summary Metrics for Trainer
    return {
        "hamming_loss": hamming,
        "subset_accuracy": subset_acc,
        "precision_micro": report["micro avg"]["precision"],
        "recall_micro": report["micro avg"]["recall"],
        "f1_micro": report["micro avg"]["f1-score"],
        "precision_macro": report["macro avg"]["precision"],
        "recall_macro": report["macro avg"]["recall"],
        "f1_macro": report["macro avg"]["f1-score"],
    }


# =======================
# Reusable Training Function
# =======================
def train_transformer_model(model_name, data, max_len=512, batch_size=40, epochs=3):
    """
    Fine-tune a pre-trained transformer for multi-label classification.

    The frame is split 60/20/20 into train / validation / test with a fixed
    seed. The model is trained with per-epoch evaluation and checkpointing,
    then evaluated once on the test split.

    Args:
        model_name: Name of the pre-trained model (e.g., "bert-base-uncased", "distilbert-base-uncased").
        data: Pandas DataFrame with an "incident_summary" text column; every
            other column is treated as a label.
        max_len: Maximum sequence length passed to the tokenizer.
        batch_size: Per-device batch size for training and evaluation.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(trainer, test_results, model, tokenizer)``.

    Note:
        The metrics callback ``compute_metricss`` reads label names from the
        notebook-level ``data`` variable, not from this function's ``data``
        argument.
    """
    text_column = "incident_summary"

    # Tokenizer and classification head sized to the number of label columns.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    label_count = data.shape[1] - 1
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=label_count,
        problem_type="multi_label_classification",
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Features / targets
    texts = data[text_column]
    targets = data.drop(text_column, axis=1).values

    # First carve off 40%, then halve it into validation and test.
    train_texts, held_texts, train_targets, held_targets = train_test_split(
        texts, targets, test_size=0.4, random_state=42
    )
    val_texts, test_texts, val_targets, test_targets = train_test_split(
        held_texts, held_targets, test_size=0.5, random_state=42
    )

    def build_split(split_texts, split_targets):
        # Wrap one split in the dataset class used throughout this file.
        return MultiLabelDataset(split_texts.tolist(), split_targets, tokenizer, max_len)

    train_dataset = build_split(train_texts, train_targets)
    val_dataset = build_split(val_texts, val_targets)
    test_dataset = build_split(test_texts, test_targets)

    # Training configuration: the best checkpoint is chosen by micro F1.
    arg_values = {
        "output_dir": "./results",
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "learning_rate": 2e-5,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "num_train_epochs": epochs,
        "weight_decay": 0.01,
        "logging_dir": "./logs",
        "logging_steps": 10,
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1_micro",
        "greater_is_better": True,
        "save_total_limit": 2,
        "report_to": "none",
    }
    training_args = TrainingArguments(**arg_values)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metricss,
    )

    # Fit, then score the held-out test split.
    trainer.train()
    test_results = trainer.evaluate(test_dataset)
    print("Test Set Results:", test_results)

    return trainer, test_results, model, tokenizer


In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the target-type dataset from the mounted Drive folder into the
# notebook-level `data` frame that the following cells reshape.
# NOTE(review): later cells expect an "incident_summary" text column plus
# 0/1 label columns — confirm against the CSV.
data = pd.read_csv('/content/drive/MyDrive/SATP_data/target_type_filtered.csv')

In [None]:
# Combine 'government_officials' and 'government_infrastructure' into 'government'
# (element-wise OR of the two columns), then drop the two source columns
# together with the other label columns left out of this label set.
data['government'] = data['government_officials'] | data['government_infrastructure']
data = data.drop(columns=['government_officials','private_property', 'government_infrastructure', 'mining_company','ngos', 'non_maoist_armed_group'])
# Last expression of the cell: display the remaining column names.
data.columns

Index(['civilians', 'maoist', 'security', 'no_target', 'incident_summary',
       'government'],
      dtype='object')

In [None]:
# Alternative label set: as well as merging the government columns, fold
# 'mining_company' into 'private_property' and keep 'private_property'.
# NOTE: this cell reads 'government_officials', 'government_infrastructure',
# 'private_property' and 'mining_company', all of which the previous cell
# drops. Reload the CSV before running this cell instead of that one.

# Combine 'government_officials' and 'government_infrastructure' into 'government'
data['government'] = data['government_officials'] | data['government_infrastructure']

# Combine 'private_property' and 'mining_company' into 'private_property'
data['private_property'] = data['private_property'] | data['mining_company']

# Drop the original columns
data = data.drop(columns=['government_officials', 'government_infrastructure', 'mining_company','ngos', 'non_maoist_armed_group'])

# Last expression of the cell: display the remaining column names.
data.columns

Index(['civilians', 'maoist', 'security', 'private_property', 'no_target',
       'incident_summary', 'government'],
      dtype='object')

In [None]:

# Move 'incident_summary' column to the last position, keeping the label
# columns in their existing order.
cols = list(data.columns)
cols.remove('incident_summary')
cols.append('incident_summary')
data = data[cols]
# Last expression of the cell: display the reordered column names.
data.columns

Index(['civilians', 'maoist', 'security', 'no_target', 'government',
       'incident_summary'],
      dtype='object')

In [None]:
# Drop rows where all specified columns have a value of 0, i.e. rows with
# none of the listed labels set.
# NOTE: the column list is hard-coded; 'private_property' is not part of it,
# so it is ignored by this filter even when that column is kept.
data = data[~(data[['civilians', 'maoist', 'government', 'security', 'no_target']]==0).all(axis=1)]
# Last expression of the cell: display the filtered frame.
data

Unnamed: 0,civilians,maoist,security,no_target,government,incident_summary
0,1,0,0,0,0,An alleged arms supplier to the Communist Part...
1,0,0,0,1,0,A Kamareddy dalam (squad) member belonging to ...
2,0,1,0,0,0,Senior CPI-Maoist 'Polit Bureau' and 'central ...
3,0,0,0,0,1,A TDP leader and former Sarpanch of Jerrela Gr...
5,0,1,0,0,0,Police recovered about INR 32 lakh cash from t...
...,...,...,...,...,...,...
9915,0,1,0,0,0,"A suspected Maoist, Lakshiram Soren, was arres..."
9916,0,1,0,0,0,Suspected cadres of the CPI-Maoist abducted fi...
9918,0,0,1,0,1,The CPI-Maoist cadres detonated a landmine tar...
9919,1,0,0,0,0,Maoists allegedly killed two CPI-M supporters ...


In [None]:

# Train using DistilBERT 2
# Fine-tunes "distilbert-base-uncased" on the current `data` frame with the
# function's default max_len, batch_size and epochs.
trainer_distilbert, results_distilbert, model, tokenizer = train_transformer_model("distilbert-base-uncased", data)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Precision Macro,Recall Macro,F1 Macro,Runtime,Samples Per Second,Steps Per Second
1,0.2454,0.229446,0.082096,0.626181,0.846033,0.668529,0.746879,0.809781,0.576958,0.643535,20.8707,71.009,1.821
2,0.162,0.16979,0.058367,0.765182,0.870924,0.795779,0.831657,0.832304,0.745048,0.785478,20.9394,70.776,1.815
3,0.1563,0.15754,0.054543,0.782051,0.881954,0.806952,0.842788,0.844786,0.751456,0.794627,21.1282,70.143,1.799



Full Classification Report:
                  precision    recall  f1-score   support

       civilians       0.89      0.33      0.48       238
          maoist       0.96      0.90      0.93       681
        security       0.83      0.27      0.41       149
private_property       0.61      0.70      0.65       117
       no_target       0.99      0.71      0.83       183
      government       0.57      0.55      0.56       243

       micro avg       0.85      0.67      0.75      1611
       macro avg       0.81      0.58      0.64      1611
    weighted avg       0.86      0.67      0.73      1611
     samples avg       0.67      0.69      0.68      1611


Full Classification Report:
                  precision    recall  f1-score   support

       civilians       0.82      0.71      0.76       238
          maoist       0.96      0.91      0.93       681
        security       0.90      0.74      0.82       149
private_property       0.74      0.62      0.68       117
       no_


Full Classification Report:
                  precision    recall  f1-score   support

       civilians       0.87      0.78      0.82       241
          maoist       0.96      0.93      0.94       696
        security       0.93      0.76      0.84       151
private_property       0.72      0.59      0.64        99
       no_target       0.92      0.83      0.87       207
      government       0.73      0.74      0.73       186

       micro avg       0.89      0.83      0.86      1580
       macro avg       0.85      0.77      0.81      1580
    weighted avg       0.89      0.83      0.86      1580
     samples avg       0.87      0.85      0.86      1580

Test Set Results: {'eval_loss': 0.1450628936290741, 'eval_hamming_loss': 0.04745838956365272, 'eval_subset_accuracy': 0.8130904183535762, 'eval_precision_micro': 0.8933423913043478, 'eval_recall_micro': 0.8322784810126582, 'eval_f1_micro': 0.8617300131061599, 'eval_precision_macro': 0.8524822701168581, 'eval_recall_macro': 0.770

In [None]:

# Train using DistilBERT 3
# Same call as the other DistilBERT runs; the result depends on the state of
# `data` when the cell is executed. Rebinds trainer_distilbert,
# results_distilbert, model and tokenizer.
trainer_distilbert, results_distilbert, model, tokenizer = train_transformer_model("distilbert-base-uncased", data)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Precision Macro,Recall Macro,F1 Macro,Runtime,Samples Per Second,Steps Per Second
1,0.216,0.200653,0.070031,0.785933,0.858191,0.787584,0.821373,0.661848,0.605809,0.620384,18.4633,70.843,1.787
2,0.1655,0.159864,0.054434,0.831804,0.893344,0.833209,0.862229,0.702358,0.688332,0.692089,18.5954,70.34,1.775
3,0.1254,0.151883,0.055352,0.832569,0.888446,0.833957,0.86034,0.696682,0.683649,0.68846,18.6932,69.972,1.765



Full Classification Report:
                      precision    recall  f1-score   support

           civilians       0.64      0.82      0.72       237
              maoist       0.95      0.92      0.94       687
government_officials       0.00      0.00      0.00        81
            security       0.80      0.44      0.57       139
           no_target       0.92      0.84      0.88       193

           micro avg       0.86      0.79      0.82      1337
           macro avg       0.66      0.61      0.62      1337
        weighted avg       0.82      0.79      0.79      1337
         samples avg       0.80      0.80      0.80      1337


Full Classification Report:
                      precision    recall  f1-score   support

           civilians       0.70      0.87      0.78       237
              maoist       0.97      0.92      0.94       687
government_officials       0.00      0.00      0.00        81
            security       0.91      0.80      0.85       139
        


Full Classification Report:
                      precision    recall  f1-score   support

           civilians       0.72      0.87      0.79       247
              maoist       0.98      0.92      0.95       665
government_officials       0.00      0.00      0.00        75
            security       0.89      0.78      0.83       161
           no_target       0.90      0.87      0.89       197

           micro avg       0.90      0.84      0.87      1345
           macro avg       0.70      0.69      0.69      1345
        weighted avg       0.85      0.84      0.84      1345
         samples avg       0.86      0.85      0.85      1345

Test Set Results: {'eval_loss': 0.1583736687898636, 'eval_hamming_loss': 0.05362872421695951, 'eval_subset_accuracy': 0.8342245989304813, 'eval_precision_micro': 0.8957006369426752, 'eval_recall_micro': 0.8364312267657993, 'eval_f1_micro': 0.8650519031141869, 'eval_precision_macro': 0.6985989236425801, 'eval_recall_macro': 0.6890847358451815, 'ev

In [None]:

# Train using DistilBERT 4
# Same call as the other DistilBERT runs; the result depends on the state of
# `data` when the cell is executed. Rebinds trainer_distilbert,
# results_distilbert, model and tokenizer.
trainer_distilbert, results_distilbert, model, tokenizer = train_transformer_model("distilbert-base-uncased", data)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Precision Macro,Recall Macro,F1 Macro,Runtime,Samples Per Second,Steps Per Second
1,0.2289,0.20875,0.065223,0.787011,0.888973,0.784564,0.833512,0.866556,0.736174,0.793887,20.474,69.942,1.758
2,0.1678,0.158805,0.04986,0.840084,0.910805,0.842953,0.875566,0.892647,0.811246,0.848179,20.1639,71.018,1.785
3,0.1321,0.147986,0.047067,0.84567,0.909737,0.85906,0.883673,0.895333,0.831908,0.860804,20.2978,70.549,1.774



Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.75      0.70      0.72       252
      maoist       0.96      0.89      0.92       673
    security       0.92      0.66      0.77       148
   no_target       0.92      0.82      0.86       195
  government       0.78      0.62      0.69       222

   micro avg       0.89      0.78      0.83      1490
   macro avg       0.87      0.74      0.79      1490
weighted avg       0.89      0.78      0.83      1490
 samples avg       0.82      0.80      0.81      1490


Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.91      0.70      0.79       252
      maoist       0.96      0.92      0.94       673
    security       0.91      0.80      0.85       148
   no_target       0.91      0.84      0.87       195
  government       0.78      0.79      0.79       222

   micro avg       0.91      0.84      0.88      1490
   macro avg       


Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.89      0.81      0.85       230
      maoist       0.96      0.92      0.94       675
    security       0.86      0.82      0.84       146
   no_target       0.92      0.82      0.86       211
  government       0.77      0.81      0.79       213

   micro avg       0.90      0.86      0.88      1475
   macro avg       0.88      0.83      0.86      1475
weighted avg       0.90      0.86      0.88      1475
 samples avg       0.88      0.87      0.87      1475

Test Set Results: {'eval_loss': 0.14528632164001465, 'eval_hamming_loss': 0.04748603351955307, 'eval_subset_accuracy': 0.854050279329609, 'eval_precision_micro': 0.9033404406538735, 'eval_recall_micro': 0.8616949152542372, 'eval_f1_micro': 0.8820263705759889, 'eval_precision_macro': 0.8800719794354868, 'eval_recall_macro': 0.8335846481345065, 'eval_f1_macro': 0.8555377745738193, 'eval_runtime': 20.4258, 'eval_samples_per_sec

In [None]:

# Train using BERT (run 5)
# NOTE: this run fine-tunes "bert-base-uncased", not DistilBERT, although the
# results are still bound to the `trainer_distilbert` / `results_distilbert`
# names and overwrite `model` and `tokenizer`.
trainer_distilbert, results_distilbert, model, tokenizer = train_transformer_model("bert-base-uncased", data)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Precision Macro,Recall Macro,F1 Macro,Runtime,Samples Per Second,Steps Per Second
1,0.2187,0.212354,0.071346,0.760608,0.884502,0.755791,0.815096,0.852131,0.684715,0.750666,53.0045,36.016,0.906
2,0.1668,0.159226,0.048821,0.84023,0.908163,0.85146,0.878898,0.880984,0.832452,0.855715,53.1589,35.911,0.903
3,0.1562,0.148605,0.048088,0.843373,0.899111,0.866062,0.882278,0.879139,0.847057,0.861624,53.3699,35.769,0.899



Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.72      0.66      0.69       309
      maoist       0.95      0.91      0.93       908
    security       0.87      0.56      0.68       209
   no_target       0.88      0.83      0.86       264
  government       0.84      0.47      0.61       296

   micro avg       0.88      0.76      0.82      1986
   macro avg       0.85      0.68      0.75      1986
weighted avg       0.88      0.76      0.81      1986
 samples avg       0.79      0.77      0.78      1986


Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.85      0.82      0.84       309
      maoist       0.97      0.90      0.93       908
    security       0.85      0.86      0.85       209
   no_target       0.91      0.85      0.88       264
  government       0.82      0.74      0.78       296

   micro avg       0.91      0.85      0.88      1986
   macro avg       


Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.87      0.83      0.85       317
      maoist       0.96      0.93      0.94       921
    security       0.87      0.83      0.85       174
   no_target       0.90      0.83      0.87       276
  government       0.79      0.83      0.81       282

   micro avg       0.90      0.88      0.89      1970
   macro avg       0.88      0.85      0.86      1970
weighted avg       0.90      0.88      0.89      1970
 samples avg       0.89      0.89      0.89      1970

Test Set Results: {'eval_loss': 0.14523105323314667, 'eval_hamming_loss': 0.045154531168150866, 'eval_subset_accuracy': 0.8601361969617601, 'eval_precision_micro': 0.902668759811617, 'eval_recall_micro': 0.8756345177664975, 'eval_f1_micro': 0.8889461479000258, 'eval_precision_macro': 0.8783216267084161, 'eval_recall_macro': 0.8486167272479304, 'eval_f1_macro': 0.8628155713394839, 'eval_runtime': 53.1933, 'eval_samples_per_se

In [None]:
# Final run
# Train using DistilBERT 6
# The `model` and `tokenizer` bound here are the ones the save cell below
# writes to Drive when it is run right after this cell.
trainer_distilbert, results_distilbert, model, tokenizer = train_transformer_model("distilbert-base-uncased", data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Subset Accuracy,Precision Micro,Recall Micro,F1 Micro,Precision Macro,Recall Macro,F1 Macro,Runtime,Samples Per Second,Steps Per Second
1,0.2385,0.235409,0.079518,0.720272,0.880347,0.715005,0.789108,0.849413,0.623608,0.710275,27.0779,70.5,1.773
2,0.1821,0.172321,0.054898,0.819801,0.903867,0.823766,0.86196,0.878024,0.791327,0.830402,26.9199,70.914,1.783
3,0.181,0.159064,0.049974,0.835516,0.907178,0.846425,0.875749,0.885465,0.818504,0.850506,27.1034,70.434,1.771



Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.73      0.57      0.64       309
      maoist       0.92      0.91      0.92       908
    security       0.81      0.50      0.62       209
   no_target       0.96      0.72      0.83       264
  government       0.83      0.42      0.55       296

   micro avg       0.88      0.72      0.79      1986
   macro avg       0.85      0.62      0.71      1986
weighted avg       0.87      0.72      0.78      1986
 samples avg       0.74      0.73      0.74      1986


Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.84      0.79      0.82       309
      maoist       0.95      0.90      0.93       908
    security       0.82      0.82      0.82       209
   no_target       0.94      0.82      0.88       264
  government       0.84      0.62      0.71       296

   micro avg       0.90      0.82      0.86      1986
   macro avg       


Full Classification Report:
              precision    recall  f1-score   support

   civilians       0.84      0.81      0.82       317
      maoist       0.95      0.94      0.94       921
    security       0.86      0.78      0.82       174
   no_target       0.92      0.80      0.85       276
  government       0.80      0.71      0.75       282

   micro avg       0.90      0.85      0.87      1970
   macro avg       0.87      0.81      0.84      1970
weighted avg       0.90      0.85      0.87      1970
 samples avg       0.87      0.86      0.86      1970

Test Set Results: {'eval_loss': 0.15377189218997955, 'eval_hamming_loss': 0.05028810895756941, 'eval_subset_accuracy': 0.8397066526977475, 'eval_precision_micro': 0.9001074113856069, 'eval_recall_micro': 0.8507614213197969, 'eval_f1_micro': 0.8747390396659708, 'eval_precision_macro': 0.8742883409437663, 'eval_recall_macro': 0.8072299284026118, 'eval_f1_macro': 0.8389272033125547, 'eval_runtime': 27.0615, 'eval_samples_per_se

In [None]:
# Save the model and tokenizer
# Writes whichever `model` / `tokenizer` the most recently executed training
# cell returned. The target folder is named "distilBert", so run this right
# after a DistilBERT training cell.
model.save_pretrained('/content/drive/MyDrive/SATP_data/target_type/distilBert')
tokenizer.save_pretrained('/content/drive/MyDrive/SATP_data/target_type/distilBert')


('/content/drive/MyDrive/SATP_data/target_type/distilBert/tokenizer_config.json',
 '/content/drive/MyDrive/SATP_data/target_type/distilBert/special_tokens_map.json',
 '/content/drive/MyDrive/SATP_data/target_type/distilBert/vocab.txt',
 '/content/drive/MyDrive/SATP_data/target_type/distilBert/added_tokens.json',
 '/content/drive/MyDrive/SATP_data/target_type/distilBert/tokenizer.json')