## Load and inspect data

In [1]:
# import pandas
import pandas as pd

# load the data
csv_path = "data/action_type.csv"
action_df = pd.read_csv(csv_path)

In [2]:
# inspect the data
action_df.info()
action_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9921 entries, 0 to 9920
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     9921 non-null   int64 
 1   arrest            9921 non-null   int64 
 2   bombing           9921 non-null   int64 
 3   infrastructure    9921 non-null   int64 
 4   surrender         9921 non-null   int64 
 5   seizure           9921 non-null   int64 
 6   abduction         9921 non-null   int64 
 7   incident_summary  9921 non-null   object
dtypes: int64(7), object(1)
memory usage: 620.2+ KB


Unnamed: 0,armed_assault,arrest,bombing,infrastructure,surrender,seizure,abduction,incident_summary
0,0,1,0,0,0,0,0,An alleged arms supplier to the Communist Part...
1,0,0,0,0,1,0,0,A Kamareddy dalam (squad) member belonging to ...
2,0,1,0,0,0,0,0,Senior CPI-Maoist 'Polit Bureau' and 'central ...
3,1,0,0,0,0,0,0,A TDP leader and former Sarpanch of Jerrela Gr...
4,0,0,1,1,0,0,0,The CPI-Maoist cadres blasted coffee pulping u...


In [3]:
# check for missing values
action_df.isnull().sum()

armed_assault       0
arrest              0
bombing             0
infrastructure      0
surrender           0
seizure             0
abduction           0
incident_summary    0
dtype: int64

In [4]:
# inspect the frequencies of the different categories
label_data = action_df.iloc[:, :7] # save columns with labels in an object
frequencies = label_data.sum() / len(label_data) # calculate the frequency of each action

frequencies

armed_assault     0.360448
arrest            0.307529
bombing           0.115009
infrastructure    0.110775
surrender         0.078117
seizure           0.216510
abduction         0.047677
dtype: float64

## Apply train-test split

The classes are imbalanced, so we need to apply `MultilabelStratifiedShuffleSplit` from the `iterative-stratification` library.

In [5]:
# import MultiLabelStratifiedShuffleSplit from iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np

# stratified split to get training+validation and test datasets
msss_initial = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
labels_matrix = action_df[['armed_assault', 'arrest', 'bombing', 'infrastructure', 'surrender', 'seizure', 'abduction']].values

for train_val_index, test_index in msss_initial.split(np.zeros(len(action_df)), labels_matrix):
    train_val_df = action_df.iloc[train_val_index]
    test_df = action_df.iloc[test_index]

# further stratified split to divide training+validation into actual training and validation sets
msss_secondary = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

labels_matrix_train_val = train_val_df[['armed_assault', 'arrest', 'bombing', 'infrastructure', 'surrender', 'seizure', 'abduction']].values

for train_index, val_index in msss_secondary.split(np.zeros(len(train_val_df)), labels_matrix_train_val):
    train_df = train_val_df.iloc[train_index]
    val_df = train_val_df.iloc[val_index]

In [6]:
# inspect training set
train_df.info()

# inspect the frequencies of the different categories
train_label_data = train_df.iloc[:, :7] # save columns with labels in an object
train_frequencies = train_label_data.sum() / len(train_label_data) # calculate the frequency of each action

train_frequencies


<class 'pandas.core.frame.DataFrame'>
Index: 5970 entries, 0 to 9920
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     5970 non-null   int64 
 1   arrest            5970 non-null   int64 
 2   bombing           5970 non-null   int64 
 3   infrastructure    5970 non-null   int64 
 4   surrender         5970 non-null   int64 
 5   seizure           5970 non-null   int64 
 6   abduction         5970 non-null   int64 
 7   incident_summary  5970 non-null   object
dtypes: int64(7), object(1)
memory usage: 419.8+ KB


armed_assault     0.359464
arrest            0.306700
bombing           0.114740
infrastructure    0.110385
surrender         0.077889
seizure           0.215913
abduction         0.047571
dtype: float64

In [7]:
# inspect validation set
val_df.info()

# inspect the frequencies of the different categories
val_label_data = val_df.iloc[:, :7] # save columns with labels in an object
val_frequencies = val_label_data.sum() / len(val_label_data) # calculate the frequency of each action

val_frequencies

<class 'pandas.core.frame.DataFrame'>
Index: 1970 entries, 1 to 9919
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     1970 non-null   int64 
 1   arrest            1970 non-null   int64 
 2   bombing           1970 non-null   int64 
 3   infrastructure    1970 non-null   int64 
 4   surrender         1970 non-null   int64 
 5   seizure           1970 non-null   int64 
 6   abduction         1970 non-null   int64 
 7   incident_summary  1970 non-null   object
dtypes: int64(7), object(1)
memory usage: 138.5+ KB


armed_assault     0.362944
arrest            0.309645
bombing           0.115736
infrastructure    0.111675
surrender         0.078680
seizure           0.217766
abduction         0.047716
dtype: float64

In [8]:
# inspect test set
test_df.info()

# inspect the frequencies of the different categories
test_label_data = test_df.iloc[:, :7] # save columns with labels in an object
test_frequencies = test_label_data.sum() / len(test_label_data) # calculate the frequency of each action

test_frequencies


<class 'pandas.core.frame.DataFrame'>
Index: 1981 entries, 5 to 9913
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     1981 non-null   int64 
 1   arrest            1981 non-null   int64 
 2   bombing           1981 non-null   int64 
 3   infrastructure    1981 non-null   int64 
 4   surrender         1981 non-null   int64 
 5   seizure           1981 non-null   int64 
 6   abduction         1981 non-null   int64 
 7   incident_summary  1981 non-null   object
dtypes: int64(7), object(1)
memory usage: 139.3+ KB


armed_assault     0.360929
arrest            0.307925
bombing           0.115093
infrastructure    0.111055
surrender         0.078243
seizure           0.217062
abduction         0.047956
dtype: float64

## Set up custom data loader

In [9]:
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

  from .autonotebook import tqdm as notebook_tqdm


## Tokenize and load data

In [10]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_length = 512

# Training set
texts = train_df['incident_summary'].tolist() # text data to list
labels = train_df.drop(['incident_summary'], axis=1).values.tolist() # labels to list
train_dataset = CustomDataset(texts, labels, tokenizer, max_length)

# Validation set
texts = val_df['incident_summary'].tolist()
labels = val_df.drop(['incident_summary'], axis=1).values.tolist() # labels to list
val_dataset = CustomDataset(texts, labels, tokenizer, max_length)

# Test set
texts = test_df['incident_summary'].tolist()
labels = test_df.drop(['incident_summary'], axis=1).values.tolist() # labels to list
test_dataset = CustomDataset(texts, labels, tokenizer, max_length)

## Inspect training, validation and test datasets

In [14]:
# Inspect an example of the training set
import torch
print(train_dataset[1])


{'input_ids': tensor([  101,  1996, 28780,  1011, 15158,  2923, 28353,  6072, 18461,  4157,
        16016,  2075,  3197,  2012, 17170,  9692, 19736,  8943,  1998, 21877,
        29045,  7911,  3736,  4731,  1999,  1043,  1012,  1047,  1012,  2310,
         2098,  4048, 24373,  1998,  2165,  2185,  3053,  8698,  8641,  1997,
        21976,  4157, 13435,  2007,  2068,  1012,  1996,  2886,  2001,  3344,
         2041,  1999,  6186,  2114,  1996,  2285,  2676,  1010,  2294,  1011,
         4288,  1997,  2049,  2327,  4177,  1010, 11333,  2094,  2912,  5311,
        16469,  5302, 15859,  1998,  2010,  2564, 10556, 26605,  1010,  1999,
         1996,  4034,  2181,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [15]:
# Inspect an example of the validation set
import torch
print(val_dataset[1])

{'input_ids': tensor([  101,  3026, 28780,  1011, 15158,  2923,  1005, 14955,  4183,  4879,
         1005,  1998,  1005,  2430,  2837,  1005,  2266,  1998,  2708,  1997,
         2049,  1005,  2430,  1011,  2789,  3164,  4879,  1005,  8822,  1996,
         2576,  3821,  1999,  2030, 21205,  1998, 10381, 12707,  7315, 13484,
         1010,  1038,  1012, 24331,  2624, 21095, 14593,  6583,  6371,  1010,
         2003,  4727,  2012,  1038, 16102, 22648, 19531,  2213,  1999,  1996,
         1047,  3511,  2863,  2213,  2212,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [16]:
# Inspect an example of the test set
import torch
print(test_dataset[1])

{'input_ids': tensor([  101, 28780,  1011, 15158,  2923, 28353,  6072,  2128,  9080, 15070,
         2000,  1996,  4288,  1997,  2028,  1997,  2049,  3026,  4177, 21146,
        27362,  1998,  1037,  8396,  2266,  2379, 12849,  6820, 19648,  2850,
        17137,  5149,  2006,  2255,  2539,  1010,  2297,  1010,  2044,  2009,
         2730,  1037,  3128,  1997,  2310,  6906, 24516,  2213,  2352,  2624,
        16963,  3567, 10546,  1010,  2011,  9846,  2019,  6683,  6444,  1997,
         2624, 16963,  3567, 10546,  1005,  1055,  6259, 11972, 21934,  3270,
        18598,  3286,  2012, 19739, 11335,  2213,  3726,  2098,  4048,  2352,
         1999,  1043,  1012,  5506, 15916,  7068, 24373,  3831,  3131,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

## Run the model

In [17]:
# import libraries
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import hamming_loss

# define model
num_labels = 7
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = num_labels)

# add evaluation metrics

def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='samples')
    acc = accuracy_score(labels, preds)
    ham_loss = hamming_loss(labels, preds)  # Calculate Hamming Loss
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'hamming_loss': ham_loss  # Include Hamming Loss in the metrics
    }

# specify training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
)

# specify training args
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  
)

# train the model
trainer.train()

# evaluate the model on the validation set
trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 11%|█         | 500/4479 [02:52<22:41,  2.92it/s]

{'loss': 0.3267, 'grad_norm': 0.8730411529541016, 'learning_rate': 5e-05, 'epoch': 0.33}


 22%|██▏       | 1000/4479 [05:46<21:04,  2.75it/s]

{'loss': 0.1049, 'grad_norm': 3.9102416038513184, 'learning_rate': 4.3717014325207336e-05, 'epoch': 0.67}


  _warn_prf(average, modifier, msg_start, len(result))
                                                   
 33%|███▎      | 1493/4479 [09:28<18:52,  2.64it/s]

{'eval_loss': 0.08755964040756226, 'eval_accuracy': 0.867005076142132, 'eval_f1': 0.93165820642978, 'eval_precision': 0.9467851099830794, 'eval_recall': 0.9313028764805413, 'eval_hamming_loss': 0.025815808556925307, 'eval_runtime': 50.997, 'eval_samples_per_second': 38.63, 'eval_steps_per_second': 9.667, 'epoch': 1.0}


 33%|███▎      | 1500/4479 [09:31<1:47:20,  2.16s/it] 

{'loss': 0.0765, 'grad_norm': 0.106341652572155, 'learning_rate': 3.7434028650414677e-05, 'epoch': 1.0}


 45%|████▍     | 2000/4479 [12:25<14:22,  2.87it/s]  

{'loss': 0.0668, 'grad_norm': 0.5648437142372131, 'learning_rate': 3.115104297562202e-05, 'epoch': 1.34}


 56%|█████▌    | 2500/4479 [15:19<11:32,  2.86it/s]

{'loss': 0.0603, 'grad_norm': 0.649854838848114, 'learning_rate': 2.4868057300829358e-05, 'epoch': 1.67}


  _warn_prf(average, modifier, msg_start, len(result))
                                                   
 67%|██████▋   | 2986/4479 [18:55<07:25,  3.35it/s]

{'eval_loss': 0.0840606763958931, 'eval_accuracy': 0.8725888324873097, 'eval_f1': 0.9354653130287648, 'eval_precision': 0.9466159052453468, 'eval_recall': 0.938663282571912, 'eval_hamming_loss': 0.023640319071791153, 'eval_runtime': 47.0845, 'eval_samples_per_second': 41.84, 'eval_steps_per_second': 10.471, 'epoch': 2.0}


 67%|██████▋   | 3000/4479 [19:00<11:49,  2.08it/s]  

{'loss': 0.0645, 'grad_norm': 0.10161740332841873, 'learning_rate': 1.8585071626036695e-05, 'epoch': 2.01}


 78%|███████▊  | 3500/4479 [21:52<05:31,  2.95it/s]

{'loss': 0.0375, 'grad_norm': 0.14092226326465607, 'learning_rate': 1.2302085951244032e-05, 'epoch': 2.34}


 89%|████████▉ | 4000/4479 [24:44<02:41,  2.96it/s]

{'loss': 0.0334, 'grad_norm': 2.128748655319214, 'learning_rate': 6.01910027645137e-06, 'epoch': 2.68}


  _warn_prf(average, modifier, msg_start, len(result))
                                                   
100%|██████████| 4479/4479 [28:16<00:00,  2.64it/s]


{'eval_loss': 0.07924684137105942, 'eval_accuracy': 0.882741116751269, 'eval_f1': 0.9403553299492385, 'eval_precision': 0.9507614213197969, 'eval_recall': 0.9429780033840947, 'eval_hamming_loss': 0.02189992748368383, 'eval_runtime': 47.1804, 'eval_samples_per_second': 41.755, 'eval_steps_per_second': 10.449, 'epoch': 3.0}
{'train_runtime': 1696.009, 'train_samples_per_second': 10.56, 'train_steps_per_second': 2.641, 'train_loss': 0.09086231249574839, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 493/493 [00:47<00:00, 10.32it/s]


{'eval_loss': 0.07924684137105942,
 'eval_accuracy': 0.882741116751269,
 'eval_f1': 0.9403553299492385,
 'eval_precision': 0.9507614213197969,
 'eval_recall': 0.9429780033840947,
 'eval_hamming_loss': 0.02189992748368383,
 'eval_runtime': 47.8796,
 'eval_samples_per_second': 41.145,
 'eval_steps_per_second': 10.297,
 'epoch': 3.0}

## Evaluate model on test dataset

In [18]:
trainer.evaluate(test_dataset)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 496/496 [00:48<00:00, 10.22it/s]


{'eval_loss': 0.07873371243476868,
 'eval_accuracy': 0.9015648662291772,
 'eval_f1': 0.9452633350159851,
 'eval_precision': 0.9543160020191822,
 'eval_recall': 0.9464075382803298,
 'eval_hamming_loss': 0.019470685800822094,
 'eval_runtime': 49.3641,
 'eval_samples_per_second': 40.13,
 'eval_steps_per_second': 10.048,
 'epoch': 3.0}

## Save the model

In [19]:
model.save_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type')
tokenizer.save_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type')

('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type/tokenizer_config.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type/special_tokens_map.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type/vocab.txt',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type/added_tokens.json')

## To use the model again

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type')
tokenizer = AutoTokenizer.from_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/action-type')