## Load and inspect data

In [1]:
# import pandas
import pandas as pd

# load the data
csv_path = "data/action_type.csv"
action_df = pd.read_csv(csv_path)

In [2]:
# inspect the data
action_df.info()
action_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9921 entries, 0 to 9920
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     9921 non-null   int64 
 1   arrest            9921 non-null   int64 
 2   bombing           9921 non-null   int64 
 3   infrastructure    9921 non-null   int64 
 4   surrender         9921 non-null   int64 
 5   seizure           9921 non-null   int64 
 6   abduction         9921 non-null   int64 
 7   incident_summary  9921 non-null   object
dtypes: int64(7), object(1)
memory usage: 620.2+ KB


Unnamed: 0,armed_assault,arrest,bombing,infrastructure,surrender,seizure,abduction,incident_summary
0,0,1,0,0,0,0,0,An alleged arms supplier to the Communist Part...
1,0,0,0,0,1,0,0,A Kamareddy dalam (squad) member belonging to ...
2,0,1,0,0,0,0,0,Senior CPI-Maoist 'Polit Bureau' and 'central ...
3,1,0,0,0,0,0,0,A TDP leader and former Sarpanch of Jerrela Gr...
4,0,0,1,1,0,0,0,The CPI-Maoist cadres blasted coffee pulping u...


In [3]:
# check for missing values
action_df.isnull().sum()

armed_assault       0
arrest              0
bombing             0
infrastructure      0
surrender           0
seizure             0
abduction           0
incident_summary    0
dtype: int64

In [4]:
# inspect the frequencies of the different categories
label_data = action_df.iloc[:, :7] # save columns with labels in an object
frequencies = label_data.sum() / len(label_data) # calculate the frequency of each action

frequencies

armed_assault     0.360448
arrest            0.307529
bombing           0.115009
infrastructure    0.110775
surrender         0.078117
seizure           0.216510
abduction         0.047677
dtype: float64

## Add labels column to data frame

In [5]:
# take values of all columns except the incident summary and combine them into a new vector/list
action_df['labels'] = action_df[['armed_assault', 
                                 'arrest', 
                                 'bombing', 
                                 'infrastructure', 
                                 'surrender', 
                                 'seizure', 
                                 'abduction']].values.tolist()

# inspect
action_df.info()
action_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9921 entries, 0 to 9920
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     9921 non-null   int64 
 1   arrest            9921 non-null   int64 
 2   bombing           9921 non-null   int64 
 3   infrastructure    9921 non-null   int64 
 4   surrender         9921 non-null   int64 
 5   seizure           9921 non-null   int64 
 6   abduction         9921 non-null   int64 
 7   incident_summary  9921 non-null   object
 8   labels            9921 non-null   object
dtypes: int64(7), object(2)
memory usage: 697.7+ KB


Unnamed: 0,armed_assault,arrest,bombing,infrastructure,surrender,seizure,abduction,incident_summary,labels
0,0,1,0,0,0,0,0,An alleged arms supplier to the Communist Part...,"[0, 1, 0, 0, 0, 0, 0]"
1,0,0,0,0,1,0,0,A Kamareddy dalam (squad) member belonging to ...,"[0, 0, 0, 0, 1, 0, 0]"
2,0,1,0,0,0,0,0,Senior CPI-Maoist 'Polit Bureau' and 'central ...,"[0, 1, 0, 0, 0, 0, 0]"
3,1,0,0,0,0,0,0,A TDP leader and former Sarpanch of Jerrela Gr...,"[1, 0, 0, 0, 0, 0, 0]"
4,0,0,1,1,0,0,0,The CPI-Maoist cadres blasted coffee pulping u...,"[0, 0, 1, 1, 0, 0, 0]"


## Apply train-test split

The classes are imbalanced, so we need to apply `MultilabelStratifiedShuffleSplit` from the `iterative-stratification` library.

In [6]:
# import MultiLabelStratifiedShuffleSplit from iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np

# stratified split to get training+validation and test datasets
msss_initial = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
labels_matrix = action_df[['armed_assault', 'arrest', 'bombing', 'infrastructure', 'surrender', 'seizure', 'abduction']].values

for train_val_index, test_index in msss_initial.split(np.zeros(len(action_df)), labels_matrix):
    train_val_df = action_df.iloc[train_val_index]
    test_df = action_df.iloc[test_index]

# further stratified split to divide training+validation into actual training and validation sets
msss_secondary = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

labels_matrix_train_val = train_val_df[['armed_assault', 'arrest', 'bombing', 'infrastructure', 'surrender', 'seizure', 'abduction']].values

for train_index, val_index in msss_secondary.split(np.zeros(len(train_val_df)), labels_matrix_train_val):
    train_df = train_val_df.iloc[train_index]
    val_df = train_val_df.iloc[val_index]

# 

In [7]:
# inspect training set
train_df.info()

# inspect the frequencies of the different categories
train_label_data = train_df.iloc[:, :7] # save columns with labels in an object
train_frequencies = train_label_data.sum() / len(train_label_data) # calculate the frequency of each action

train_frequencies


<class 'pandas.core.frame.DataFrame'>
Index: 5970 entries, 0 to 9920
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     5970 non-null   int64 
 1   arrest            5970 non-null   int64 
 2   bombing           5970 non-null   int64 
 3   infrastructure    5970 non-null   int64 
 4   surrender         5970 non-null   int64 
 5   seizure           5970 non-null   int64 
 6   abduction         5970 non-null   int64 
 7   incident_summary  5970 non-null   object
 8   labels            5970 non-null   object
dtypes: int64(7), object(2)
memory usage: 466.4+ KB


armed_assault     0.359464
arrest            0.306700
bombing           0.114740
infrastructure    0.110385
surrender         0.077889
seizure           0.215913
abduction         0.047571
dtype: float64

In [8]:
# inspect validation set
val_df.info()

# inspect the frequencies of the different categories
val_label_data = val_df.iloc[:, :7] # save columns with labels in an object
val_frequencies = val_label_data.sum() / len(val_label_data) # calculate the frequency of each action

val_frequencies

<class 'pandas.core.frame.DataFrame'>
Index: 1970 entries, 1 to 9919
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     1970 non-null   int64 
 1   arrest            1970 non-null   int64 
 2   bombing           1970 non-null   int64 
 3   infrastructure    1970 non-null   int64 
 4   surrender         1970 non-null   int64 
 5   seizure           1970 non-null   int64 
 6   abduction         1970 non-null   int64 
 7   incident_summary  1970 non-null   object
 8   labels            1970 non-null   object
dtypes: int64(7), object(2)
memory usage: 153.9+ KB


armed_assault     0.362944
arrest            0.309645
bombing           0.115736
infrastructure    0.111675
surrender         0.078680
seizure           0.217766
abduction         0.047716
dtype: float64

In [9]:
# inspect test set
test_df.info()

# inspect the frequencies of the different categories
test_label_data = test_df.iloc[:, :7] # save columns with labels in an object
test_frequencies = test_label_data.sum() / len(test_label_data) # calculate the frequency of each action

test_frequencies


<class 'pandas.core.frame.DataFrame'>
Index: 1981 entries, 5 to 9913
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   armed_assault     1981 non-null   int64 
 1   arrest            1981 non-null   int64 
 2   bombing           1981 non-null   int64 
 3   infrastructure    1981 non-null   int64 
 4   surrender         1981 non-null   int64 
 5   seizure           1981 non-null   int64 
 6   abduction         1981 non-null   int64 
 7   incident_summary  1981 non-null   object
 8   labels            1981 non-null   object
dtypes: int64(7), object(2)
memory usage: 154.8+ KB


armed_assault     0.360929
arrest            0.307925
bombing           0.115093
infrastructure    0.111055
surrender         0.078243
seizure           0.217062
abduction         0.047956
dtype: float64

## Tokenize textual data

In [10]:
from datasets import Dataset
from transformers import AutoTokenizer

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Function to tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples['incident_summary'], padding="max_length", truncation=True)

# Convert to Hugging Face Dataset objects and apply tokenization
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 5970/5970 [00:00<00:00, 10812.61 examples/s]
Map: 100%|██████████| 1970/1970 [00:00<00:00, 8110.77 examples/s]
Map: 100%|██████████| 1981/1981 [00:00<00:00, 11411.54 examples/s]


## Inspect training, validation and test datasets

In [11]:
# Inspect the first few examples of the train_dataset
print(train_dataset[0:5])


{'armed_assault': [0, 1, 0, 0, 0], 'arrest': [1, 0, 0, 0, 0], 'bombing': [0, 0, 1, 0, 0], 'infrastructure': [0, 0, 1, 0, 1], 'surrender': [0, 0, 0, 1, 0], 'seizure': [0, 0, 0, 0, 0], 'abduction': [0, 0, 0, 0, 0], 'incident_summary': ['An alleged arms supplier to the Communist Party of India-Maoist (CPI-Maoist), identified as Ravi Kumar Chevori, was arrested from Cyberabad near Hyderabad. He had entered into a deal with the Maoists to supply arms and ammunition worth INR 40 lakh, which the city Police seized on December 28, 2006, and arrested three persons.', 'A TDP leader and former Sarpanch of Jerrela Gram Panchayat in Visakha Agency of Vishakhapatnam District', 'The CPI-Maoist cadres blasted coffee pulping units at Teegalabanda and Pedavalasa villages in G.K. Veedhi mandal and took away nearly 350 bags of graded coffee beans with them. The attack was carried out in protest against the December 27, 2006-killing of its top leaders, Wadkapur Chandramouli and his wife Karuna, in the agen

In [12]:
# Check the format and alignment of labels for the first few examples by slicing
for i in range(5):
    print(train_dataset[i]['labels'])

[0, 1, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 1, 0, 0, 0]
[0, 0, 0, 0, 1, 0, 0]
[0, 0, 0, 1, 0, 0, 0]


In [14]:
import torch
from torch.utils.data import DataLoader

# Convert Hugging Face datasets to PyTorch DataLoader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

for batch in train_loader:
    print(type(batch['input_ids']))  # Check the type of 'input_ids' in the batch
    print(type(batch['labels']))  # Check the type of 'labels' in the batch
    
    # Optionally, inspect the first few items to understand the structure
    print(batch['input_ids'][:2])  # Print the first 2 items of 'input_ids'
    print(batch['labels'][:2])  # Print the first 2 items of 'labels'
    
    break  # Only inspect the first batch


NameError: name 'DataLoader' is not defined

## Run the model

In [15]:
# import libraries
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import hamming_loss

# define model
num_labels = 7
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = num_labels)

# add evaluation metrics

def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='samples')
    acc = accuracy_score(labels, preds)
    ham_loss = hamming_loss(labels, preds)  # Calculate Hamming Loss
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'hamming_loss': ham_loss  # Include Hamming Loss in the metrics
    }

# specify training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
)

# specify training args
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  
)

# train the model
trainer.train()

# evaluate the model on the validation set
trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/4479 [00:00<?, ?it/s]

ValueError: Expected input batch_size (4) to match target batch_size (28).