## Import and inspect the data

In [25]:
# pandas supplies the DataFrame used for all tabular handling in this notebook
import pandas as pd

# load the labelled incident summaries (path is relative to the working directory)
perp_df = pd.read_csv('data/perpetrator.csv')

# print the column/dtype summary, then display the first rows as the cell output
perp_df.info()
perp_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   perpetrator       9914 non-null   object
 1   incident_summary  9914 non-null   object
dtypes: object(2)
memory usage: 155.0+ KB


Unnamed: 0,perpetrator,incident_summary
0,Security,An alleged arms supplier to the Communist Part...
1,Maoist,A Kamareddy dalam (squad) member belonging to ...
2,Security,Senior CPI-Maoist 'Polit Bureau' and 'central ...
3,Maoist,A TDP leader and former Sarpanch of Jerrela Gr...
4,Maoist,The CPI-Maoist cadres blasted coffee pulping u...


## Encode labels

In [26]:
# import label encoder
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the encoder to the perpetrator names and store the resulting numeric ids
# in a new `labels` column. The source column must come from `perp_df` -- the
# only DataFrame this notebook loads -- so the cell runs on a fresh kernel.
perp_df['labels'] = label_encoder.fit_transform(perp_df['perpetrator'])

# view dataset info: `labels` should now appear as a third, integer column
perp_df.info()
perp_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   perpetrator       9914 non-null   object
 1   incident_summary  9914 non-null   object
 2   labels            9914 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 232.5+ KB


Unnamed: 0,perpetrator,incident_summary,labels
0,Security,An alleged arms supplier to the Communist Part...,1
1,Maoist,A Kamareddy dalam (squad) member belonging to ...,0
2,Security,Senior CPI-Maoist 'Polit Bureau' and 'central ...,1
3,Maoist,A TDP leader and former Sarpanch of Jerrela Gr...,0
4,Maoist,The CPI-Maoist cadres blasted coffee pulping u...,0


In [27]:
# explore labels: list the distinct encoded ids present in the `labels` column
perp_df['labels'].unique()

array([1, 0, 2])

In [28]:
# make sure "unknown" category is right: every row encoded as 2 should show
# 'Unknown' in the perpetrator column. The mask is built from `perp_df` itself
# so the cell does not depend on a frame that this notebook never defines.
perp_df.loc[perp_df['labels'] == 2].head()

Unnamed: 0,perpetrator,incident_summary,labels
15,Unknown,Andhra Pradesh Police killed two CPI-Maoist ca...,2
19,Unknown,The Nalgonda District Police killed four CPI-M...,2
21,Unknown,"A former Maoist female cadre, Veeramalla Pushp...",2
24,Unknown,A senior cadre and District committee secretar...,2
29,Unknown,Two CPI-Maoist cadres were killed in an encoun...,2


## Apply train-test split

In [29]:
from sklearn.model_selection import train_test_split

# X keeps every column except the encoded target; y is the encoded target itself
X = perp_df.drop(columns=['labels'])
y = perp_df['labels']

# hold out a stratified 20% of all rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# carve the validation set out of what remains, again stratified by label
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
    stratify=y_train_val,
)

# Note: `test_size=0.25` in the second split applies to the remaining 80%,
# so validation ends up as 20% of the full data and training as 60%
# (i.e. a 60/20/20 train/validation/test split overall).


## Recombine features and labels into training, validation and test sets

In [35]:
# Glue each split's feature columns back together with its label column,
# producing one DataFrame per split (train, validation, test -- in that order).
train_df, val_df, test_df = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [42]:
# check that the label proportions match across the three datasets

# Helper that reports the share of each label within one dataset
def print_label_distribution(df, dataset_name):
    """Print the normalized frequency of each value in ``df['labels']``.

    Parameters
    ----------
    df : DataFrame-like object with a ``'labels'`` column.
    dataset_name : name inserted into the printed heading.

    Returns
    -------
    None. The heading, the proportions and a blank-line spacer are written
    to standard output.
    """
    proportions = df['labels'].value_counts(normalize=True)  # fractions, not raw counts
    heading = f"Label distribution in {dataset_name}:"
    # One print with a newline separator emits the same text as three separate prints
    print(heading, proportions, "\n", sep="\n")

# Calculate and print the label distribution for each dataset; because both
# splits were stratified on the label, the three reports should be near-identical
print_label_distribution(train_df, "training")
print_label_distribution(val_df, "validation")
print_label_distribution(test_df, "test")



Label distribution in training:
labels
0    0.523537
1    0.429892
2    0.046570
Name: proportion, dtype: float64


Label distribution in validation:
labels
0    0.523449
1    0.430156
2    0.046394
Name: proportion, dtype: float64


Label distribution in test:
labels
0    0.523449
1    0.430156
2    0.046394
Name: proportion, dtype: float64




## Tokenize textual data

In [31]:
from transformers import AutoTokenizer
from datasets import Dataset

# tokenizer matching the DistilBERT checkpoint that is fine-tuned later on
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    """Tokenize the ``incident_summary`` field of a batch of examples.

    ``examples`` is indexed by column name; the summaries are passed to the
    module-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True``, and the tokenizer's result is returned unchanged.
    """
    summaries = examples['incident_summary']
    return tokenizer(summaries, padding="max_length", truncation=True)

# Convert each split to a `Dataset` and tokenize it in batches
# (processed in order: train, validation, test).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df).map(tokenize_function, batched=True)
    for split_df in (train_df, val_df, test_df)
)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 5948/5948 [00:00<00:00, 10744.99 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 11809.34 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 8047.89 examples/s]


## Inspect training, validation and test sets

In [32]:
# Inspect an example of the training set (index 1): the original columns plus
# the fields added by tokenization
# NOTE(review): torch is imported here but not referenced in any visible cell -- confirm it is still needed
import torch
print(train_dataset[1])


{'perpetrator': 'Security', 'incident_summary': 'Aurangabad Police of Bihar arrested three wanted CPI-Maoist cadres from different localities under Daudnagar Sub-Division of the same district. Daudnagar SDPO Sanjay Kumar said on the basis of intelligence inputs, a Police team from Goh, Khudowa and Uphara Police Stations carried out raids at several hideouts and arrested three Maoists who have been identified as Sambhu Yadav, Dharmdeo Yadav and Raja Uadav.', 'labels': 1, '__index_level_0__': 1291, 'input_ids': [101, 15240, 13807, 9024, 2610, 1997, 16178, 4727, 2093, 2359, 28780, 1011, 15158, 2923, 28353, 6072, 2013, 2367, 19664, 2104, 4830, 6784, 14346, 4942, 1011, 2407, 1997, 1996, 2168, 2212, 1012, 4830, 6784, 14346, 17371, 6873, 29590, 9600, 2056, 2006, 1996, 3978, 1997, 4454, 20407, 1010, 1037, 2610, 2136, 2013, 2175, 2232, 1010, 1047, 6979, 3527, 4213, 1998, 2039, 11077, 2610, 3703, 3344, 2041, 11217, 2012, 2195, 29588, 2015, 1998, 4727, 2093, 15158, 5130, 2040, 2031, 2042, 4453, 2

In [33]:
# Inspect an example of the validation set (index 1), including its tokenized fields
print(val_dataset[1])


{'perpetrator': 'Maoist', 'incident_summary': 'CPI-Maoist cadres killed two persons, Gopiram Koreti and Rajendra Koreti, in Khargaon village in Rajnandgaon District. The victims were abducted from their village earlier this week.', 'labels': 0, '__index_level_0__': 4190, 'input_ids': [101, 28780, 1011, 15158, 2923, 28353, 6072, 2730, 2048, 5381, 1010, 2175, 8197, 6444, 12849, 13465, 2072, 1998, 11948, 19524, 12849, 13465, 2072, 1010, 1999, 1047, 8167, 27073, 2352, 1999, 11948, 7229, 2094, 27073, 2212, 1012, 1996, 5694, 2020, 20361, 2013, 2037, 2352, 3041, 2023, 2733, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [34]:
# Inspect an example of the test set (index 1), including its tokenized fields
print(test_dataset[1])


{'perpetrator': 'Security', 'incident_summary': "Ravindra Kumar, a BJP member in Koderma District, arrested for having links with LWEs. Ravindra Kumar alias Raju Mehta, is the proprietor of 'Sarang Explosive' and said to be an active member of the BJP at Phulwaria of Domchanch in Koderma District", 'labels': 1, '__index_level_0__': 6069, 'input_ids': [101, 16806, 17670, 9600, 1010, 1037, 24954, 2266, 1999, 12849, 4063, 2863, 2212, 1010, 4727, 2005, 2383, 6971, 2007, 1048, 18192, 1012, 16806, 17670, 9600, 14593, 25098, 2033, 22893, 1010, 2003, 1996, 21584, 1997, 1005, 7354, 3070, 11355, 1005, 1998, 2056, 2000, 2022, 2019, 3161, 2266, 1997, 1996, 24954, 2012, 6887, 5313, 20031, 2050, 1997, 14383, 14856, 2818, 1999, 12849, 4063, 2863, 2212, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Fine-tune the model

In [38]:
# import libraries
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# define model: load the pretrained DistilBERT checkpoint for sequence classification.
# `num_labels` is taken from the fitted label encoder, so the output size always
# matches the number of perpetrator classes instead of being hard-coded.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# evaluation metrics reported by the Trainer
def compute_metrics(pred):
    """Turn a prediction object into a dict of classification metrics.

    ``pred`` must expose ``label_ids`` (the true labels) and ``predictions``
    (per-class scores; the predicted class is the argmax over the last axis).

    Returns a dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
    ``'recall'``; precision, recall and F1 are computed with
    ``average='weighted'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # the fourth returned element is not needed for these metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

# specify training arguments
training_args = TrainingArguments(
    output_dir='./results',  # checkpoints land here (see the checkpoint-N messages in the training log)
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
)

# build the trainer: ties the model and training arguments to the training
# split, with the validation split used for the per-epoch evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # adds accuracy/F1/precision/recall to each evaluation
)

# train the model
trainer.train()

# evaluate the model on the validation set (the trainer's default eval_dataset)
trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 11%|█         | 500/4461 [02:54<22:18,  2.96it/s]Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.5045, 'grad_norm': 15.783828735351562, 'learning_rate': 5e-05, 'epoch': 0.34}


 22%|██▏       | 1000/4461 [05:44<19:29,  2.96it/s]Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.288, 'grad_norm': 0.3013747036457062, 'learning_rate': 4.368846250946731e-05, 'epoch': 0.67}


 33%|███▎      | 1487/4461 [08:31<16:41,  2.97it/s]
 33%|███▎      | 1487/4461 [09:24<16:41,  2.97it/s]

{'eval_loss': 0.2643006443977356, 'eval_accuracy': 0.9440242057488654, 'eval_f1': 0.9450847601810747, 'eval_precision': 0.9473588150555086, 'eval_recall': 0.9440242057488654, 'eval_runtime': 53.5722, 'eval_samples_per_second': 37.015, 'eval_steps_per_second': 9.259, 'epoch': 1.0}


 34%|███▎      | 1500/4461 [09:29<27:48,  1.77it/s]   Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.2637, 'grad_norm': 0.0749640166759491, 'learning_rate': 3.7376925018934615e-05, 'epoch': 1.01}


 45%|████▍     | 2000/4461 [12:20<13:52,  2.96it/s]Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.2215, 'grad_norm': 0.13457365334033966, 'learning_rate': 3.106538752840192e-05, 'epoch': 1.34}


 56%|█████▌    | 2500/4461 [15:10<11:03,  2.96it/s]Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.2188, 'grad_norm': 0.10632456094026566, 'learning_rate': 2.4753850037869224e-05, 'epoch': 1.68}


                                                   
 67%|██████▋   | 2974/4461 [18:40<08:21,  2.96it/s]

{'eval_loss': 0.2546033263206482, 'eval_accuracy': 0.9525970751386788, 'eval_f1': 0.952250169238803, 'eval_precision': 0.9520659810714699, 'eval_recall': 0.9525970751386788, 'eval_runtime': 48.0779, 'eval_samples_per_second': 41.246, 'eval_steps_per_second': 10.317, 'epoch': 2.0}


 67%|██████▋   | 3000/4461 [18:49<08:19,  2.93it/s]  Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.1431, 'grad_norm': 3.239152193069458, 'learning_rate': 1.8442312547336533e-05, 'epoch': 2.02}


 78%|███████▊  | 3500/4461 [21:41<05:24,  2.96it/s]Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.149, 'grad_norm': 0.06098836660385132, 'learning_rate': 1.2130775056803838e-05, 'epoch': 2.35}


 90%|████████▉ | 4000/4461 [24:33<02:35,  2.96it/s]Checkpoint destination directory ./results/checkpoint-4000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0853, 'grad_norm': 0.017973819747567177, 'learning_rate': 5.819237566271144e-06, 'epoch': 2.69}


                                                   
100%|██████████| 4461/4461 [28:01<00:00,  2.65it/s]


{'eval_loss': 0.26060399413108826, 'eval_accuracy': 0.9531013615733737, 'eval_f1': 0.9528084723154996, 'eval_precision': 0.9526091265778872, 'eval_recall': 0.9531013615733737, 'eval_runtime': 48.3782, 'eval_samples_per_second': 40.99, 'eval_steps_per_second': 10.253, 'epoch': 3.0}
{'train_runtime': 1681.7464, 'train_samples_per_second': 10.61, 'train_steps_per_second': 2.653, 'train_loss': 0.22420726605953856, 'epoch': 3.0}


100%|██████████| 496/496 [00:47<00:00, 10.45it/s]


{'eval_loss': 0.26060399413108826,
 'eval_accuracy': 0.9531013615733737,
 'eval_f1': 0.9528084723154996,
 'eval_precision': 0.9526091265778872,
 'eval_recall': 0.9531013615733737,
 'eval_runtime': 47.585,
 'eval_samples_per_second': 41.673,
 'eval_steps_per_second': 10.423,
 'epoch': 3.0}

## Evaluate the model on the test set

In [40]:
# score the fine-tuned model on the test split, which was not passed to the
# Trainer as either the training or the evaluation dataset
trainer.evaluate(test_dataset)

100%|██████████| 496/496 [00:47<00:00, 10.37it/s]


{'eval_loss': 0.25290557742118835,
 'eval_accuracy': 0.9515885022692889,
 'eval_f1': 0.9518842404484158,
 'eval_precision': 0.9522573284725719,
 'eval_recall': 0.9515885022692889,
 'eval_runtime': 48.0274,
 'eval_samples_per_second': 41.289,
 'eval_steps_per_second': 10.327,
 'epoch': 3.0}

## Save the model...

In [41]:
# Directory that receives both the model weights and the tokenizer files.
# NOTE(review): this is an absolute, machine-specific path -- adjust before running elsewhere.
save_dir = '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator'

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/tokenizer_config.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/special_tokens_map.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/vocab.txt',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/added_tokens.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/tokenizer.json')

## To use the model again...

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Directory the fine-tuned model and tokenizer were saved to.
# NOTE(review): this is an absolute, machine-specific path -- adjust before running elsewhere.
saved_dir = '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator'

model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)