## Import and clean the data

The data here are coming from the earlier zero-shot model.

In [36]:
# import pandas
import pandas as pd

# import data
satp_df = pd.read_csv('data/satp_zero_shot.csv')

# remove unnecessary columns
satp_df = satp_df.drop(['predicted_label', 'confidence'], axis=1)

# view dataset info
satp_df.info()
satp_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   incident_number   9913 non-null   float64
 1   incident_summary  9914 non-null   object 
 2   original_label    9914 non-null   object 
dtypes: float64(1), object(2)
memory usage: 232.5+ KB


Unnamed: 0,incident_number,incident_summary,original_label
0,101010701.0,An alleged arms supplier to the Communist Part...,action undertaken by security forces
1,101010901.0,A Kamareddy dalam (squad) member belonging to ...,action undertaken by maoist insurgents
2,101030601.0,Senior CPI-Maoist 'Polit Bureau' and 'central ...,action undertaken by security forces
3,101051602.0,A TDP leader and former Sarpanch of Jerrela Gr...,action undertaken by maoist insurgents
4,101060701.0,The CPI-Maoist cadres blasted coffee pulping u...,action undertaken by maoist insurgents


## Encode labels

In [37]:
# import label encoder
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the encoder to the labels and transform them into numeric ids
satp_df['labels'] = label_encoder.fit_transform(satp_df['original_label'])

# view dataset info
satp_df.info()
satp_df.head()
satp_df['labels'].unique()
satp_df[satp_df['labels'] == 2].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   incident_number   9913 non-null   float64
 1   incident_summary  9914 non-null   object 
 2   original_label    9914 non-null   object 
 3   labels            9914 non-null   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 309.9+ KB


Unnamed: 0,incident_number,incident_summary,original_label,labels
15,101110701.0,Andhra Pradesh Police killed two CPI-Maoist ca...,unclear who initiated action,2
19,101130701.0,The Nalgonda District Police killed four CPI-M...,unclear who initiated action,2
21,101131201.0,"A former Maoist female cadre, Veeramalla Pushp...",unclear who initiated action,2
24,101150901.0,A senior cadre and District committee secretar...,unclear who initiated action,2
29,101170801.0,Two CPI-Maoist cadres were killed in an encoun...,unclear who initiated action,2


## Apply train-test split

In [38]:
# import train-test split
from sklearn.model_selection import train_test_split

# select examples
train_examples_cat1 = satp_df[satp_df['labels'] == 0].sample(n=15, random_state=42)
train_examples_cat2 = satp_df[satp_df['labels'] == 1].sample(n=15, random_state=42)
train_examples_cat3 = satp_df[satp_df['labels'] == 2].sample(n=20, random_state=42)

# combine examples into a single data frame
train_df = pd.concat([train_examples_cat1, train_examples_cat2, train_examples_cat3])

# remove training examples from the original dataset
remaining_df = satp_df.drop(train_df.index)

# splitting the remaining examples into validation and test sets
val_df, test_df = train_test_split(remaining_df, stratify=remaining_df['labels'], test_size=0.5, random_state=42)

# view data
train_df.info()
test_df.info()
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 7179 to 15
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   incident_number   50 non-null     float64
 1   incident_summary  50 non-null     object 
 2   original_label    50 non-null     object 
 3   labels            50 non-null     int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 2.0+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 4932 entries, 6452 to 9094
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   incident_number   4932 non-null   float64
 1   incident_summary  4932 non-null   object 
 2   original_label    4932 non-null   object 
 3   labels            4932 non-null   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 192.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 4932 entries, 3007 to 8231
Data columns (total 4 columns):
 #

## Tokenize textual data

In [39]:
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['incident_summary'], padding="max_length", truncation=True)

train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

Map: 100%|██████████| 50/50 [00:00<00:00, 3294.82 examples/s]
Map: 100%|██████████| 4932/4932 [00:00<00:00, 10997.54 examples/s]
Map: 100%|██████████| 4932/4932 [00:00<00:00, 11746.92 examples/s]


## Run the model

In [40]:
# import libraries
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# define model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))
#model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# add evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# specify training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
)

# specify training args
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Add this line to include evaluation metrics
)

# train the model
trainer.train()

# evaluate the model on the validation set
trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
                                               
 33%|███▎      | 13/39 [02:07<00:08,  3.23it/s]    

{'eval_loss': 1.0971784591674805, 'eval_accuracy': 0.41585563665855635, 'eval_f1': 0.3173054447128347, 'eval_precision': 0.39965557282889946, 'eval_recall': 0.41585563665855635, 'eval_runtime': 123.275, 'eval_samples_per_second': 40.008, 'eval_steps_per_second': 10.002, 'epoch': 1.0}


                                               
 67%|██████▋   | 26/39 [04:13<00:10,  1.23it/s]    

{'eval_loss': 1.1030911207199097, 'eval_accuracy': 0.2840632603406326, 'eval_f1': 0.25252947304732376, 'eval_precision': 0.3596577596312094, 'eval_recall': 0.2840632603406326, 'eval_runtime': 121.2052, 'eval_samples_per_second': 40.691, 'eval_steps_per_second': 10.173, 'epoch': 2.0}


                                               
100%|██████████| 39/39 [06:18<00:00,  9.70s/it]    


{'eval_loss': 1.1103203296661377, 'eval_accuracy': 0.13422546634225466, 'eval_f1': 0.13391498027427287, 'eval_precision': 0.5234416685211463, 'eval_recall': 0.13422546634225466, 'eval_runtime': 120.7204, 'eval_samples_per_second': 40.855, 'eval_steps_per_second': 10.214, 'epoch': 3.0}
{'train_runtime': 378.4163, 'train_samples_per_second': 0.396, 'train_steps_per_second': 0.103, 'train_loss': 1.095693832788712, 'epoch': 3.0}


100%|██████████| 1233/1233 [01:58<00:00, 10.44it/s]


{'eval_loss': 1.1103203296661377,
 'eval_accuracy': 0.13422546634225466,
 'eval_f1': 0.13391498027427287,
 'eval_precision': 0.5234416685211463,
 'eval_recall': 0.13422546634225466,
 'eval_runtime': 118.2419,
 'eval_samples_per_second': 41.711,
 'eval_steps_per_second': 10.428,
 'epoch': 3.0}

## Evaluate the model on the test set

In [41]:
trainer.evaluate(test_dataset)

100%|██████████| 1233/1233 [01:58<00:00, 10.38it/s]


{'eval_loss': 1.1097095012664795,
 'eval_accuracy': 0.14030819140308193,
 'eval_f1': 0.14181359946531785,
 'eval_precision': 0.6406907536398291,
 'eval_recall': 0.14030819140308193,
 'eval_runtime': 119.1435,
 'eval_samples_per_second': 41.395,
 'eval_steps_per_second': 10.349,
 'epoch': 3.0}

## Save the model...

In [42]:
model.save_pretrained('./fine-tune-perpetrator-50')
tokenizer.save_pretrained('./fine-tune-perpetrator-50')

('./fine-tune-perpetrator-50/tokenizer_config.json',
 './fine-tune-perpetrator-50/special_tokens_map.json',
 './fine-tune-perpetrator-50/vocab.txt',
 './fine-tune-perpetrator-50/added_tokens.json',
 './fine-tune-perpetrator-50/tokenizer.json')

## To use model again...

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained('./few_shot_model')
tokenizer = AutoTokenizer.from_pretrained('./few_shot_model')