## Import and clean the data

In [54]:
# import pandas
import pandas as pd

# import data
perp_df = pd.read_csv('data/perpetrator.csv')

# view dataset info
perp_df.info()
perp_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   perpetrator       9914 non-null   object
 1   incident_summary  9914 non-null   object
dtypes: object(2)
memory usage: 155.0+ KB


Unnamed: 0,perpetrator,incident_summary
0,Security,An alleged arms supplier to the Communist Part...
1,Maoist,A Kamareddy dalam (squad) member belonging to ...
2,Security,Senior CPI-Maoist 'Polit Bureau' and 'central ...
3,Maoist,A TDP leader and former Sarpanch of Jerrela Gr...
4,Maoist,The CPI-Maoist cadres blasted coffee pulping u...


## Encode labels

In [55]:
# import label encoder
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the encoder to the labels and transform them into numeric ids
perp_df['labels'] = label_encoder.fit_transform(perp_df['perpetrator'])

# view dataset info
perp_df.info()
perp_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   perpetrator       9914 non-null   object
 1   incident_summary  9914 non-null   object
 2   labels            9914 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 232.5+ KB


Unnamed: 0,perpetrator,incident_summary,labels
0,Security,An alleged arms supplier to the Communist Part...,1
1,Maoist,A Kamareddy dalam (squad) member belonging to ...,0
2,Security,Senior CPI-Maoist 'Polit Bureau' and 'central ...,1
3,Maoist,A TDP leader and former Sarpanch of Jerrela Gr...,0
4,Maoist,The CPI-Maoist cadres blasted coffee pulping u...,0


In [56]:
# explore labels
perp_df['labels'].unique()

array([1, 0, 2])

In [57]:
# make sure "unknown" category is right
perp_df.loc[perp_df['labels'] == 2].head()

Unnamed: 0,perpetrator,incident_summary,labels
15,Unknown,Andhra Pradesh Police killed two CPI-Maoist ca...,2
19,Unknown,The Nalgonda District Police killed four CPI-M...,2
21,Unknown,"A former Maoist female cadre, Veeramalla Pushp...",2
24,Unknown,A senior cadre and District committee secretar...,2
29,Unknown,Two CPI-Maoist cadres were killed in an encoun...,2


## Apply train-test split

In [58]:
from sklearn.model_selection import train_test_split

# separate text summaries and labels into X and y
X = perp_df.drop(['labels'], axis=1)  # text summaries
y = perp_df['labels']  # labels

# separate the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# separate the validation set from the training set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)

# Note: Adjusting `test_size` in the second split to 0.25 means that
# 25% of the training+validation set becomes the validation set,
# and 75% remains as the training set.


## Recombine data frames

In [59]:
# Recombine the features and labels into pandas DataFrames
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [60]:
# see if labels are distributed properly across three datasets 

# Function to calculate and print label distribution
def print_label_distribution(df, dataset_name):
    label_counts = df['labels'].value_counts(normalize=True)  # Get normalized counts
    print(f"Label distribution in {dataset_name}:")
    print(label_counts)
    print("\n")

# Calculate and print the label distribution for each dataset
print_label_distribution(train_df, "training")
print_label_distribution(val_df, "validation")
print_label_distribution(test_df, "test")



Label distribution in training:
labels
0    0.523537
1    0.429892
2    0.046570
Name: proportion, dtype: float64


Label distribution in validation:
labels
0    0.523449
1    0.430156
2    0.046394
Name: proportion, dtype: float64


Label distribution in test:
labels
0    0.523449
1    0.430156
2    0.046394
Name: proportion, dtype: float64




## Tokenize textual data

In [63]:
from transformers import AutoTokenizer
from datasets import Dataset

model_identifiers = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'eventdata-utd/ConfliBERT-scr-uncased'
]

def tokenize_function(examples, tokenizer):
    return tokenizer(examples['incident_summary'], padding="max_length", truncation=True, max_length=512)

# Assuming train_df, val_df, and test_df are defined
base_train_dataset = Dataset.from_pandas(train_df)
base_val_dataset = Dataset.from_pandas(val_df)
base_test_dataset = Dataset.from_pandas(test_df)

# Initialize a structure to hold the tokenized datasets for each model
tokenized_datasets = {}

for model_id in model_identifiers:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Tokenize the datasets for each model and store them in the dictionary
    tokenized_datasets[model_id] = {
        'train': base_train_dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True),
        'val': base_val_dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True),
        'test': base_test_dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)
    }

Map: 100%|██████████| 5948/5948 [00:00<00:00, 7433.98 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 8865.06 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 8893.70 examples/s]
Map: 100%|██████████| 5948/5948 [00:00<00:00, 11887.06 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 11575.17 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 11234.50 examples/s]
Map: 100%|██████████| 5948/5948 [00:00<00:00, 9196.55 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 9080.14 examples/s]
Map: 100%|██████████| 1983/1983 [00:00<00:00, 9000.35 examples/s]


## Inspect training, validation and test datasets

In [64]:
# inspect the first example of the training set

import torch
for model_id in model_identifiers:
    # access the first element of the training dataset for the current model
    first_element = tokenized_datasets[model_id]['train'][0]
    
    # print the first element
    print(f"First element of the training dataset for {model_id}:")
    print(first_element)
    print("\n")  # Adds a newline for better readability between model outputs


First element of the training dataset for bert-base-uncased:
{'perpetrator': 'Maoist', 'incident_summary': 'One BSF trooper was killed and three others were injured in a landmine explosion at Kone village in Latehar District. This was the third such incident in as many days. The BSF personnel and election staff were returning on foot in the morning when the landmine exploded. The explosion occurred after the Security Forces detected two other landmines and defused them.', 'labels': 0, '__index_level_0__': 5020, 'input_ids': [101, 2028, 18667, 2546, 28224, 2001, 2730, 1998, 2093, 2500, 2020, 5229, 1999, 1037, 2455, 11233, 7738, 2012, 12849, 2638, 2352, 1999, 2397, 8167, 2212, 1012, 2023, 2001, 1996, 2353, 2107, 5043, 1999, 2004, 2116, 2420, 1012, 1996, 18667, 2546, 5073, 1998, 2602, 3095, 2020, 4192, 2006, 3329, 1999, 1996, 2851, 2043, 1996, 2455, 11233, 9913, 1012, 1996, 7738, 4158, 2044, 1996, 3036, 2749, 11156, 2048, 2060, 2455, 11233, 2015, 1998, 13366, 13901, 2068, 1012, 102, 0, 0,

In [65]:
# inspect an example of the validation set

for model_id in model_identifiers:
    # access the first element of the training dataset for the current model
    first_element = tokenized_datasets[model_id]['val'][0]
    
    # print the first element
    print(f"First element of the training dataset for {model_id}:")
    print(first_element)
    print("\n")  # Adds a newline for better readability between model outputs


First element of the training dataset for bert-base-uncased:
{'perpetrator': 'Maoist', 'incident_summary': 'Maoists set ablaze five tractors, one JCB and a mixture machine at Palli Munda village in the Paikmal Police Station in Bargarh District.', 'labels': 0, '__index_level_0__': 8668, 'input_ids': [101, 15158, 5130, 2275, 11113, 24472, 2274, 28292, 1010, 2028, 29175, 2497, 1998, 1037, 8150, 3698, 2012, 14412, 3669, 14163, 8943, 2352, 1999, 1996, 6643, 5480, 9067, 2610, 2276, 1999, 3347, 13484, 2212, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [66]:
# inspect an example of the test set

for model_id in model_identifiers:
    # Access the first element of the training dataset for the current model
    first_element = tokenized_datasets[model_id]['test'][0]
    
    # Print the first element
    print(f"First element of the training dataset for {model_id}:")
    print(first_element)
    print("\n")  # Adds a newline for better readability between model outputs



First element of the training dataset for bert-base-uncased:
{'perpetrator': 'Maoist', 'incident_summary': 'The cadres of CPI-Maoist shot dead Raghu Singh (55), a prominent leader of Salwa Judum (anti-Maoist vigilante group), in Bhopalpatnam area of Bijapur', 'labels': 0, '__index_level_0__': 3555, 'input_ids': [101, 1996, 28353, 6072, 1997, 28780, 1011, 15158, 2923, 2915, 2757, 17768, 6979, 5960, 1006, 4583, 1007, 1010, 1037, 4069, 3003, 1997, 16183, 4213, 18414, 8566, 2213, 1006, 3424, 1011, 15158, 2923, 6819, 20142, 12956, 2177, 1007, 1010, 1999, 1038, 18471, 2389, 4502, 2102, 13129, 2181, 1997, 12170, 3900, 5311, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Define evaluation metrics

In [67]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Run the model

In [68]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# define number of labels
num_labels = 3

# initialize a dictionary to store evaluation metrics 
all_evaluation_metrics = {}

# loop through models
for model_id in model_identifiers:
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)
    
    # use the respective tokenized datasets for the current model
    train_dataset = tokenized_datasets[model_id]['train']
    val_dataset = tokenized_datasets[model_id]['val']

    # define training args
    training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    output_dir=f"./model_outputs/{model_id}",  
    logging_dir=f"./logs/{model_id}",  
    save_strategy="no",  # Disables saving model checkpoints
    )
    
    # set up trainer 
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    
    # train the model
    print(f"Training with {model_id}")
    trainer.train()

    # Directly capture the metrics from the final evaluation
    eval_metrics = trainer.evaluate()
    
    all_evaluation_metrics[model_id] = eval_metrics

# Convert the metrics dictionary to a DataFrame for visualization
df_metrics = pd.DataFrame.from_dict(all_evaluation_metrics, orient='index')

print(df_metrics)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  7%|▋         | 146/2232 [22:51:54<326:41:17, 563.80s/it]


Training with bert-base-uncased


 33%|███▎      | 744/2232 [13:55<24:18,  1.02it/s]

{'loss': 0.3611, 'grad_norm': 0.09154047816991806, 'learning_rate': 4.2956120092378757e-05, 'epoch': 1.0}


                                                  
 33%|███▎      | 744/2232 [15:11<24:18,  1.02it/s]

{'eval_loss': 0.2598115801811218, 'eval_accuracy': 0.9430156328794755, 'eval_f1': 0.9429188537074977, 'eval_precision': 0.9439485841009252, 'eval_recall': 0.9430156328794755, 'eval_runtime': 75.9651, 'eval_samples_per_second': 26.104, 'eval_steps_per_second': 3.265, 'epoch': 1.0}


 67%|██████▋   | 1488/2232 [28:47<12:01,  1.03it/s] 

{'loss': 0.1739, 'grad_norm': 0.0708642303943634, 'learning_rate': 2.1478060046189378e-05, 'epoch': 2.0}


                                                   
 67%|██████▋   | 1488/2232 [30:03<12:01,  1.03it/s]

{'eval_loss': 0.2406594604253769, 'eval_accuracy': 0.9500756429652042, 'eval_f1': 0.9496254506421651, 'eval_precision': 0.9494246541482343, 'eval_recall': 0.9500756429652042, 'eval_runtime': 75.2102, 'eval_samples_per_second': 26.366, 'eval_steps_per_second': 3.297, 'epoch': 2.0}


100%|██████████| 2232/2232 [43:35<00:00,  1.03it/s]  

{'loss': 0.1128, 'grad_norm': 0.17898958921432495, 'learning_rate': 0.0, 'epoch': 3.0}


                                                   
100%|██████████| 2232/2232 [44:51<00:00,  1.21s/it]


{'eval_loss': 0.209818035364151, 'eval_accuracy': 0.9556227937468482, 'eval_f1': 0.9557344095892589, 'eval_precision': 0.9559143456679965, 'eval_recall': 0.9556227937468482, 'eval_runtime': 75.222, 'eval_samples_per_second': 26.362, 'eval_steps_per_second': 3.297, 'epoch': 3.0}
{'train_runtime': 2691.0377, 'train_samples_per_second': 6.631, 'train_steps_per_second': 0.829, 'train_loss': 0.2159143024020725, 'epoch': 3.0}


100%|██████████| 248/248 [01:14<00:00,  3.32it/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Training with distilbert-base-uncased


 33%|███▎      | 744/2232 [07:23<12:53,  1.92it/s]

{'loss': 0.3386, 'grad_norm': 1.0955374240875244, 'learning_rate': 4.2956120092378757e-05, 'epoch': 1.0}


                                                  
 33%|███▎      | 744/2232 [08:07<12:53,  1.92it/s]

{'eval_loss': 0.22191715240478516, 'eval_accuracy': 0.951084215834594, 'eval_f1': 0.9511936823258561, 'eval_precision': 0.9519536652152895, 'eval_recall': 0.951084215834594, 'eval_runtime': 44.8493, 'eval_samples_per_second': 44.215, 'eval_steps_per_second': 5.53, 'epoch': 1.0}


 67%|██████▋   | 1488/2232 [15:30<06:24,  1.93it/s] 

{'loss': 0.1661, 'grad_norm': 0.09121710062026978, 'learning_rate': 2.1478060046189378e-05, 'epoch': 2.0}


                                                   
 67%|██████▋   | 1488/2232 [16:15<06:24,  1.93it/s]

{'eval_loss': 0.199649840593338, 'eval_accuracy': 0.9546142208774584, 'eval_f1': 0.9543327228033479, 'eval_precision': 0.9541297500093014, 'eval_recall': 0.9546142208774584, 'eval_runtime': 44.8309, 'eval_samples_per_second': 44.233, 'eval_steps_per_second': 5.532, 'epoch': 2.0}


100%|██████████| 2232/2232 [23:38<00:00,  1.94it/s]  

{'loss': 0.1017, 'grad_norm': 0.14623229205608368, 'learning_rate': 0.0, 'epoch': 3.0}


                                                   
100%|██████████| 2232/2232 [24:23<00:00,  1.53it/s]


{'eval_loss': 0.20655563473701477, 'eval_accuracy': 0.9561270801815431, 'eval_f1': 0.955872050544694, 'eval_precision': 0.9557121587657664, 'eval_recall': 0.9561270801815431, 'eval_runtime': 44.8449, 'eval_samples_per_second': 44.219, 'eval_steps_per_second': 5.53, 'epoch': 3.0}
{'train_runtime': 1463.2621, 'train_samples_per_second': 12.195, 'train_steps_per_second': 1.525, 'train_loss': 0.20210748761358227, 'epoch': 3.0}


100%|██████████| 248/248 [00:44<00:00,  5.57it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at eventdata-utd/ConfliBERT-scr-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Training with eventdata-utd/ConfliBERT-scr-uncased


 33%|███▎      | 744/2232 [13:30<23:32,  1.05it/s]

{'loss': 0.3679, 'grad_norm': 0.16638343036174774, 'learning_rate': 4.2956120092378757e-05, 'epoch': 1.0}


                                                  
 33%|███▎      | 744/2232 [14:45<23:32,  1.05it/s]

{'eval_loss': 0.19624817371368408, 'eval_accuracy': 0.9515885022692889, 'eval_f1': 0.951207273123725, 'eval_precision': 0.9510620274742683, 'eval_recall': 0.9515885022692889, 'eval_runtime': 75.1642, 'eval_samples_per_second': 26.382, 'eval_steps_per_second': 3.299, 'epoch': 1.0}


 67%|██████▋   | 1488/2232 [28:39<11:55,  1.04it/s] 

{'loss': 0.1779, 'grad_norm': 0.053008802235126495, 'learning_rate': 2.1478060046189378e-05, 'epoch': 2.0}


                                                   
 67%|██████▋   | 1488/2232 [29:56<11:55,  1.04it/s]

{'eval_loss': 0.23040008544921875, 'eval_accuracy': 0.9541099344427635, 'eval_f1': 0.9529990131458648, 'eval_precision': 0.9532275646494137, 'eval_recall': 0.9541099344427635, 'eval_runtime': 76.8299, 'eval_samples_per_second': 25.81, 'eval_steps_per_second': 3.228, 'epoch': 2.0}


100%|██████████| 2232/2232 [44:00<00:00,  1.01it/s]  

{'loss': 0.1163, 'grad_norm': 0.14526313543319702, 'learning_rate': 0.0, 'epoch': 3.0}


                                                   
100%|██████████| 2232/2232 [45:16<00:00,  1.22s/it]


{'eval_loss': 0.19265031814575195, 'eval_accuracy': 0.9591527987897126, 'eval_f1': 0.9590704264490462, 'eval_precision': 0.9591147973339961, 'eval_recall': 0.9591527987897126, 'eval_runtime': 75.9126, 'eval_samples_per_second': 26.122, 'eval_steps_per_second': 3.267, 'epoch': 3.0}
{'train_runtime': 2716.4511, 'train_samples_per_second': 6.569, 'train_steps_per_second': 0.822, 'train_loss': 0.22070582878632358, 'epoch': 3.0}


100%|██████████| 248/248 [01:16<00:00,  3.26it/s]

                                      eval_loss  eval_accuracy   eval_f1  \
bert-base-uncased                      0.209818       0.955623  0.955734   
distilbert-base-uncased                0.206556       0.956127  0.955872   
eventdata-utd/ConfliBERT-scr-uncased   0.192650       0.959153  0.959070   

                                      eval_precision  eval_recall  \
bert-base-uncased                           0.955914     0.955623   
distilbert-base-uncased                     0.955712     0.956127   
eventdata-utd/ConfliBERT-scr-uncased        0.959115     0.959153   

                                      eval_runtime  eval_samples_per_second  \
bert-base-uncased                          75.1027                   26.404   
distilbert-base-uncased                    44.6793                   44.383   
eventdata-utd/ConfliBERT-scr-uncased       76.4022                   25.955   

                                      eval_steps_per_second  epoch  
bert-base-uncased               




## Evaluate the model on the test set

In [53]:

# dictionary to store evaluation results
evaluation_results = {}

for model_id in model_identifiers:
    print(f"Evaluating {model_id} on the test dataset...")
    # retrieve the appropriate test dataset for the current model
    test_dataset = tokenized_datasets[model_id]['test']
    
    # evaluate the model on the test dataset
    result = trainers[model_id].evaluate(test_dataset)
    
    # store the results
    evaluation_results[model_id] = result

    # print the evaluation results for this model
    print(f"Results for {model_id}: {result}\n")


Evaluating bert-base-uncased on the test dataset...


NameError: name 'trainers' is not defined

## Save the model...

In [41]:
model.save_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator')
tokenizer.save_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator')

('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/tokenizer_config.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/special_tokens_map.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/vocab.txt',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/added_tokens.json',
 '/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator/tokenizer.json')

## To use model again...

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator')
tokenizer = AutoTokenizer.from_pretrained('/Users/ejt/Library/CloudStorage/Dropbox/Projects/code-satp-models/perpetrator')