# Libraries

In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import re
import numpy as np


In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup

# Checkpoint identifiers passed to `from_pretrained`, keyed by the short
# backbone name used throughout this notebook.
MODEL_NAMES = {
    "bert": 'google-bert/bert-base-uncased',
    "xlnet": 'xlnet/xlnet-base-cased',
}

# Short name of the selected backbone; train_model also uses it as the
# checkpoint filename prefix.
model = "bert"
model_name = MODEL_NAMES[model]

# Tokenizer for the selected checkpoint; shared by the dataset encoder and the
# inference helpers further down.
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


Utilities

In [3]:
from datetime import datetime

def get_current_datetime():
    """Return the current local date and time as display strings.

    Returns:
        tuple[str, str]: ``(date, time)`` formatted with ``"%B %d, %Y"`` and
        ``"%I:%M %p"`` respectively.
    """
    # Read the clock once so both strings describe the same instant; two
    # separate reads could straddle a minute (or midnight) boundary.
    now = datetime.now()
    return now.strftime("%B %d, %Y"), now.strftime("%I:%M %p")

def print_current_datetime():
    """Print the current date and time on one line as ``(<date> | <time>)``."""
    today, clock = get_current_datetime()
    print('({} | {})'.format(today, clock))
    

# Dataset

In [4]:
# Directory holding the pre-split CSV files (path is relative to the working directory).
dataset_dir = "data/train_test_val"

# One DataFrame per split; reset_index(drop=True) gives each a fresh 0..n-1 index.
raw_dataset = {
    'train': pd.read_csv(dataset_dir + '/train.csv').reset_index(drop=True), 
    'test': pd.read_csv(dataset_dir + '/test.csv').reset_index(drop=True), 
    'val': pd.read_csv(dataset_dir + '/val.csv').reset_index(drop=True), 
}

# Every training column other than 'ID' and 'Text' is treated as a label column.
LABELS = [label for label in raw_dataset['train'].keys() if label not in ['ID', 'Text']]
# Lookup tables between a label's position in LABELS and its name.
id2label = {idx:label for idx, label in enumerate(LABELS)}
label2id = {label:idx for idx, label in enumerate(LABELS)}

class CrimeDataset(Dataset):
    """Torch ``Dataset`` that tokenizes every row of a DataFrame once, up front.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (both
    flattened to 1-D) and ``labels`` (a float tensor with one entry per label
    column).

    Args:
        data: DataFrame with a ``"Text"`` column plus one column per label.
        tokenizer: Object exposing ``encode_plus`` (the notebook passes a
            Hugging Face tokenizer).
        labels: Names of the label columns, in output order.
        max_token_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, labels, max_token_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_token_len = max_token_len
        # Encoding happens eagerly here, so constructing the dataset is the
        # expensive step and __getitem__ is a plain list lookup.
        self.encoded_dataset = self.encode_dataset()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.encoded_dataset[index]

    @staticmethod
    def preprocess(text_column):
        """Clean a pandas Series of strings and return the cleaned Series.

        Declared as a static method because it takes no ``self``; it can be
        called on the class or on an instance. Nothing in this class calls it
        automatically.
        """
        # (pattern, replacement) passes, applied to each string in this order.
        passes = (
            (r'[A-Z]', lambda match: match.group(0).lower()),  # lowercase ASCII letters
            (r'http[s]?://\S+', ''),                           # links
            (r'[^\x00-\x7F]+', ''),                            # non-ASCII characters (emoji, etc.)
            (r'@\w+', ''),                                     # @usernames
            (r'[^\w\s]', ' '),                                 # punctuation -> single space
            # The punctuation pass above already turned every '#' into a
            # space, so this pass never matches; kept to preserve the
            # original sequence of steps.
            (r'#', ''),
        )
        for pattern, replacement in passes:
            text_column = text_column.apply(
                lambda value, p=pattern, r=replacement: re.sub(p, r, value)
            )
        return text_column

    def encode_dataset(self):
        """Encode every row of ``self.data`` and return the results as a list."""
        # iterrows() has no length, so pass total= explicitly to get a real
        # progress bar instead of a bare iteration counter.
        rows = tqdm(self.data.iterrows(), total=len(self.data))
        return [self.encode_data(row) for _, row in rows]

    def encode_data(self, data):
        """Tokenize one row and pair it with its label vector.

        Args:
            data: Mapping with a ``"Text"`` entry and one entry per name in
                ``self.labels``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        """
        text = data["Text"]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            # Honour the constructor argument instead of a hard-coded 128.
            max_length=self.max_token_len,
            return_token_type_ids=False,
            return_attention_mask=True
        )
        labels = [data[label] for label in self.labels]

        return {
            # flatten() collapses the tokenizer output to a 1-D tensor per sample.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(labels)
        }

class CrimeDataLoader:
    """Build the train/val/test ``CrimeDataset`` splits and hand out ``DataLoader``s for them.

    ``dataset`` is indexed with the keys ``'train'``, ``'val'`` and ``'test'``.
    Only the training loader shuffles.
    """

    def __init__(self, dataset, labels, tokenizer, batch_size=8):
        self.batch_size = batch_size
        # Splits are encoded in the order train, val, test.
        self.train_dataset, self.val_dataset, self.test_dataset = (
            CrimeDataset(dataset[split], tokenizer, labels)
            for split in ('train', 'val', 'test')
        )

    def _make_loader(self, split_dataset, shuffle):
        # Single place where the DataLoader options are spelled out.
        return DataLoader(
            split_dataset,
            batch_size=self.batch_size,
            num_workers=0,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        return self._make_loader(self.test_dataset, shuffle=False)

Preprocessor

In [5]:
def preprocess_text(text_column):
    """Clean a pandas Series of strings and return the cleaned Series.

    NOTE(review): a later cell defines another function with this same name
    (a tokenizer wrapper); once that cell runs, this definition is replaced.
    """
    # (pattern, replacement) passes, applied to each string in this order.
    passes = (
        (r'[A-Z]', lambda match: match.group(0).lower()),  # lowercase ASCII letters
        (r'http[s]?://\S+', ''),                           # links
        (r'[^\x00-\x7F]+', ''),                            # non-ASCII characters (emoji, etc.)
        (r'@\w+', ''),                                     # @usernames
        (r'[^\w\s]', ' '),                                 # punctuation -> single space
        # '#' was already replaced by the punctuation pass, so this one
        # never matches.
        (r'#', ''),
    )

    def clean(text):
        for pattern, replacement in passes:
            text = re.sub(pattern, replacement, text)
        return text

    return text_column.apply(clean)

DataLoader

In [6]:
# Tokenizes all three splits right here: CrimeDataset encodes every row in its
# constructor. No batch_size is passed, so CrimeDataLoader's default (8) applies.
dataloader = CrimeDataLoader(dataset=raw_dataset, tokenizer=tokenizer, labels=LABELS)

def split_dataloader(dataloader):
    train_dataloader = dataloader.train_dataloader()
    val_dataloader = dataloader.val_dataloader()
    test_dataloader = dataloader.test_dataloader()

    return train_dataloader, val_dataloader, test_dataloader

# Notebook-level loaders; train_model and the class-weight computation read these names directly.
train_dataloader, val_dataloader, test_dataloader = split_dataloader(dataloader)

0it [00:00, ?it/s]

2400it [00:01, 1774.77it/s]
400it [00:00, 2084.38it/s]
1200it [00:00, 2023.14it/s]


Class Weights
- dahil imbalanced ang dataset, gagamitin natin eto para ibalance ang loss

In [7]:
def get_class_weights(num_labels, dataloader):
    """Compute inverse-frequency weights per label, normalised to sum to 1.

    Args:
        num_labels: Number of label columns.
        dataloader: Iterable of batches; each batch is a mapping whose
            ``'labels'`` entry is a tensor summed over ``dim=0`` to get
            per-label positive counts.

    Returns:
        1-D float tensor of length ``num_labels``. Labels with fewer positives
        get larger weights.
    """
    class_counts = torch.zeros(num_labels)
    for batch in dataloader:
        class_counts += batch['labels'].sum(dim=0)

    # Total number of positive label assignments across all batches.
    total_positives = class_counts.sum()

    if total_positives == 0:
        # No positives at all (e.g. an empty loader): there is no frequency
        # information, so fall back to uniform weights instead of NaNs.
        class_weights = torch.full((num_labels,), 1.0 / max(num_labels, 1))
    else:
        # A label with zero positives would divide by zero, producing inf and
        # then NaN after normalisation; treat it as if it had one positive.
        safe_counts = class_counts.clamp(min=1.0)
        class_weights = total_positives / (num_labels * safe_counts)

        # Normalise so the weights sum to 1.
        class_weights /= class_weights.sum()

    print("Class Weights:", class_weights)

    return class_weights

# Weights are derived from the training split only; they feed the loss
# function built in the classifier setup cell.
num_labels = len(LABELS)
class_weights = get_class_weights(num_labels, train_dataloader)

Class Weights: tensor([0.1265, 0.1223, 0.1059, 0.1240, 0.1131, 0.1377, 0.1254, 0.1451])


# Model


In [8]:
class BERTCrimeClassifier(nn.Module):
    """Multi-label classification head on top of a pretrained encoder.

    The head takes the hidden state at sequence position 0, passes it through
    a linear layer, dropout, and a final linear layer that emits one logit per
    label.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        batch_size: Unused; kept so existing positional calls keep working.
            Callers in this notebook pass ``len(LABELS)`` in this slot, and it
            is ignored.
        epochs: Unused; kept for backward compatibility.
        dropout: Dropout probability applied between the two linear layers.
        num_labels: Width of the output layer. Defaults to ``len(LABELS)``
            (the notebook-level label list) when omitted.
    """

    def __init__(self, model_name, batch_size=8, epochs=5, dropout=0.1, num_labels=None):
        super().__init__()
        if num_labels is None:
            # Backward-compatible default: size the output from the global label list.
            num_labels = len(LABELS)

        self.model = AutoModel.from_pretrained(model_name)
        hidden_size = self.model.config.hidden_size
        self.hidden_linear = nn.Linear(hidden_size, hidden_size)  # additional dense layer
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, num_labels)

    def forward(self, ids, mask):
        """Return raw logits of shape ``(batch, num_labels)`` for the given token ids and attention mask."""
        encoder_outputs = self.model(ids, attention_mask=mask)
        # Use the hidden state at position 0 as the sequence representation.
        cls_hidden_state = encoder_outputs.last_hidden_state[:, 0, :]
        # NOTE(review): there is no non-linearity between hidden_linear and
        # linear; adding one would change the behaviour of saved checkpoints.
        hidden_output = self.hidden_linear(cls_hidden_state)
        dropped_out = self.dropout(hidden_output)
        logits = self.linear(dropped_out)
        # Unnormalised scores; apply a sigmoid to turn them into per-label probabilities.
        return logits


Setup the classifier 

In [10]:
# Training hyperparameters.
# NOTE(review): the DataLoaders were already built with CrimeDataLoader's own
# default batch size; this `batch_size` does not reconfigure them.
batch_size = 8
epochs = 5
learning_rate = 2e-5
weight_decay = 0.002
dropout = 0.5

# Multi-label loss on raw logits, rescaled per label by the class weights.
criterion = nn.BCEWithLogitsLoss(reduction='mean', weight=class_weights)

def loss_fn(outputs, targets):
    """Return the class-weighted BCE-with-logits loss for a batch."""
    return criterion(outputs, targets)


# The second positional argument binds to the constructor's `batch_size`
# parameter, which the class does not use.
bertCrimeClassifier = BERTCrimeClassifier(model_name, len(LABELS), dropout=dropout)

optimizer = torch.optim.Adam(bertCrimeClassifier.parameters(), lr=learning_rate, weight_decay=weight_decay)

# The cosine schedule must span the whole run (batches per epoch * epochs);
# sizing it for a single epoch would exhaust the decay after the first epoch.
# len(train_dataloader) is the real batch count, so no manual rounding is needed.
# The scheduler only takes effect if train_model calls scheduler.step().
scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

In [11]:
def save_model(model):
    """Write ``model.state_dict()`` to ``../saved_models/<date>/bert-<timestamp>.pth``.

    The date folder is created if it does not exist. Returns ``None``.
    """
    # Read the clock once so the folder name and the file name always refer
    # to the same day, even when called right at midnight.
    now = datetime.now()
    timestamp = now.strftime("(%m-%d-%Y)_(%I-%M-%S-%p)")
    save_dir = os.path.join("../saved_models", now.strftime("%B %d, %Y"))
    os.makedirs(save_dir, exist_ok=True)
    model_save_path = os.path.join(save_dir, f"bert-{timestamp}.pth")
    torch.save(model.state_dict(), model_save_path)


# NOTE(review): in top-to-bottom order this runs before train_model is called,
# so the checkpoint written here holds the classifier as constructed — confirm
# that this snapshot is intended.
save_model(bertCrimeClassifier)


# Metrics

- dito ay kukunin natin ang confusion matrix sa bawat labels para makuha ang TP, TN, FP, FN
- gagamitin din natin to para makuha ang most frequent errors
- ginamit yung formula ng precision, recall, f-measure para makuha prf per label
- xor method to calculate hamming loss

In [11]:
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

def calculate_metrics(confusion_matrix):
    """Derive precision, recall and F1 from one 2x2 confusion matrix.

    The matrix is indexed as ``[actual, predicted]``: ``[1, 1]`` is TP,
    ``[0, 1]`` is FP and ``[1, 0]`` is FN. Any ratio whose denominator is not
    positive is reported as 0.

    Returns:
        dict with float values under ``"Precision"``, ``"Recall"`` and ``"F1-Score"``.
    """
    def ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0.
        return numerator / denominator if denominator > 0 else 0

    true_pos = confusion_matrix[1, 1]
    false_pos = confusion_matrix[0, 1]
    false_neg = confusion_matrix[1, 0]

    precision = ratio(true_pos, true_pos + false_pos)
    recall = ratio(true_pos, true_pos + false_neg)
    f1_score = ratio(2 * precision * recall, precision + recall)

    return {
        "Precision": float(precision),
        "Recall": float(recall),
        "F1-Score": float(f1_score)
    }

def hamming_loss(y_true, y_pred):
    """Return the fraction of (sample, label) slots where ``y_true`` and ``y_pred`` disagree.

    Both inputs are indexed as 2-D arrays; the denominator is
    ``y_true.shape[0] * y_true.shape[1]``.
    """
    # XOR is true exactly where one side is set and the other is not.
    mismatch_count = np.logical_xor(y_true, y_pred).sum()
    n_samples, n_labels = y_true.shape[0], y_true.shape[1]
    return mismatch_count / (n_samples * n_labels)

def multilabel_metrics(predictions, labels, mode="validation", threshold=0.5):
    """Print and return per-label precision/recall/F1 plus the Hamming loss.

    Args:
        predictions: Raw logits; a sigmoid is applied before thresholding.
        labels: Ground-truth array; entries equal to 1 count as positive.
        mode: Text shown in the Hamming-loss line of the printed report.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict mapping each name in the notebook-level ``LABELS`` to its metrics
        dict, plus a ``'hamming_loss'`` entry.
    """
    probabilities = torch.sigmoid(torch.Tensor(predictions))

    # Binarise predictions and ground truth into float 0/1 arrays.
    y_pred = np.where(probabilities >= threshold, 1.0, 0.0)
    y_true = np.where(labels == 1, 1.0, 0.0)

    confusion_matrix = multilabel_confusion_matrix(y_true, y_pred)

    print("Confusion Matrix")
    print(confusion_matrix)

    label_metrics = {}
    # (printed title, key in the metrics dict)
    report_rows = (("Precision", "Precision"), ("Recall", "Recall"), ("F-Measure", "F1-Score"))

    for position in range(len(LABELS)):
        class_name = LABELS[position]
        metrics = calculate_metrics(confusion_matrix[position])
        label_metrics[class_name] = metrics

        print(f"{class_name}")
        for title, key in report_rows:
            print(f"    {title}: {metrics[key] * 100:.2f}%")

    label_metrics['hamming_loss'] = hamming_loss(y_true, y_pred)
    print(f"\nHamming Loss ({mode}): {label_metrics['hamming_loss']}")

    return label_metrics



# Training

### Hyperparameters

Ang purpose ng loss function ay para icalculate ang difference between models' prediction and true labels/targets

Ang loss ay something na gusto nating mabawasan sa pamamagitan ng pagtetrain ng model, ito ay ibabackpropagate natin where icacalculate ang gradient of loss ng bawat model's parameters

Optimizer naman ang gagamitin natin to actually update the model's parameters 

In [12]:
def eval_model(model, val_dataloader, mode="validation"):
    """Run ``model`` over a loader without gradients and report its metrics.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning logits.
        val_dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        mode: Label for the run. The loss is only accumulated when this is
            ``"validation"``; for any other value the returned loss is 0.0.

    Returns:
        tuple ``(average_loss, metrics)`` where ``metrics`` is the dict
        produced by ``multilabel_metrics``.
    """
    model.eval()

    val_loss = 0.0
    predictions = []
    labels_list = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader):

            val_input_ids = batch['input_ids']
            val_attention_mask = batch['attention_mask']
            val_labels = batch['labels']

            outputs = model(val_input_ids, val_attention_mask)

            if mode == "validation":
                loss = criterion(outputs, val_labels)
                val_loss += loss.item()

            predictions.append(outputs.cpu().numpy())
            labels_list.append(val_labels.cpu().numpy())

    # Stack the per-batch arrays into one (samples, labels) array each.
    predictions = np.vstack(predictions)
    labels = np.vstack(labels_list)
    # Forward `mode` so the printed report names the split actually evaluated
    # instead of always saying "validation".
    metrics = multilabel_metrics(predictions=predictions, labels=labels, mode=mode, threshold=0.5)

    average_val_loss = val_loss / len(val_dataloader)
    return average_val_loss, metrics

In [13]:
"""

Training Loop

1. Forward pass
2. Calculate Loss
3. Backpropagation 
4. Update model parameters (gradient descent)

"""

def train_model(classifier, epochs=5, max_batches=None, save_file_name="trained_model", save_model=True, log_interval = 8):
    """Fine-tune ``classifier`` with early stopping on the validation loss.

    Relies on notebook-level names: ``train_dataloader``, ``val_dataloader``,
    ``optimizer``, ``loss_fn``, ``eval_model``, ``print_current_datetime`` and
    ``model`` (the backbone's short name, used as the checkpoint prefix).

    Args:
        classifier: Module called as ``classifier(input_ids, attention_mask)``.
        epochs: Maximum number of epochs to run.
        max_batches: If not ``None``, stop each epoch after this many training batches.
        save_file_name: Unused; kept so existing calls keep working.
        save_model: When true, write the state dict under
            ``../saved_models/<date>/`` each time the validation loss improves.
        log_interval: Print the batch loss every this many batches
            (0 or ``None`` disables per-batch logging).

    Returns:
        The same ``classifier`` object, holding the weights of the last epoch
        that ran (not necessarily the best checkpoint).
    """
    early_stop_patience = 2
    best_val_loss = float('inf')
    no_improvement_count = 0

    # Per-epoch average losses, kept for the (currently disabled) plotting code.
    train_losses = []
    val_losses = []

    print_current_datetime()

    for epoch_idx in tqdm(range(1, epochs+1), desc=f"Epoch"):

        print(f"Running epoch {epoch_idx}")
        print_current_datetime()

        classifier.train()
        train_loss = 0.0
        batches_seen = 0

        for batch_idx, batch in enumerate(train_dataloader):

            if max_batches is not None and batch_idx >= max_batches:
                break

            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            # forward pass
            outputs = classifier(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            # log loss every n batches (guarded so log_interval=0 cannot divide by zero)
            if log_interval and batch_idx % log_interval == 0:
                print(f"Epoch {epoch_idx}, Batch {batch_idx}, Loss: {loss.item()}")

            # backpropagation
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            nn.utils.clip_grad_norm_(classifier.parameters(), max_norm=1.0)

            # gradient descent (update the model's parameters)
            optimizer.step()

            # learning-rate schedule is currently disabled
            # scheduler.step()

            train_loss += loss.item()
            batches_seen += 1

        # Average over the batches actually processed; dividing by the full
        # loader length under-reports the loss whenever max_batches cuts the
        # epoch short.
        average_train_loss = train_loss / max(batches_seen, 1)

        print(f"Epoch {epoch_idx}, Average Loss: {average_train_loss}")
        print_current_datetime()

        # Evaluate on the validation set to see whether the model is
        # improving; the loss is expected to go down from epoch to epoch.
        val_loss, val_metrics = eval_model(classifier, val_dataloader)
        print(f"Validation Loss: {val_loss}")
        print("Validation Metrics:", val_metrics)

        # Track the averages so the two curves are on the same scale.
        train_losses.append(average_train_loss)
        val_losses.append(val_loss)
        # plot_losses(train_losses, val_losses)

        # # Graph metrics so far for each epoch
        # plot_losses(epoch=epoch_list, metrics=val_metrics_history)

        # Lower validation loss is better, so that is the checkpoint worth keeping.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvement_count = 0

            if save_model:
                # One clock read keeps the folder and file names on the same day.
                now = datetime.now()
                timestamp = now.strftime("(%m-%d-%Y)_(%I-%M-%S-%p)")
                save_dir = os.path.join("../saved_models", now.strftime("%B %d, %Y"))
                os.makedirs(save_dir, exist_ok=True)
                model_save_path = os.path.join(save_dir, f"{model}-{timestamp}.pth")
                torch.save(classifier.state_dict(), model_save_path)
                print("Saved the best model!")
            else:
                # Do not claim a save that did not happen.
                print("New best validation loss (checkpoint saving is disabled).")
        else:
            no_improvement_count += 1

        # Early stopping: quit once the validation loss has failed to improve
        # for `early_stop_patience` consecutive epochs.
        if no_improvement_count >= early_stop_patience:
            print(f"No improvement for {early_stop_patience} epochs. Stopping early.")
            break

    print("Training complete. Happy hacking I guess...")
    print("Model is ready for evaluation")

    return classifier


### Start Training

default values: 
    
    - max_batches = None
    - epochs = 5

In [14]:
# NOTE: `trained_model` is only bound when this call returns. If the cell is
# interrupted, the name stays undefined even though `bertCrimeClassifier`
# itself has been updated in place; get_predictions (defined later) reads it.
trained_model = train_model(bertCrimeClassifier,  max_batches=None,  epochs=epochs, save_model=True)

(September 20, 2024 | 09:57 PM)


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running epoch 1
(September 20, 2024 | 09:57 PM)
Epoch 1, Batch 0, Loss: 0.09093575179576874
Epoch 1, Batch 8, Loss: 0.07642907649278641
Epoch 1, Batch 16, Loss: 0.06257613003253937
Epoch 1, Batch 24, Loss: 0.05896478146314621
Epoch 1, Batch 32, Loss: 0.05170154571533203
Epoch 1, Batch 40, Loss: 0.04871226102113724
Epoch 1, Batch 48, Loss: 0.04918375983834267
Epoch 1, Batch 56, Loss: 0.04850951209664345
Epoch 1, Batch 64, Loss: 0.05225539952516556
Epoch 1, Batch 72, Loss: 0.04783044755458832
Epoch 1, Batch 80, Loss: 0.048366840928792953
Epoch 1, Batch 88, Loss: 0.05573730170726776
Epoch 1, Batch 96, Loss: 0.050438471138477325
Epoch 1, Batch 104, Loss: 0.05357811227440834
Epoch 1, Batch 112, Loss: 0.04747605323791504
Epoch 1, Batch 120, Loss: 0.046980734914541245
Epoch 1, Batch 128, Loss: 0.044442009180784225
Epoch 1, Batch 136, Loss: 0.047932218760252
Epoch 1, Batch 144, Loss: 0.04197634756565094
Epoch 1, Batch 152, Loss: 0.045218709856271744
Epoch 1, Batch 160, Loss: 0.0484796538949012

100%|██████████| 50/50 [00:58<00:00,  1.18s/it]


Confusion Matrix
[[[323   0]
  [ 41  36]]

 [[335   0]
  [ 62   3]]

 [[320   0]
  [ 78   2]]

 [[320  12]
  [ 20  48]]

 [[335   0]
  [  9  56]]

 [[374   0]
  [  6  20]]

 [[345   0]
  [ 15  40]]

 [[353   0]
  [ 35  12]]]
Murder
    Precision: 100.00%
    Recall: 46.75%
    F-Measure: 63.72%
Homicide
    Precision: 100.00%
    Recall: 4.62%
    F-Measure: 8.82%
Robbery
    Precision: 100.00%
    Recall: 2.50%
    F-Measure: 4.88%
Physical Injuries
    Precision: 80.00%
    Recall: 70.59%
    F-Measure: 75.00%
Rape
    Precision: 100.00%
    Recall: 86.15%
    F-Measure: 92.56%
Theft
    Precision: 100.00%
    Recall: 76.92%
    F-Measure: 86.96%
Carnapping
    Precision: 100.00%
    Recall: 72.73%
    F-Measure: 84.21%
Others
    Precision: 100.00%
    Recall: 25.53%
    F-Measure: 40.68%

Hamming Loss (validation): 0.086875
Validation Loss: 0.029192046001553534
Validation Metrics: {'Murder': {'Precision': 1.0, 'Recall': 0.4675324675324675, 'F1-Score': 0.6371681415929203}, 'Homicide

Epoch:  20%|██        | 1/5 [21:15<1:25:00, 1275.15s/it]

Saved the best model!
Running epoch 2
(September 20, 2024 | 10:19 PM)
Epoch 2, Batch 0, Loss: 0.029317747801542282
Epoch 2, Batch 8, Loss: 0.02801457978785038
Epoch 2, Batch 16, Loss: 0.026511237025260925
Epoch 2, Batch 24, Loss: 0.02892397716641426
Epoch 2, Batch 32, Loss: 0.030833449214696884
Epoch 2, Batch 40, Loss: 0.023556506261229515
Epoch 2, Batch 48, Loss: 0.027276290580630302
Epoch 2, Batch 56, Loss: 0.02932162582874298
Epoch 2, Batch 64, Loss: 0.023438019677996635
Epoch 2, Batch 72, Loss: 0.022251086309552193
Epoch 2, Batch 80, Loss: 0.02277091145515442
Epoch 2, Batch 88, Loss: 0.022817393764853477
Epoch 2, Batch 96, Loss: 0.022889884188771248
Epoch 2, Batch 104, Loss: 0.026359912008047104
Epoch 2, Batch 112, Loss: 0.023952387273311615
Epoch 2, Batch 120, Loss: 0.023720184341073036
Epoch 2, Batch 128, Loss: 0.02445327863097191
Epoch 2, Batch 136, Loss: 0.018879523500800133
Epoch 2, Batch 144, Loss: 0.02001408115029335
Epoch 2, Batch 152, Loss: 0.02473750151693821
Epoch 2, Bat

100%|██████████| 50/50 [00:57<00:00,  1.15s/it]


Confusion Matrix
[[[315   8]
  [ 10  67]]

 [[330   5]
  [ 23  42]]

 [[317   3]
  [  8  72]]

 [[320  12]
  [  9  59]]

 [[333   2]
  [  0  65]]

 [[374   0]
  [  1  25]]

 [[343   2]
  [  0  55]]

 [[353   0]
  [ 19  28]]]
Murder
    Precision: 89.33%
    Recall: 87.01%
    F-Measure: 88.16%
Homicide
    Precision: 89.36%
    Recall: 64.62%
    F-Measure: 75.00%
Robbery
    Precision: 96.00%
    Recall: 90.00%
    F-Measure: 92.90%
Physical Injuries
    Precision: 83.10%
    Recall: 86.76%
    F-Measure: 84.89%
Rape
    Precision: 97.01%
    Recall: 100.00%
    F-Measure: 98.48%
Theft
    Precision: 100.00%
    Recall: 96.15%
    F-Measure: 98.04%
Carnapping
    Precision: 96.49%
    Recall: 100.00%
    F-Measure: 98.21%
Others
    Precision: 100.00%
    Recall: 59.57%
    F-Measure: 74.67%

Hamming Loss (validation): 0.031875
Validation Loss: 0.016428285408765078
Validation Metrics: {'Murder': {'Precision': 0.8933333333333333, 'Recall': 0.8701298701298701, 'F1-Score': 0.881578947368

Epoch:  40%|████      | 2/5 [42:34<1:03:53, 1277.77s/it]

Saved the best model!
Running epoch 3
(September 20, 2024 | 10:40 PM)
Epoch 3, Batch 0, Loss: 0.016523104161024094
Epoch 3, Batch 8, Loss: 0.015450440347194672
Epoch 3, Batch 16, Loss: 0.017292708158493042
Epoch 3, Batch 24, Loss: 0.014772729948163033
Epoch 3, Batch 32, Loss: 0.01789373718202114
Epoch 3, Batch 40, Loss: 0.018188785761594772
Epoch 3, Batch 48, Loss: 0.017925359308719635
Epoch 3, Batch 56, Loss: 0.01175830326974392
Epoch 3, Batch 64, Loss: 0.015512541867792606
Epoch 3, Batch 72, Loss: 0.014487409964203835
Epoch 3, Batch 80, Loss: 0.012243852019309998
Epoch 3, Batch 88, Loss: 0.014422273263335228
Epoch 3, Batch 96, Loss: 0.012878306210041046
Epoch 3, Batch 104, Loss: 0.01354643702507019
Epoch 3, Batch 112, Loss: 0.011391397565603256
Epoch 3, Batch 120, Loss: 0.015831267461180687
Epoch 3, Batch 128, Loss: 0.01764463633298874
Epoch 3, Batch 136, Loss: 0.011627408675849438
Epoch 3, Batch 144, Loss: 0.010199345648288727
Epoch 3, Batch 152, Loss: 0.01077733002603054
Epoch 3, B

100%|██████████| 50/50 [00:59<00:00,  1.18s/it]


Confusion Matrix
[[[323   0]
  [ 13  64]]

 [[335   0]
  [ 13  52]]

 [[320   0]
  [ 14  66]]

 [[327   5]
  [  5  63]]

 [[335   0]
  [  1  64]]

 [[372   2]
  [  1  25]]

 [[345   0]
  [  0  55]]

 [[341  12]
  [  1  46]]]
Murder
    Precision: 100.00%
    Recall: 83.12%
    F-Measure: 90.78%
Homicide
    Precision: 100.00%
    Recall: 80.00%
    F-Measure: 88.89%
Robbery
    Precision: 100.00%
    Recall: 82.50%
    F-Measure: 90.41%
Physical Injuries
    Precision: 92.65%
    Recall: 92.65%
    F-Measure: 92.65%
Rape
    Precision: 100.00%
    Recall: 98.46%
    F-Measure: 99.22%
Theft
    Precision: 92.59%
    Recall: 96.15%
    F-Measure: 94.34%
Carnapping
    Precision: 100.00%
    Recall: 100.00%
    F-Measure: 100.00%
Others
    Precision: 79.31%
    Recall: 97.87%
    F-Measure: 87.62%

Hamming Loss (validation): 0.0209375
Validation Loss: 0.012052114484831691
Validation Metrics: {'Murder': {'Precision': 1.0, 'Recall': 0.8311688311688312, 'F1-Score': 0.9078014184397163}, 'Hom

Epoch:  60%|██████    | 3/5 [1:03:38<42:22, 1271.30s/it]

Saved the best model!
Running epoch 4
(September 20, 2024 | 11:01 PM)
Epoch 4, Batch 0, Loss: 0.008649663999676704
Epoch 4, Batch 8, Loss: 0.012195988558232784
Epoch 4, Batch 16, Loss: 0.007515156641602516
Epoch 4, Batch 24, Loss: 0.012318624183535576
Epoch 4, Batch 32, Loss: 0.00918235071003437
Epoch 4, Batch 40, Loss: 0.012081257067620754
Epoch 4, Batch 48, Loss: 0.012488816864788532
Epoch 4, Batch 56, Loss: 0.009356142021715641
Epoch 4, Batch 64, Loss: 0.007511859759688377
Epoch 4, Batch 72, Loss: 0.011947203427553177
Epoch 4, Batch 80, Loss: 0.010180718265473843
Epoch 4, Batch 88, Loss: 0.00804083701223135
Epoch 4, Batch 96, Loss: 0.013819226063787937
Epoch 4, Batch 104, Loss: 0.011212479323148727
Epoch 4, Batch 112, Loss: 0.009775036945939064
Epoch 4, Batch 120, Loss: 0.009394246153533459
Epoch 4, Batch 128, Loss: 0.0088485823944211
Epoch 4, Batch 136, Loss: 0.009089414030313492
Epoch 4, Batch 144, Loss: 0.00809035450220108
Epoch 4, Batch 152, Loss: 0.00888030230998993
Epoch 4, Ba

100%|██████████| 50/50 [01:02<00:00,  1.25s/it]


Confusion Matrix
[[[323   0]
  [  4  73]]

 [[334   1]
  [  1  64]]

 [[314   6]
  [  3  77]]

 [[331   1]
  [  5  63]]

 [[334   1]
  [  2  63]]

 [[373   1]
  [  1  25]]

 [[345   0]
  [  0  55]]

 [[353   0]
  [  9  38]]]
Murder
    Precision: 100.00%
    Recall: 94.81%
    F-Measure: 97.33%
Homicide
    Precision: 98.46%
    Recall: 98.46%
    F-Measure: 98.46%
Robbery
    Precision: 92.77%
    Recall: 96.25%
    F-Measure: 94.48%
Physical Injuries
    Precision: 98.44%
    Recall: 92.65%
    F-Measure: 95.45%
Rape
    Precision: 98.44%
    Recall: 96.92%
    F-Measure: 97.67%
Theft
    Precision: 96.15%
    Recall: 96.15%
    F-Measure: 96.15%
Carnapping
    Precision: 100.00%
    Recall: 100.00%
    F-Measure: 100.00%
Others
    Precision: 100.00%
    Recall: 80.85%
    F-Measure: 89.41%

Hamming Loss (validation): 0.0109375
Validation Loss: 0.00849377965554595
Validation Metrics: {'Murder': {'Precision': 1.0, 'Recall': 0.948051948051948, 'F1-Score': 0.9733333333333333}, 'Homicid

Epoch:  80%|████████  | 4/5 [1:24:44<21:09, 1269.14s/it]

Saved the best model!
Running epoch 5
(September 20, 2024 | 11:22 PM)
Epoch 5, Batch 0, Loss: 0.006825264543294907
Epoch 5, Batch 8, Loss: 0.01198379322886467
Epoch 5, Batch 16, Loss: 0.007889593951404095
Epoch 5, Batch 24, Loss: 0.008041823282837868
Epoch 5, Batch 32, Loss: 0.006579299923032522
Epoch 5, Batch 40, Loss: 0.00706342700868845
Epoch 5, Batch 48, Loss: 0.007917888462543488
Epoch 5, Batch 56, Loss: 0.007429757155478001
Epoch 5, Batch 64, Loss: 0.01062520407140255
Epoch 5, Batch 72, Loss: 0.012577639892697334
Epoch 5, Batch 80, Loss: 0.010702930390834808
Epoch 5, Batch 88, Loss: 0.006681032478809357
Epoch 5, Batch 96, Loss: 0.012507770210504532
Epoch 5, Batch 104, Loss: 0.010343157686293125
Epoch 5, Batch 112, Loss: 0.006243003532290459
Epoch 5, Batch 120, Loss: 0.006791519466787577
Epoch 5, Batch 128, Loss: 0.007858894765377045
Epoch 5, Batch 136, Loss: 0.011265676468610764
Epoch 5, Batch 144, Loss: 0.0122928312048316
Epoch 5, Batch 152, Loss: 0.016909189522266388


Epoch:  80%|████████  | 4/5 [1:35:17<23:49, 1429.41s/it]


KeyboardInterrupt: 

: 

In [None]:
# current_datetime = datetime.now().strftime("(%m-%d-%Y)_(%I-%M-%S-%p)")
# current_date = datetime.now().strftime("%B %d, %Y")
# save_dir = os.path.join("../saved_models", current_date)
# os.makedirs(save_dir, exist_ok=True)
# file_name = f"xlnet-{current_datetime}.pth"
# model_save_path = os.path.join(save_dir, file_name)
# torch.save(crimeclassifier.state_dict(), model_save_path)

# Custom loaded model

In [None]:
# Rebuild the architecture, then restore saved weights from disk.
# NOTE(review): the checkpoint name starts with "xlnet-" while the model is
# built from `model_name`, and the path is under ./models rather than the
# ../saved_models folder used when saving — confirm this is the intended file.
bertCrimeClassifier = BERTCrimeClassifier(model_name, len(LABELS))
# weights_only=True restricts unpickling to tensor data (a state dict needs
# nothing else); map_location="cpu" lets the file load on a machine without
# the device it was saved from.
bertCrimeClassifier.load_state_dict(torch.load('./models/September 18, 2024/xlnet-(09-18-2024)_(01-54-02-PM).pth', map_location="cpu", weights_only=True))

  crimeclassifier.load_state_dict(torch.load('./models/September 18, 2024/xlnet-(09-18-2024)_(01-54-02-PM).pth'))


<All keys matched successfully>

# Inference

In [None]:
# Fresh classifier instance that the next cell fills with saved weights.
loaded_model = BERTCrimeClassifier(model_name, len(LABELS))
# Free-form sample narrative for manual experiments.
# NOTE(review): the same passage appears twice back to back inside this string — confirm that is intended.
test_sentence = "as the accused was dragged into the middle of the village the people start to circle around him then without warning someone launch an attack they rush forward grabbing the accused by the hair and slamming him into the ground the attack was relentless with punches and kicks coming from every direction the man tried to crawl away but he was surrounded there was nowhere for him to go he just lay there trying to protect himself from the attack while the crowd cheer on the violence it was horrifying to witnessas the accused was dragged into the middle of the village the people start to circle around him then without warning someone launch an attack they rush forward grabbing the accused by the hair and slamming him into the ground the attack was relentless with punches and kicks coming from every direction the man tried to crawl away but he was surrounded there was nowhere for him to go he just lay there trying to protect himself from the attack while the crowd cheer on the violence it was horrifying to witness"


In [None]:
# Restore saved weights into `loaded_model`.
# weights_only=True restricts unpickling to tensor data; map_location="cpu"
# lets the file load on a machine without the device it was saved from.
# NOTE(review): the file name starts with "xlnet-" although the model is built
# from `model_name` — confirm the checkpoint matches the architecture.
loaded_model.load_state_dict(torch.load('./models/September 18, 2024/xlnet-(09-18-2024)_(02-48-13-PM).pth', map_location="cpu", weights_only=True))
#'../saved_models/Trained_Model-(11-16-2023)_(05-01-09-PM).pth'



  loaded_model.load_state_dict(torch.load('./models/September 18, 2024/xlnet-(09-18-2024)_(02-48-13-PM).pth'))


<All keys matched successfully>

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer`` and return the encoding.

    The encoding is padded/truncated to 128 tokens and requested as PyTorch
    tensors. NOTE: this redefines ``preprocess_text``; an earlier cell defines
    a Series-cleaning function under the same name, which this one replaces.
    """
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )

# get actual labels
def get_actual_labels(index=0):
    """Print test example ``index`` together with its ground-truth labels.

    Reads from the notebook-level ``raw_dataset["test"]`` and ``LABELS``.

    Args:
        index: Row label of the example in the test split.

    Returns:
        The example's text (unchanged from the original contract, which
        ``inference`` relies on).
    """
    test_frame = raw_dataset["test"]

    text = test_frame["Text"][index]
    print(text)

    # Labels whose column holds 1 for this row. These were previously
    # collected and then discarded; print them so predictions can be compared
    # against the ground truth.
    labels = [label for label in LABELS if test_frame[label][index] == 1]
    print("Actual labels:", labels)

    return text


def inference(model, test_dataset, index=0):
    """Print the labels ``model`` predicts for test example ``index``.

    The sentence is fetched through ``get_actual_labels(index)``; the
    ``test_dataset`` argument is accepted but not read. Labels whose sigmoid
    probability is at least 0.5 are printed as ``(label, "xx.xx%")`` pairs.
    Returns ``None``.
    """
    sentence = get_actual_labels(index)
    encoded = preprocess_text(sentence)

    model.eval()
    with torch.no_grad():
        logits = model(ids=encoded['input_ids'], mask=encoded['attention_mask'])

    cutoff = 0.5
    scores = logits.flatten().sigmoid()

    predicted_labels = []
    for label, score in zip(LABELS, scores):
        if score >= cutoff:
            predicted_labels.append((label, f"{score*100:.2f}%"))

    print("Predictions: ")
    print(predicted_labels)

# The second argument is not read by `inference`; the sentence is selected by `index` (default 0).
inference(loaded_model, raw_dataset["test"]["Text"][0])

i had just stepped out of my house when i saw a car speeding down the street it swerved and hit a pedestrian i think the driver didn't see him it was a mistake after the crash the driver got out and saw a woman walking nearby he started harassing her grabbing at her clothes she looked terrified
Predictions: 
[('Homicide', '68.31%')]


In [None]:
def get_predictions(test_sentence, model=None):
    """Score one sentence against every label and print a short report.

    Args:
        test_sentence: Raw text to classify.
        model: Classifier to use. Defaults to the notebook-level
            ``trained_model`` when omitted.

    Returns:
        list of ``{"name": label, "probability": "xx.xx%"}`` dicts, sorted by
        probability in descending order.
    """
    if model is None:
        model = trained_model

    encoded_test_sentence = preprocess_text(test_sentence)

    # Switch to eval mode so dropout is disabled; otherwise the same sentence
    # can receive different probabilities on every call.
    model.eval()
    with torch.no_grad():
        logits = model(ids=encoded_test_sentence['input_ids'], mask=encoded_test_sentence['attention_mask'])

    # Sigmoid turns the raw logits into independent per-label probabilities.
    predictions = logits.flatten().sigmoid().tolist()

    # Rank on the numeric probability itself rather than on the rounded,
    # formatted string.
    ranked = sorted(zip(LABELS, predictions), key=lambda pair: pair[1], reverse=True)
    label_probabilities = [
        {"name": label, "probability": f"{prob * 100:.2f}%"} for label, prob in ranked
    ]

    threshold = 0.5

    # Labels at or above the threshold, in LABELS order.
    predicted_labels = [(label, f"{pred*100:.2f}%") for label, pred in zip(LABELS, predictions) if pred >= threshold]
    print("Input:", test_sentence)
    print("Probabilities: ", label_probabilities)

    print("Labels:")
    for label, probability in predicted_labels:
        print(f"({label}, {probability})")

    return label_probabilities

# Testing

In [None]:
# Alias for the held-out test split (the same DataFrame object, not a copy);
# the bare name on the last line displays it.
test_dataset = raw_dataset['test']
test_dataset

Unnamed: 0,ID,Text,Murder,Homicide,Robbery,Physical Injuries,Rape,Theft,Carnapping,Others
0,977,i had just stepped out of my house when i saw ...,0,1,0,0,1,0,0,0
1,2017,she was staying at a friends house when the fr...,0,0,0,0,1,0,0,0
2,45,"in mexico 2010, my dad shot and killed a man w...",1,0,0,0,0,0,0,0
3,998,he told me he didn’t mean for it to go that fa...,0,1,0,0,1,0,0,0
4,3285,at bondi beach i parked my car on a side stree...,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1195,1323,"as vendors packed up for the day, i heard a sh...",0,0,1,0,0,0,0,0
1196,3237,during a long conference I take a break to loo...,0,0,0,0,0,0,1,0
1197,3516,i was almost kidnapped in jamaica i was six ye...,0,0,0,0,0,0,0,1
1198,1883,i was about to go to bed when i heard a loud n...,0,0,1,1,0,0,0,0


In [None]:
def encode_data(data):
    """Tokenize one dataset row and attach its label vector.

    Uses the notebook-level ``tokenizer`` and ``LABELS``. The tokenizer output
    is returned as-is (not flattened), so it can be passed straight to the
    model for a single example.

    Args:
        data: Mapping with a ``"Text"`` entry and one entry per name in ``LABELS``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` (float tensor).
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=128,
        return_token_type_ids=False,
        return_attention_mask=True,
    )
    encoding = tokenizer.encode_plus(data["Text"], **tokenizer_options)

    # One target value per label column, in LABELS order.
    label_values = []
    for label in LABELS:
        label_values.append(data[label])

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': torch.FloatTensor(label_values),
    }

In [None]:
def get_cm_eval(y_pred, y_true): # TN, FP, FN, TP
    """Tag each position of two 0/1 sequences as "TN", "FP", "FN" or "TP".

    Positions are compared index by index. A position where either value is
    neither 0 nor 1 contributes nothing to the result.

    Returns:
        list of outcome strings, in position order.
    """
    cm_eval = []

    for idx in range(len(y_pred)):
        predicted = y_pred[idx]

        # Candidate outcomes for this prediction: (if actual is 0, if actual is 1).
        if predicted == 0:
            outcomes = ("TN", "FN")
        elif predicted == 1:
            outcomes = ("FP", "TP")
        else:
            continue

        actual = y_true[idx]
        if actual == 0:
            cm_eval.append(outcomes[0])
        elif actual == 1:
            cm_eval.append(outcomes[1])

    return cm_eval
    

def test_model(model, test_dataset, labels=LABELS, threshold=0.5, verbose=True):
    """Run the model over every test row and tabulate per-label outcomes.

    Args:
        model: Callable network accepting ``ids=`` and ``mask=`` keyword
            tensors and returning logits with one value per label.
        test_dataset: DataFrame with a 'Text' column and one column per label.
        labels: Label names, in the same order as the model outputs.
        threshold: Probability at or above which a label counts as predicted.
            Defaults to 0.5, the value this function always used.
        verbose: When True (default) every produced row is printed, followed
            by a blank line after each example.

    Returns:
        dict mapping each label to a DataFrame with the columns
        'ID', 'Text', 'Actual Labels', 'Predicted Labels', 'Evaluation'
        (one row per test example).
    """
    model.eval()

    header_row = ['ID', 'Text', 'Actual Labels', 'Predicted Labels', 'Evaluation']

    # Rows are collected in plain lists and turned into DataFrames once at the
    # end; appending to a DataFrame row by row copies it on every insert.
    rows_by_label = {label: [] for label in labels}

    # NOTE(review): `input_id` is the DataFrame index, not the dataset's own
    # 'ID' column, yet it is stored under the 'ID' header — confirm which
    # identifier the reports are meant to carry.
    for input_id, data in test_dataset.iterrows():
        input_sentence = data['Text']
        encoded_data = encode_data(data)

        # Ground-truth labels are 0/1 floats; 0.5 separates them regardless of
        # the prediction threshold chosen by the caller.
        y_true = (encoded_data['labels'] >= 0.5).cpu().numpy().astype(int)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(ids=encoded_data['input_ids'], mask=encoded_data['attention_mask'])
            probabilities = logits.flatten().sigmoid()

        y_pred = (probabilities >= threshold).cpu().numpy().astype(int)

        # One TN/FP/FN/TP string per label, in label order.
        outcomes = get_cm_eval(y_pred, y_true)

        for idx, label in enumerate(labels):
            # Plain Python ints keep the printed rows and saved CSVs free of
            # numpy scalar reprs.
            df_row = [input_id, input_sentence, int(y_true[idx]), int(y_pred[idx]), outcomes[idx]]

            if verbose:
                print(df_row)
            rows_by_label[label].append(df_row)

        if verbose:
            print()

    return {
        label: pd.DataFrame(rows, columns=header_row)
        for label, rows in rows_by_label.items()
    }

# Build the per-label evaluation dataframes; exp1 later writes them to CSV for Experiment 1.
# `loaded_model` is defined outside this section of the notebook.
df_set = test_model(loaded_model, test_dataset)

[0, "i had just stepped out of my house when i saw a car speeding down the street it swerved and hit a pedestrian i think the driver didn't see him it was a mistake after the crash the driver got out and saw a woman walking nearby he started harassing her grabbing at her clothes she looked terrified", 0, 0, 'TN']
[0, "i had just stepped out of my house when i saw a car speeding down the street it swerved and hit a pedestrian i think the driver didn't see him it was a mistake after the crash the driver got out and saw a woman walking nearby he started harassing her grabbing at her clothes she looked terrified", 1, 1, 'TP']
[0, "i had just stepped out of my house when i saw a car speeding down the street it swerved and hit a pedestrian i think the driver didn't see him it was a mistake after the crash the driver got out and saw a woman walking nearby he started harassing her grabbing at her clothes she looked terrified", 0, 0, 'TN']
[0, "i had just stepped out of my house when i saw a ca

In [None]:
def save_to_csv(df, save_dir, file_name):
    """Write ``df`` as a CSV file named ``file_name`` inside ``save_dir``.

    The directory (including missing parents) is created first if it does not
    exist. The DataFrame index is not written to the file.
    """
    os.makedirs(save_dir, exist_ok=True)
    df.to_csv(os.path.join(save_dir, file_name), index=False)


# Experiment Paper 1
# Root output directory for the experiment CSVs: exp1 reads it as a global and
# the Experiment Paper 2 cell passes it to save_to_csv.
# NOTE(review): the path says "xlnet-1" while the setup cell selects
# model = "bert" — confirm this is the intended report folder.
save_dir = "../reports/xlnet-1/"

def exp1(df_set):
    """Save every per-label dataframe to ``<save_dir>/Experiment 1/<label>.csv``.

    Args:
        df_set: dict mapping a label name to the DataFrame to save for it.

    Reads the module-level ``save_dir`` as the root output directory.
    """
    # os.path.join copes with a trailing separator on save_dir without
    # producing a doubled "//" in the resulting path.
    save_directory = os.path.join(save_dir, "Experiment 1")

    for label, label_df in df_set.items():
        save_to_csv(label_df, save_directory, f"{label}.csv")

In [None]:
# Write one CSV per label. Needs `df_set` from the cell that runs test_model;
# the recorded NameError shows this cell was executed on a kernel where that
# cell had not been run.
exp1(df_set=df_set)

NameError: name 'df_set' is not defined

In [None]:
def get_metrics(eval_count: dict):
    """Compute precision, recall and F1 from confusion-matrix counts.

    Args:
        eval_count: dict holding the counts under the keys 'TP', 'FP', 'FN'
            and 'TN' (all four must be present).

    Returns:
        (precision, recall, f1_score). Any metric whose denominator is not
        positive is reported as 0.
    """
    # 'TN' is looked up alongside the others but plays no part in the metrics.
    true_pos, false_pos, false_neg, _true_neg = (
        eval_count[key] for key in ('TP', 'FP', 'FN', 'TN')
    )

    predicted_positive = true_pos + false_pos
    precision = 0
    if predicted_positive > 0:
        precision = true_pos / predicted_positive

    actual_positive = true_pos + false_neg
    recall = 0
    if actual_positive > 0:
        recall = true_pos / actual_positive

    f1_score = 0
    if precision + recall > 0:
        f1_score = (2 * precision * recall) / (precision + recall)

    return precision, recall, f1_score

# Experiment Paper 2
def exp2(df_set, labels=LABELS):
    """Summarise the per-label evaluation outcomes into one metrics table.

    Args:
        df_set: dict mapping each label to a DataFrame whose 'Evaluation'
            column holds 'TP' / 'TN' / 'FP' / 'FN' strings.
        labels: Labels to include, in output row order.

    Returns:
        DataFrame with one row per label: the four outcome counts followed by
        precision, recall and F-measure as computed by ``get_metrics``.

    Raises:
        KeyError: if a label is missing from ``df_set`` or an 'Evaluation'
            value is not one of the four expected strings.
    """
    header_row = ['Label', 'True Positives (TP)', 'True Negatives (TN)', 'False Positives (FP)', 'False Negatives (FN)', 'Precision', 'Recall', 'F-measure']

    # Rows are gathered first and the DataFrame is built once at the end.
    summary_rows = []

    for label in labels:
        # evaluation count for each label
        eval_count = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

        # The loop variable is named `outcome` so the builtin `eval` is not shadowed.
        for outcome in df_set[label]['Evaluation']:
            eval_count[outcome] += 1

        precision, recall, f1_score = get_metrics(eval_count)

        summary_rows.append([
            label,
            eval_count['TP'],
            eval_count['TN'],
            eval_count['FP'],
            eval_count['FN'],
            precision,
            recall,
            f1_score
        ])

    return pd.DataFrame(summary_rows, columns=header_row)

# Aggregate the per-label outcomes in df_set into one summary table; the bare
# name on the last line displays it as the cell output.
exp2_df = exp2(df_set, labels=LABELS)
exp2_df

Unnamed: 0,Label,True Positives (TP),True Negatives (TN),False Positives (FP),False Negatives (FN),Precision,Recall,F-measure
0,Murder,147,1019,2,32,0.986577,0.821229,0.896341
1,Homicide,155,1000,21,24,0.880682,0.865922,0.873239
2,Robbery,182,976,15,27,0.923858,0.870813,0.896552
3,Physical Injuries,168,1003,16,13,0.913043,0.928177,0.920548
4,Rape,195,991,9,5,0.955882,0.975,0.965347
5,Theft,156,1039,3,2,0.981132,0.987342,0.984227
6,Carnapping,146,1044,8,2,0.948052,0.986486,0.966887
7,Others,105,1047,0,48,1.0,0.686275,0.813953


In [None]:
# Persist the summary table under save_dir. The recorded NameError shows
# save_to_csv was not defined in the kernel when this cell ran — the cell that
# defines it must be executed first.
save_to_csv(df=exp2_df, save_dir=save_dir, file_name='Experiment Paper 2.csv')

NameError: name 'save_to_csv' is not defined

In [None]:
def get_cm_eval(y_pred, y_true): # TN, FP, FN, TP
    """Translate paired predictions and targets into confusion-matrix tags.

    This repeats the ``get_cm_eval`` definition from the earlier testing cell.

    Args:
        y_pred: Indexable sequence of predicted values (0 or 1).
        y_true: Indexable sequence of ground-truth values (0 or 1).

    Returns:
        list of "TN" / "FP" / "FN" / "TP" strings in input order; positions
        whose values are not 0 or 1 are left out.
    """
    def tag_at(position):
        # Returns the tag for one position, or None when the pair matches no case.
        predicted = y_pred[position]
        if predicted == 0:
            actual = y_true[position]
            return "TN" if actual == 0 else ("FN" if actual == 1 else None)
        if predicted == 1:
            actual = y_true[position]
            return "FP" if actual == 0 else ("TP" if actual == 1 else None)
        return None

    candidate_tags = (tag_at(position) for position in range(len(y_pred)))
    return [tag for tag in candidate_tags if tag is not None]
    

def test_model(model, test_dataset):
    """Print one confusion-outcome row per test example.

    NOTE: this redefinition replaces the earlier ``test_model`` in this
    notebook, which returns per-label DataFrames. Once this cell has run,
    ``test_model`` only prints and returns None, so the earlier cell must be
    re-executed before rebuilding ``df_set``.

    Args:
        model: Callable network accepting ``ids=`` and ``mask=`` keyword
            tensors and returning logits with one value per label.
        test_dataset: DataFrame with a 'Text' column and one column per label.

    Each printed row is ``[index, text, outcome_1, ..., outcome_k]`` where the
    outcomes are the TN/FP/FN/TP strings produced by ``get_cm_eval``.
    """
    # Loop-invariant setup: switch to inference mode and fix the cut-off once.
    model.eval()
    threshold = 0.5

    for index, data in test_dataset.iterrows():
        input_sentence = data['Text']
        encoded_data = encode_data(data)

        # get true labels
        true_labels = encoded_data['labels']
        y_true = np.zeros(true_labels.shape)
        y_true[np.where(true_labels >= threshold)] = 1

        # get predictions (inference only, so gradients are disabled)
        with torch.no_grad():
            logits = model(ids=encoded_data['input_ids'], mask=encoded_data['attention_mask'])

        probabilities = logits.flatten().sigmoid()
        y_pred = np.zeros(probabilities.shape)
        y_pred[np.where(probabilities >= threshold)] = 1

        # get evaluation
        y_eval = get_cm_eval(y_pred, y_true)

        row = [index, input_sentence] + y_eval

        print(f"{row}")

# Print the per-example outcome rows using the test_model defined in this
# cell, which has no return value.
test_model(loaded_model, test_dataset)

[0, "i had just stepped out of my house when i saw a car speeding down the street it swerved and hit a pedestrian i think the driver didn't see him it was a mistake after the crash the driver got out and saw a woman walking nearby he started harassing her grabbing at her clothes she looked terrified", 'TN', 'TP', 'TN', 'TN', 'FN', 'TN', 'TN', 'TN']
[1, 'she was staying at a friends house when the friends father entered her room late at night pretending to check on her safety but his intentions were far darker he molested her while she lay frozen in fear afraid to scream as the house was filled with his family', 'TN', 'TN', 'TN', 'TN', 'TP', 'TN', 'TN', 'TN']
[2, "in mexico 2010, my dad shot and killed a man who chased my uncle with a knife, the man was my aunt's brother, and my dad fled the country due to the unregistered gun, it was a premeditated act that tore our family apart, but we’re slowly rebuilding", 'TP', 'TN', 'TN', 'TN', 'TN', 'TN', 'TN', 'TN']
[3, 'he told me he didn’t mea

In [None]:
# Evaluate on the test dataloader.
# NOTE(review): the recorded output labels the Hamming loss "(validation)"
# although the test set is passed here — confirm the label printed by eval_model.
eval_model(loaded_model, test_dataloader) # testing dataset

100%|██████████| 150/150 [04:35<00:00,  1.84s/it]


Confusion Matrix
[[[1019    2]
  [  32  147]]

 [[1000   21]
  [  24  155]]

 [[ 976   15]
  [  27  182]]

 [[1003   16]
  [  13  168]]

 [[ 991    9]
  [   5  195]]

 [[1039    3]
  [   2  156]]

 [[1044    8]
  [   2  146]]

 [[1047    0]
  [  48  105]]]
Murder
    Precision: 98.66%
    Recall: 82.12%
    F-Measure: 89.63%
Homicide
    Precision: 88.07%
    Recall: 86.59%
    F-Measure: 87.32%
Robbery
    Precision: 92.39%
    Recall: 87.08%
    F-Measure: 89.66%
Physical Injuries
    Precision: 91.30%
    Recall: 92.82%
    F-Measure: 92.05%
Rape
    Precision: 95.59%
    Recall: 97.50%
    F-Measure: 96.53%
Theft
    Precision: 98.11%
    Recall: 98.73%
    F-Measure: 98.42%
Carnapping
    Precision: 94.81%
    Recall: 98.65%
    F-Measure: 96.69%
Others
    Precision: 100.00%
    Recall: 68.63%
    F-Measure: 81.40%

Hamming Loss (validation): 0.023645833333333335


(0.009592493305293222,
 {'Murder': {'Precision': 0.9865771812080537,
   'Recall': 0.8212290502793296,
   'F1-Score': 0.8963414634146342},
  'Homicide': {'Precision': 0.8806818181818182,
   'Recall': 0.8659217877094972,
   'F1-Score': 0.8732394366197184},
  'Robbery': {'Precision': 0.9238578680203046,
   'Recall': 0.8708133971291866,
   'F1-Score': 0.896551724137931},
  'Physical Injuries': {'Precision': 0.9130434782608695,
   'Recall': 0.9281767955801105,
   'F1-Score': 0.9205479452054794},
  'Rape': {'Precision': 0.9558823529411765,
   'Recall': 0.975,
   'F1-Score': 0.9653465346534653},
  'Theft': {'Precision': 0.9811320754716981,
   'Recall': 0.9873417721518988,
   'F1-Score': 0.9842271293375394},
  'Carnapping': {'Precision': 0.948051948051948,
   'Recall': 0.9864864864864865,
   'F1-Score': 0.9668874172185431},
  'Others': {'Precision': 1.0,
   'Recall': 0.6862745098039216,
   'F1-Score': 0.813953488372093},
  'hamming_loss': np.float64(0.023645833333333335)})