In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import time  
import random

dataset_dir = "./data/train_test_val"

DATASET = {
    'train': pd.read_csv(dataset_dir + '/train.csv').reset_index(drop=True),  # """ encoding='cp1252' """ insert between train_data.csv and .reset index as parameter
    'test': pd.read_csv(dataset_dir + '/test.csv').reset_index(drop=True),  # """ encoding='cp1252' """ insert between train_data.csv and .reset index as parameter
    'val': pd.read_csv(dataset_dir + '/val.csv').reset_index(drop=True),  # """ encoding='cp1252' """ insert between train_data.csv and .reset index as parameter
}

MODEL_NAMES = {
    "bert": 'google-bert/bert-base-uncased',
    "xlnet": 'xlnet/xlnet-base-cased',
}

MODEL_VARIANTS = {
    "bert-pretrained": 'cc-bert-pretrained-model.pth',
    "xlnet-pretrained": 'cc-xlnet-pretrained-model.pth',
    "bert-finetuned": 'cc-bert-finetuned-model.pth',
    "xlnet-finetuned": 'cc-xlnet-finetuned-model.pth',
}

MODEL_DIR = "./models" 

LABELS = [

    'Murder',
    'Homicide',
    'Robbery',
    'Physical Injuries',
    'Rape',
    'Theft',
    'Carnapping',
    'Others'
]

THRESHOLD = 0.5

class BERTCrimeClassifier(nn.Module):
    def __init__(self, model_name, batch_size=8, epochs=5, dropout=0.1):
        super(BERTCrimeClassifier, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.hidden_linear = nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size) 
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.model.config.hidden_size, len(LABELS))

    def forward(self, ids, mask):
        bert_outputs = self.model(ids, attention_mask=mask)
        cls_hidden_state = bert_outputs.last_hidden_state[:, 0, :] 
        hidden_output = self.hidden_linear(cls_hidden_state) 
        dropped_out = self.dropout(hidden_output)  
        logits = self.linear(dropped_out)  
        return logits


class XLNetCrimeClassifier(nn.Module):
    def __init__(self, model_name, sbatch_size=8,epochs=5, dropout=0.1): 
        super(XLNetCrimeClassifier, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.model.config.hidden_size, len(LABELS))

    def forward(self, ids, mask):
        bert_outputs = self.model(ids, attention_mask=mask)
        cls_hidden_state = bert_outputs.last_hidden_state[:, 0, :] 
        dropped_out = self.dropout(cls_hidden_state)
        logits = self.linear(dropped_out)
        return logits



bertCrimeClassifier = BERTCrimeClassifier('google-bert/bert-base-uncased')
xlnetCrimeClassifier = XLNetCrimeClassifier('xlnet/xlnet-base-cased')

print(bertCrimeClassifier)
print(xlnetCrimeClassifier)


# Global cache para i-store ang mga loaded na models
model_cache = {}

def get_model(model_id, model_variant):
    model_name = MODEL_NAMES[model_id]

    cache_key = f"{model_id}-{model_variant}"

    if cache_key in model_cache:
        print(f"Using cached model: {cache_key}")
        return model_cache[cache_key]
    
    if model_id == "bert":
        crimeClassifier = BERTCrimeClassifier(model_name)
    elif model_id == "xlnet": 
        crimeClassifier = XLNetCrimeClassifier(model_name)

    # Load pre-trained weights
    model_path = f'{MODEL_DIR}/{model_variant}/{MODEL_VARIANTS[model_variant]}'
    crimeClassifier.load_state_dict(torch.load(model_path))

    crimeClassifier.eval()

    model_cache[cache_key] = crimeClassifier

    print(f"Model loaded and cached: {cache_key}")
    return crimeClassifier

def get_predictions(input_text, model_id, model_variant):

    crimeClassifier = get_model(model_id, model_variant)

    # Tokenizer
    model_name = MODEL_NAMES[model_id]
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Classification
    start_time = time.time()  # Start the timer

    # Encode text
    encoded_input_text = tokenizer(input_text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')

    # Get raw results
    with torch.no_grad():
        logits = crimeClassifier(ids=encoded_input_text['input_ids'], mask=encoded_input_text['attention_mask'])

    # Apply activation to get probabilities
    predictions = logits.flatten().sigmoid()

    label_probabilities = [{"label": label, "probability": float(round(prob.item() * 100, 2))} for label, prob in zip(LABELS, predictions)]

    # Sort label probabilities in descending order
    label_probabilities = sorted(label_probabilities, key=lambda item: -item["probability"])

    # Labels greater than 0.5 threshold
    predicted_labels = [(label, f"{pred*100:.2f}%") for label, pred in zip(LABELS, predictions) if pred >= THRESHOLD]
    
    end_time = time.time()  # End the timer
    duration = round(end_time - start_time, 4)  # Calculate the duration


    # Display results
   
    print("Input: " + input_text)
    print("Index: " + str(index))
    # get_actual_labels(index)
    print()
    print("Predicted Labels:")
    for label, probability in predicted_labels:
        print(f"({label}, {probability})")
    print()
    for result in label_probabilities: 
        print(f"{result['label']}: {result['probability']}")

    print(f"\nPrediction processing time: {duration:.4f} seconds")

    return label_probabilities, duration   # Return both the predictions and the processing time


def get_actual_labels(index=-1): 

    text = DATASET["test"]["Text"][index]

    labels = []
    for label in LABELS: 
        actual = DATASET["test"][label][index]
        
        if actual == 1:
           labels.append(label)

    print("Actual labels:")
    print([class_name for class_name in LABELS if DATASET["test"][class_name][index] == 1])
    # print(labels)

    return text




  from .autonotebook import tqdm as notebook_tqdm


# Inference


  from .autonotebook import tqdm as notebook_tqdm


In [32]:
# index = 64 # Palitan ang index (from 0 - 1199) kung gusto niyo ng ibang example 
index = random.randint(0, 1199) # or get a random number 
EXAMPLE_INPUT = DATASET['test']['Text'][index] 
# get_actual_labels(index)

# or i-uncomment out niyo eto kung gusto niyo magtest ng sariling example
# EXAMPLE_INPUT = "oh no a girl was found harassed by an old man"
EXAMPLE_INPUT = "The neighborhood has seen an increase in minor offenses like vandalism and noise complaints, but nothing as serious as the other crimes reported."
# print(len(EXAMPLE_INPUT))
# 80 - 600 characters

xlnet_predictions = get_predictions(EXAMPLE_INPUT, "xlnet", "xlnet-finetuned")

Using cached model: xlnet-xlnet-finetuned
Input: after a massive outdoor concert the sky opened up pouring rain on the sea of attendees scrambling to find their cars amid the chaos and umbrellas i found my parking spot empty i had parked near the back entrance thinking it would be easier to leave quickly the rain soaked lot was a mess and i realized too late that the car which i had parked next to a large van was gone the van was still there blocking the view from any nearby cameras making it easy for auto burglary to work unnoticed
Index: 63

Predicted Labels:
(Carnapping, 99.95%)

Carnapping: 99.95
Theft: 0.9
Robbery: 0.41
Physical Injuries: 0.11
Homicide: 0.09
Others: 0.03
Murder: 0.01
Rape: 0.01

Prediction processing time: 0.5444 seconds


In [1]:
bert_predictions = get_predictions(EXAMPLE_INPUT, "bert", "bert-finetuned")

NameError: name 'get_predictions' is not defined

# Examples

Murder

    "A person was found guilty after a long investigation into the brutal killing of their neighbor. The court sentenced them to life in prison for the cold-blooded murder."

Homicide

    "The local authorities are investigating a suspected homicide after a body was discovered in an abandoned building with multiple injuries suggesting foul play."

Robbery

    "The store owner was terrified when two masked individuals entered the shop, brandishing weapons and demanding money. It was a frightening robbery that lasted only a few minutes."

Physical Injuries

    "During the bar fight, several people were injured, with one person suffering from a broken arm and another with severe cuts. The police are looking for the attackers responsible for the physical injuries."

Rape

    "The victim bravely came forward to report the incident, describing how they were assaulted in a parking lot. The accused has been charged with rape and is currently in police custody."

Theft

    "Someone broke into the car last night and stole the stereo system and other valuables. The theft was reported to the police, who are now investigating the incident."

Carnapping

    "The owner was devastated when they found out their car was missing from the parking lot. The police suspect it is a case of carnapping, as there have been similar incidents in the area."

Others (non-index)

    "The neighborhood has seen an increase in minor offenses like vandalism and noise complaints, but nothing as serious as the other crimes reported."

In [26]:
from transformers import AutoModelForSequenceClassification

# Load the best model
best_model = AutoModelForSequenceClassification.from_pretrained("./models/bert-classifier-wrapper" + "/checkpoint-1500")



# Tokenizer
model_name = MODEL_NAMES["bert"]
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification
start_time = time.time()  # Start the timer

# Encode text
encoded_input_text = tokenizer(EXAMPLE_INPUT, padding="max_length", truncation=True, max_length=128, return_tensors='pt')

# Get raw results
with torch.no_grad():
    output = best_model(input_ids=encoded_input_text['input_ids'], 
                        attention_mask=encoded_input_text['attention_mask'])
    logits = output.logits  # Extract logits from the output

# Apply activation to get probabilities
predictions = logits.sigmoid() 
predictions = predictions[0]

label_probabilities = [{"label": label, "probability": float(round(prob.item() * 100, 2))} for label, prob in zip(LABELS, predictions)]

# Sort label probabilities in descending order
label_probabilities = sorted(label_probabilities, key=lambda item: -item["probability"])

# Labels greater than 0.5 threshold
predicted_labels = [(label, f"{pred*100:.2f}%") for label, pred in zip(LABELS, predictions) if pred >= THRESHOLD]

end_time = time.time()  # End the timer
duration = round(end_time - start_time, 4)  # Calculate the duration


# Display results

print("Input: " + EXAMPLE_INPUT)
print("Index: " + str(index))
get_actual_labels(index)
print()
print("Predicted Labels:")
for label, probability in predicted_labels:
    print(f"({label}, {probability})")
print()
for result in label_probabilities: 
    print(f"{result['label']}: {result['probability']}")

print(f"\nPrediction processing time: {duration:.4f} seconds")

Input: Someone broke into the car last night and stole the stereo system and other valuables. The theft was reported to the police, who are now investigating the incident.
Index: 561
Actual labels:
['Murder']

Predicted Labels:

Others: 37.04
Carnapping: 6.08
Theft: 4.31
Robbery: 3.04
Homicide: 0.63
Murder: 0.57
Physical Injuries: 0.54
Rape: 0.52

Prediction processing time: 0.1393 seconds
