## Smishing detection using [NousResearch/Llama-2-13b-chat-hf](https://huggingface.co/NousResearch/Llama-2-13b-chat-hf) - evaluation of the model

In [2]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl wandb
import os
import torch
import numpy
import pickle
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline

In [3]:
# Test data: read the pickled test set and report how many messages carry
# each of the two labels.
# The pickle is expected to be a locally produced, trusted file.
with open("./data/test_data.pkl", "rb") as input_file:
    test_data = pickle.load(input_file)

X_test, y_test = test_data["X_test"], test_data["y_test"]

# Count each label with its own pass over the labels.
total_hams_count = sum(1 for label in y_test if label == "ham")
total_smishes_count = sum(1 for label in y_test if label == "smish")

print(
    "There is {} hams and {} smishes in the test dataset.".format(
        total_hams_count, total_smishes_count
    )
)

There is 954 hams and 161 smishes in the test dataset.


In [4]:
# Model name
model_name = 'NousResearch/Llama-2-13b-chat-hf'

# Without an explicit dtype/device the weights are loaded onto the CPU (and,
# by the library default this notebook was written against, in float32).
# When a GPU is present, load in half precision and let accelerate place the
# layers; on a CPU-only machine keep the previous float32/CPU behaviour.
gpu_available = torch.cuda.is_available()

# Load the model
model = LlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if gpu_available else torch.float32,
    device_map="auto" if gpu_available else None,
)

# Load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/196 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [5]:
# Evaluation of the base model: ask it to label every test SMS and tally the
# outcome as a confusion matrix ("smish" is the positive class, "ham" the
# negative one). Answers that are not an accepted label are recorded as errors.
false_hams_indicies = []      # positions of smishes answered as ham (FN)
false_smishes_indicies = []   # positions of hams answered as smish (FP)
false_hams_count = 0
false_smishes_count = 0
true_hams_count = 0
true_smishes_count = 0
errors_count = 0              # answers that are not an accepted label
errors_indicies = []
errors = []                   # the unusable answers (lower-cased, stripped)

# Translation from the words requested in the prompt to the dataset labels.
# Any other answer is passed through unchanged, so a literal "ham"/"smish"
# is still accepted, exactly as before.
answer_to_label = {"malicious": "smish", "benign": "ham"}

# The pipeline does not depend on the message, so build it once instead of
# once per SMS.
# NOTE(review): decoding settings come from the model's generation config; if
# that enables sampling, repeated runs will not give identical counts - confirm.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=500)

for i in range(len(X_test)):
    # NOTE(review): the prompt spells out "<s>" itself; check whether the
    # tokenizer also prepends a BOS token, which would duplicate it.
    prompt = (
    "<s>[INST] Do you think it is a malicious or benign message? "
    "Your output should be a single word 'malicious' or 'benign'. "
    "Do not write a sentence. "
    "Output is case-sensitive. "
    "SMS content: {}[/INST]"
    ).format(X_test[i])

    # Ask only for the continuation. Splitting the full text on "[/INST]"
    # picks the wrong segment when the SMS itself contains that marker.
    generated = pipe(prompt, return_full_text=False)
    answer = generated[0]['generated_text'].split("</s>")[0].lower().strip()
    answer = answer_to_label.get(answer, answer)

    if answer not in ("ham", "smish"):
        # Not an acceptable answer: keep it for later inspection.
        errors_count += 1
        errors_indicies.append(i)
        errors.append(answer)
        continue

    expected = y_test[i]
    if answer == "ham" and expected == "ham": # correctly recognized as a ham
        true_hams_count += 1
    elif answer == "smish" and expected == "smish": # correctly recognized as a smish
        true_smishes_count += 1
    elif answer == "ham" and expected == "smish": # wrongly recognized as a ham
        false_hams_indicies.append(i)
        false_hams_count += 1
    elif answer == "smish" and expected == "ham": # wrongly recognized as a smish
        false_smishes_indicies.append(i)
        false_smishes_count += 1

# errors warning
if errors_count != 0:
    if errors_count == 1:
        print("WARNING: {} error".format(errors_count))
    else:
        print("WARNING: {} errors".format(errors_count))

# save results for further analysis
results = {"FN" : false_hams_count, "FP" : false_smishes_count,
           "TN" : true_hams_count, "TP" : true_smishes_count,
           "FN_indicies" : false_hams_indicies, "FP_indicies" : false_smishes_indicies,
           "errors_count" : errors_count, "errors" : errors, "errors_indicies" : errors_indicies}

# Create the output directory first so a missing folder cannot discard the
# results of the whole inference run.
os.makedirs("./results", exist_ok=True)
with open("./results/results_llama_2_13b.pkl", 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)



### The performance of the base Llama 2 13b model in smishing detection.

In [1]:
import pickle

# Load the confusion-matrix counts saved by the evaluation cell.
# pickle can execute code embedded in the file, so only load results that
# were produced locally by this notebook.
with open("./results/results_llama_2_13b.pkl", "rb") as input_file:
    results = pickle.load(input_file)

FN = results['FN']  #FN - messages wrongly recognized as not smishes (hams)
FP = results['FP']  #FP - messages wrongly recognized as smishes
TN = results['TN']  #TN - messages correctly recognized as not smishes (hams)
TP = results['TP']  #TP - messages correctly recognized as smishes
ERRORS = results['errors_count']  #ERRORS - non-acceptable answers
TOTAL = FN + FP + TN + TP  #TOTAL - acceptable answers only


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


print("TP: {}, TN: {}, FP: {}, FN: {}, ERRORS: {}".format(TP,TN,FP,FN,ERRORS))

# Share of all test messages that received an acceptable answer; the
# "general" scores are the "acceptable only" scores scaled by this share.
coverage = _ratio(TOTAL, TOTAL + ERRORS)

accuracy_acc = _ratio(TP + TN, TOTAL)
accuracy_gen = accuracy_acc * coverage
print("accuracy of acceptable answers only: {0:.2f}%, general accuracy: {1:.2f}%".format(accuracy_acc * 100, accuracy_gen * 100))

# F1 = TP / (TP + (FP + FN) / 2), with "smish" as the positive class.
F1_score_acc = _ratio(TP, TP + (FP + FN) / 2)
F1_score_gen = F1_score_acc * coverage
print("F1 score of acceptable answers only: {0:.2f}%, general F1 score: {1:.2f}%".format(F1_score_acc * 100, F1_score_gen * 100))

TP: 26, TN: 228, FP: 155, FN: 3, ERRORS: 703
accuracy of acceptable answers only: 61.65%, general accuracy: 22.78%
F1 score of acceptable answers only: 24.76%, general F1 score: 9.15%
