## Smishing detection using [NousResearch/Llama-2-13b-chat-hf](https://huggingface.co/NousResearch/Llama-2-13b-chat-hf) - evaluation of the model

In [None]:
import os
import pickle

from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline

In [None]:
# Load the pickled test data. NOTE: pickle.load can execute arbitrary code,
# so the file must come from a trusted source.
with open("./../data/test_data.pkl", "rb") as pkl_file:
    test_data = pickle.load(pkl_file)

X_test, y_test = test_data["X_test"], test_data["y_test"]

# Count how many test labels belong to each of the two classes
total_hams_count = sum(1 for lbl in y_test if lbl == "ham")
total_smishes_count = sum(1 for lbl in y_test if lbl == "smish")

print("There is {} hams and {} smishes in the test dataset.".format(total_hams_count, total_smishes_count))

In [None]:
# Identifier of the checkpoint that is evaluated in this notebook
model_name = "NousResearch/Llama-2-13b-chat-hf"

# The tokenizer and the model are both loaded from the same identifier
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

In [None]:
# Evaluation bookkeeping. "smish" is treated as the positive class: a message
# wrongly answered "ham" is a false negative (FN) and a message wrongly
# answered "smish" is a false positive (FP) - see the keys of `results` below.
false_hams_indicies = []
false_smishes_indicies = []
false_hams_count = 0
false_smishes_count = 0
true_hams_count = 0
true_smishes_count = 0
errors_count = 0
errors_indicies = []
errors = []

# Build the pipeline once; it does not depend on the message being classified.
#  * max_new_tokens bounds only the generated reply. max_length would also
#    count the prompt tokens, so a long SMS could leave no room for an answer.
#  * return_full_text=False makes the pipeline return only the reply instead
#    of the prompt followed by the reply.
# NOTE(review): whether decoding samples depends on the model's generation
# config - fix a seed or disable sampling if run-to-run reproducibility matters.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=32,
    return_full_text=False,
)

for i in range(len(X_test)):
    prompt = (
    "<s>[INST] Do you think it is a ham or smish message? "
    "Your output should be a single word 'smish' or 'ham'. "
    "Do not write a sentence. "
    "Output is case-sensitive. "
    "SMS content: {}[/INST]"
    ).format(X_test[i])

    # The pipeline returns a list with one dict per generated sequence; the
    # reply is stored under "generated_text". Surrounding whitespace is
    # stripped so that a bare 'ham' / 'smish' reply matches exactly.
    answer = pipe(prompt)[0]["generated_text"].strip()

    if answer not in ["ham", "smish"]:
        # Any reply other than the two exact labels is kept for manual review
        # and excluded from the confusion matrix.
        errors_count += 1
        errors_indicies.append(i)
        errors.append(answer)
    elif answer == "ham" and y_test[i] == "ham": # correctly recognized as a ham
        true_hams_count += 1
    elif answer == "smish" and y_test[i] == "smish": # correctly recognized as a smish
        true_smishes_count += 1
    elif answer == "ham" and y_test[i] == "smish": # wrongly recognized as a ham
        false_hams_indicies.append(i)
        false_hams_count += 1
    elif answer == "smish" and y_test[i] == "ham": # wrongly recognized as a smish
        false_smishes_indicies.append(i)
        false_smishes_count += 1

# errors warning
if errors_count != 0:
    if errors_count == 1:
        print("WARNING: {} error".format(errors_count))
    else:
        print("WARNING: {} errors".format(errors_count))

# save results for further analysis
results = {"FN" : false_hams_count, "FP" : false_smishes_count,
           "TN" : true_hams_count, "TP" : true_smishes_count,
           "FN_indicies" : false_hams_indicies, "FP_indicies" : false_smishes_indicies,
           "errors_count" : errors_count, "errors" : errors, "errors_indicies" : errors_indicies}

# Create the output directory first so the results of the run are not lost
# to a missing folder.
os.makedirs("./results", exist_ok=True)
with open("./results/results_llama_2_13b.pkl", 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

### The performance of Llama 2 13b Chat (NousResearch/Llama-2-13b-chat-hf) in smishing detection.

In [None]:
# Confusion-matrix entries, with "smish" as the positive class:
#   FN - smishes wrongly recognized as hams    FP - hams wrongly recognized as smishes
#   TN - hams correctly recognized as hams     TP - smishes correctly recognized as smishes
FN, FP = false_hams_count, false_smishes_count
TN, TP = true_hams_count, true_smishes_count
# Sum of the four confusion-matrix entries
TOTAL = sum((FN, FP, TN, TP))

In [None]:
# Share of each confusion-matrix entry in TOTAL. When TOTAL is zero the
# percentages are undefined, so report that instead of raising ZeroDivisionError.
if TOTAL == 0:
    print("No classified messages - percentages are undefined.")
else:
    print("Messages wrongly recognized as hams: {0:.2f}%".format(FN / TOTAL * 100))
    print("Messages wrongly recognized as smishes: {0:.2f}%".format(FP / TOTAL * 100))
    print("Messages correctly recognized as hams: {0:.2f}%".format(TN / TOTAL * 100))
    print("Messages correctly recognized as smishes: {0:.2f}%".format(TP / TOTAL * 100))

#### Accuracy

In [None]:
# Accuracy = (TP + TN) / TOTAL.
# Falls back to NaN (printed as "nan%") when TOTAL is zero instead of raising
# ZeroDivisionError.
accuracy = (TP + TN) / TOTAL if TOTAL else float("nan")
print("{0:.2f}%".format(accuracy * 100))

#### Recall

In [None]:
# Recall = TP / (TP + FN).
# Falls back to NaN (printed as "nan%") when TP + FN is zero instead of
# raising ZeroDivisionError.
recall = TP / (TP + FN) if (TP + FN) else float("nan")
print("{0:.2f}%".format(recall * 100))

#### Precision

In [None]:
# Precision = TP / (TP + FP).
# TP + FP is zero when no message was recognized as a smish; fall back to NaN
# (printed as "nan%") instead of raising ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) else float("nan")
print("{0:.2f}%".format(precision * 100))

#### F1 score

In [None]:
# F1 = TP / (TP + (FP + FN) / 2), the harmonic mean of precision and recall.
# The denominator is zero only when TP, FP and FN are all zero; fall back to
# NaN (printed as "nan%") instead of raising ZeroDivisionError.
f1_denominator = TP + (FP + FN) / 2
F1_score = TP / f1_denominator if f1_denominator else float("nan")
print("{0:.2f}%".format(F1_score * 100))